summaryrefslogtreecommitdiff
path: root/ANDROID_3.4.5/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'ANDROID_3.4.5/arch/x86/kvm')
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/Kconfig87
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/Makefile21
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/cpuid.c670
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/cpuid.h54
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/emulate.c4360
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/i8254.c765
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/i8254.h58
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/i8259.c664
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/irq.c96
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/irq.h106
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/kvm_cache_regs.h102
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/kvm_timer.h18
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/lapic.c1387
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/lapic.h63
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/mmu.c4027
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/mmu.h104
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/mmu_audit.c303
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/mmutrace.h254
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/paging_tmpl.h837
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/pmu.c537
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/svm.c4336
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/timer.c47
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/trace.h830
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/tss.h59
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/vmx.c7272
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/x86.c6607
-rw-r--r--ANDROID_3.4.5/arch/x86/kvm/x86.h127
27 files changed, 0 insertions, 33791 deletions
diff --git a/ANDROID_3.4.5/arch/x86/kvm/Kconfig b/ANDROID_3.4.5/arch/x86/kvm/Kconfig
deleted file mode 100644
index 1a7fe868..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/Kconfig
+++ /dev/null
@@ -1,87 +0,0 @@
-#
-# KVM configuration
-#
-
-source "virt/kvm/Kconfig"
-
-menuconfig VIRTUALIZATION
- bool "Virtualization"
- depends on HAVE_KVM || X86
- default y
- ---help---
- Say Y here to get to see options for using your Linux host to run other
- operating systems inside virtual machines (guests).
- This option alone does not add any kernel code.
-
- If you say N, all options in this submenu will be skipped and disabled.
-
-if VIRTUALIZATION
-
-config KVM
- tristate "Kernel-based Virtual Machine (KVM) support"
- depends on HAVE_KVM
- # for device assignment:
- depends on PCI
- # for TASKSTATS/TASK_DELAY_ACCT:
- depends on NET
- select PREEMPT_NOTIFIERS
- select MMU_NOTIFIER
- select ANON_INODES
- select HAVE_KVM_IRQCHIP
- select HAVE_KVM_EVENTFD
- select KVM_APIC_ARCHITECTURE
- select KVM_ASYNC_PF
- select USER_RETURN_NOTIFIER
- select KVM_MMIO
- select TASKSTATS
- select TASK_DELAY_ACCT
- select PERF_EVENTS
- ---help---
- Support hosting fully virtualized guest machines using hardware
- virtualization extensions. You will need a fairly recent
- processor equipped with virtualization extensions. You will also
- need to select one or more of the processor modules below.
-
- This module provides access to the hardware capabilities through
- a character device node named /dev/kvm.
-
- To compile this as a module, choose M here: the module
- will be called kvm.
-
- If unsure, say N.
-
-config KVM_INTEL
- tristate "KVM for Intel processors support"
- depends on KVM
- # for perf_guest_get_msrs():
- depends on CPU_SUP_INTEL
- ---help---
- Provides support for KVM on Intel processors equipped with the VT
- extensions.
-
- To compile this as a module, choose M here: the module
- will be called kvm-intel.
-
-config KVM_AMD
- tristate "KVM for AMD processors support"
- depends on KVM
- ---help---
- Provides support for KVM on AMD processors equipped with the AMD-V
- (SVM) extensions.
-
- To compile this as a module, choose M here: the module
- will be called kvm-amd.
-
-config KVM_MMU_AUDIT
- bool "Audit KVM MMU"
- depends on KVM && TRACEPOINTS
- ---help---
- This option adds a R/W kVM module parameter 'mmu_audit', which allows
- audit KVM MMU at runtime.
-
-# OK, it's a little counter-intuitive to do this, but it puts it neatly under
-# the virtualization menu.
-source drivers/vhost/Kconfig
-source drivers/lguest/Kconfig
-
-endif # VIRTUALIZATION
diff --git a/ANDROID_3.4.5/arch/x86/kvm/Makefile b/ANDROID_3.4.5/arch/x86/kvm/Makefile
deleted file mode 100644
index 4f579e8d..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
-
-ccflags-y += -Ivirt/kvm -Iarch/x86/kvm
-
-CFLAGS_x86.o := -I.
-CFLAGS_svm.o := -I.
-CFLAGS_vmx.o := -I.
-
-kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
- coalesced_mmio.o irq_comm.o eventfd.o \
- assigned-dev.o)
-kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
-kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
-
-kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
- i8254.o timer.o cpuid.o pmu.o
-kvm-intel-y += vmx.o
-kvm-amd-y += svm.o
-
-obj-$(CONFIG_KVM) += kvm.o
-obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
-obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/ANDROID_3.4.5/arch/x86/kvm/cpuid.c b/ANDROID_3.4.5/arch/x86/kvm/cpuid.c
deleted file mode 100644
index 9fed5bed..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/cpuid.c
+++ /dev/null
@@ -1,670 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- * cpuid support routines
- *
- * derived from arch/x86/kvm/x86.c
- *
- * Copyright 2011 Red Hat, Inc. and/or its affiliates.
- * Copyright IBM Corporation, 2008
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include <linux/kvm_host.h>
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-#include <linux/uaccess.h>
-#include <asm/user.h>
-#include <asm/xsave.h>
-#include "cpuid.h"
-#include "lapic.h"
-#include "mmu.h"
-#include "trace.h"
-
-void kvm_update_cpuid(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- best = kvm_find_cpuid_entry(vcpu, 1, 0);
- if (!best)
- return;
-
- /* Update OSXSAVE bit */
- if (cpu_has_xsave && best->function == 0x1) {
- best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
- best->ecx |= bit(X86_FEATURE_OSXSAVE);
- }
-
- if (apic) {
- if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER))
- apic->lapic_timer.timer_mode_mask = 3 << 17;
- else
- apic->lapic_timer.timer_mode_mask = 1 << 17;
- }
-
- kvm_pmu_cpuid_update(vcpu);
-}
-
-static int is_efer_nx(void)
-{
- unsigned long long efer = 0;
-
- rdmsrl_safe(MSR_EFER, &efer);
- return efer & EFER_NX;
-}
-
-static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
-{
- int i;
- struct kvm_cpuid_entry2 *e, *entry;
-
- entry = NULL;
- for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
- e = &vcpu->arch.cpuid_entries[i];
- if (e->function == 0x80000001) {
- entry = e;
- break;
- }
- }
- if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
- entry->edx &= ~(1 << 20);
- printk(KERN_INFO "kvm: guest NX capability removed\n");
- }
-}
-
-/* when an old userspace process fills a new kernel module */
-int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
- struct kvm_cpuid *cpuid,
- struct kvm_cpuid_entry __user *entries)
-{
- int r, i;
- struct kvm_cpuid_entry *cpuid_entries;
-
- r = -E2BIG;
- if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
- goto out;
- r = -ENOMEM;
- cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
- if (!cpuid_entries)
- goto out;
- r = -EFAULT;
- if (copy_from_user(cpuid_entries, entries,
- cpuid->nent * sizeof(struct kvm_cpuid_entry)))
- goto out_free;
- for (i = 0; i < cpuid->nent; i++) {
- vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
- vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
- vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
- vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
- vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
- vcpu->arch.cpuid_entries[i].index = 0;
- vcpu->arch.cpuid_entries[i].flags = 0;
- vcpu->arch.cpuid_entries[i].padding[0] = 0;
- vcpu->arch.cpuid_entries[i].padding[1] = 0;
- vcpu->arch.cpuid_entries[i].padding[2] = 0;
- }
- vcpu->arch.cpuid_nent = cpuid->nent;
- cpuid_fix_nx_cap(vcpu);
- r = 0;
- kvm_apic_set_version(vcpu);
- kvm_x86_ops->cpuid_update(vcpu);
- kvm_update_cpuid(vcpu);
-
-out_free:
- vfree(cpuid_entries);
-out:
- return r;
-}
-
-int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
- struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries)
-{
- int r;
-
- r = -E2BIG;
- if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
- goto out;
- r = -EFAULT;
- if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
- cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
- goto out;
- vcpu->arch.cpuid_nent = cpuid->nent;
- kvm_apic_set_version(vcpu);
- kvm_x86_ops->cpuid_update(vcpu);
- kvm_update_cpuid(vcpu);
- return 0;
-
-out:
- return r;
-}
-
-int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
- struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries)
-{
- int r;
-
- r = -E2BIG;
- if (cpuid->nent < vcpu->arch.cpuid_nent)
- goto out;
- r = -EFAULT;
- if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
- vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
- goto out;
- return 0;
-
-out:
- cpuid->nent = vcpu->arch.cpuid_nent;
- return r;
-}
-
-static void cpuid_mask(u32 *word, int wordnum)
-{
- *word &= boot_cpu_data.x86_capability[wordnum];
-}
-
-static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
- u32 index)
-{
- entry->function = function;
- entry->index = index;
- cpuid_count(entry->function, entry->index,
- &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
- entry->flags = 0;
-}
-
-static bool supported_xcr0_bit(unsigned bit)
-{
- u64 mask = ((u64)1 << bit);
-
- return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
-}
-
-#define F(x) bit(X86_FEATURE_##x)
-
-static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
- u32 index, int *nent, int maxnent)
-{
- int r;
- unsigned f_nx = is_efer_nx() ? F(NX) : 0;
-#ifdef CONFIG_X86_64
- unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
- ? F(GBPAGES) : 0;
- unsigned f_lm = F(LM);
-#else
- unsigned f_gbpages = 0;
- unsigned f_lm = 0;
-#endif
- unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
-
- /* cpuid 1.edx */
- const u32 kvm_supported_word0_x86_features =
- F(FPU) | F(VME) | F(DE) | F(PSE) |
- F(TSC) | F(MSR) | F(PAE) | F(MCE) |
- F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
- F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
- F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
- 0 /* Reserved, DS, ACPI */ | F(MMX) |
- F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
- 0 /* HTT, TM, Reserved, PBE */;
- /* cpuid 0x80000001.edx */
- const u32 kvm_supported_word1_x86_features =
- F(FPU) | F(VME) | F(DE) | F(PSE) |
- F(TSC) | F(MSR) | F(PAE) | F(MCE) |
- F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
- F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
- F(PAT) | F(PSE36) | 0 /* Reserved */ |
- f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
- F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
- 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
- /* cpuid 1.ecx */
- const u32 kvm_supported_word4_x86_features =
- F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
- 0 /* DS-CPL, VMX, SMX, EST */ |
- 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
- F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
- 0 /* Reserved, DCA */ | F(XMM4_1) |
- F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
- 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
- F(F16C) | F(RDRAND);
- /* cpuid 0x80000001.ecx */
- const u32 kvm_supported_word6_x86_features =
- F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
- F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
- F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
- 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
-
- /* cpuid 0xC0000001.edx */
- const u32 kvm_supported_word5_x86_features =
- F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
- F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
- F(PMM) | F(PMM_EN);
-
- /* cpuid 7.0.ebx */
- const u32 kvm_supported_word9_x86_features =
- F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS);
-
- /* all calls to cpuid_count() should be made on the same cpu */
- get_cpu();
-
- r = -E2BIG;
-
- if (*nent >= maxnent)
- goto out;
-
- do_cpuid_1_ent(entry, function, index);
- ++*nent;
-
- switch (function) {
- case 0:
- entry->eax = min(entry->eax, (u32)0xd);
- break;
- case 1:
- entry->edx &= kvm_supported_word0_x86_features;
- cpuid_mask(&entry->edx, 0);
- entry->ecx &= kvm_supported_word4_x86_features;
- cpuid_mask(&entry->ecx, 4);
- /* we support x2apic emulation even if host does not support
- * it since we emulate x2apic in software */
- entry->ecx |= F(X2APIC);
- break;
- /* function 2 entries are STATEFUL. That is, repeated cpuid commands
- * may return different values. This forces us to get_cpu() before
- * issuing the first command, and also to emulate this annoying behavior
- * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
- case 2: {
- int t, times = entry->eax & 0xff;
-
- entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
- entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
- for (t = 1; t < times; ++t) {
- if (*nent >= maxnent)
- goto out;
-
- do_cpuid_1_ent(&entry[t], function, 0);
- entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
- ++*nent;
- }
- break;
- }
- /* function 4 has additional index. */
- case 4: {
- int i, cache_type;
-
- entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- /* read more entries until cache_type is zero */
- for (i = 1; ; ++i) {
- if (*nent >= maxnent)
- goto out;
-
- cache_type = entry[i - 1].eax & 0x1f;
- if (!cache_type)
- break;
- do_cpuid_1_ent(&entry[i], function, i);
- entry[i].flags |=
- KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- ++*nent;
- }
- break;
- }
- case 7: {
- entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- /* Mask ebx against host capbability word 9 */
- if (index == 0) {
- entry->ebx &= kvm_supported_word9_x86_features;
- cpuid_mask(&entry->ebx, 9);
- } else
- entry->ebx = 0;
- entry->eax = 0;
- entry->ecx = 0;
- entry->edx = 0;
- break;
- }
- case 9:
- break;
- case 0xa: { /* Architectural Performance Monitoring */
- struct x86_pmu_capability cap;
- union cpuid10_eax eax;
- union cpuid10_edx edx;
-
- perf_get_x86_pmu_capability(&cap);
-
- /*
- * Only support guest architectural pmu on a host
- * with architectural pmu.
- */
- if (!cap.version)
- memset(&cap, 0, sizeof(cap));
-
- eax.split.version_id = min(cap.version, 2);
- eax.split.num_counters = cap.num_counters_gp;
- eax.split.bit_width = cap.bit_width_gp;
- eax.split.mask_length = cap.events_mask_len;
-
- edx.split.num_counters_fixed = cap.num_counters_fixed;
- edx.split.bit_width_fixed = cap.bit_width_fixed;
- edx.split.reserved = 0;
-
- entry->eax = eax.full;
- entry->ebx = cap.events_mask;
- entry->ecx = 0;
- entry->edx = edx.full;
- break;
- }
- /* function 0xb has additional index. */
- case 0xb: {
- int i, level_type;
-
- entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- /* read more entries until level_type is zero */
- for (i = 1; ; ++i) {
- if (*nent >= maxnent)
- goto out;
-
- level_type = entry[i - 1].ecx & 0xff00;
- if (!level_type)
- break;
- do_cpuid_1_ent(&entry[i], function, i);
- entry[i].flags |=
- KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- ++*nent;
- }
- break;
- }
- case 0xd: {
- int idx, i;
-
- entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- for (idx = 1, i = 1; idx < 64; ++idx) {
- if (*nent >= maxnent)
- goto out;
-
- do_cpuid_1_ent(&entry[i], function, idx);
- if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
- continue;
- entry[i].flags |=
- KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- ++*nent;
- ++i;
- }
- break;
- }
- case KVM_CPUID_SIGNATURE: {
- char signature[12] = "KVMKVMKVM\0\0";
- u32 *sigptr = (u32 *)signature;
- entry->eax = 0;
- entry->ebx = sigptr[0];
- entry->ecx = sigptr[1];
- entry->edx = sigptr[2];
- break;
- }
- case KVM_CPUID_FEATURES:
- entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
- (1 << KVM_FEATURE_NOP_IO_DELAY) |
- (1 << KVM_FEATURE_CLOCKSOURCE2) |
- (1 << KVM_FEATURE_ASYNC_PF) |
- (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
-
- if (sched_info_on())
- entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
-
- entry->ebx = 0;
- entry->ecx = 0;
- entry->edx = 0;
- break;
- case 0x80000000:
- entry->eax = min(entry->eax, 0x8000001a);
- break;
- case 0x80000001:
- entry->edx &= kvm_supported_word1_x86_features;
- cpuid_mask(&entry->edx, 1);
- entry->ecx &= kvm_supported_word6_x86_features;
- cpuid_mask(&entry->ecx, 6);
- break;
- case 0x80000008: {
- unsigned g_phys_as = (entry->eax >> 16) & 0xff;
- unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
- unsigned phys_as = entry->eax & 0xff;
-
- if (!g_phys_as)
- g_phys_as = phys_as;
- entry->eax = g_phys_as | (virt_as << 8);
- entry->ebx = entry->edx = 0;
- break;
- }
- case 0x80000019:
- entry->ecx = entry->edx = 0;
- break;
- case 0x8000001a:
- break;
- case 0x8000001d:
- break;
- /*Add support for Centaur's CPUID instruction*/
- case 0xC0000000:
- /*Just support up to 0xC0000004 now*/
- entry->eax = min(entry->eax, 0xC0000004);
- break;
- case 0xC0000001:
- entry->edx &= kvm_supported_word5_x86_features;
- cpuid_mask(&entry->edx, 5);
- break;
- case 3: /* Processor serial number */
- case 5: /* MONITOR/MWAIT */
- case 6: /* Thermal management */
- case 0x80000007: /* Advanced power management */
- case 0xC0000002:
- case 0xC0000003:
- case 0xC0000004:
- default:
- entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
- break;
- }
-
- kvm_x86_ops->set_supported_cpuid(function, entry);
-
- r = 0;
-
-out:
- put_cpu();
-
- return r;
-}
-
-#undef F
-
-struct kvm_cpuid_param {
- u32 func;
- u32 idx;
- bool has_leaf_count;
- bool (*qualifier)(struct kvm_cpuid_param *param);
-};
-
-static bool is_centaur_cpu(struct kvm_cpuid_param *param)
-{
- return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
-}
-
-int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries)
-{
- struct kvm_cpuid_entry2 *cpuid_entries;
- int limit, nent = 0, r = -E2BIG, i;
- u32 func;
- static struct kvm_cpuid_param param[] = {
- { .func = 0, .has_leaf_count = true },
- { .func = 0x80000000, .has_leaf_count = true },
- { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true },
- { .func = KVM_CPUID_SIGNATURE },
- { .func = KVM_CPUID_FEATURES },
- };
-
- if (cpuid->nent < 1)
- goto out;
- if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
- cpuid->nent = KVM_MAX_CPUID_ENTRIES;
- r = -ENOMEM;
- cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
- if (!cpuid_entries)
- goto out;
-
- r = 0;
- for (i = 0; i < ARRAY_SIZE(param); i++) {
- struct kvm_cpuid_param *ent = &param[i];
-
- if (ent->qualifier && !ent->qualifier(ent))
- continue;
-
- r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx,
- &nent, cpuid->nent);
-
- if (r)
- goto out_free;
-
- if (!ent->has_leaf_count)
- continue;
-
- limit = cpuid_entries[nent - 1].eax;
- for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func)
- r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx,
- &nent, cpuid->nent);
-
- if (r)
- goto out_free;
- }
-
- r = -EFAULT;
- if (copy_to_user(entries, cpuid_entries,
- nent * sizeof(struct kvm_cpuid_entry2)))
- goto out_free;
- cpuid->nent = nent;
- r = 0;
-
-out_free:
- vfree(cpuid_entries);
-out:
- return r;
-}
-
-static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
-{
- struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
- int j, nent = vcpu->arch.cpuid_nent;
-
- e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
- /* when no next entry is found, the current entry[i] is reselected */
- for (j = i + 1; ; j = (j + 1) % nent) {
- struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
- if (ej->function == e->function) {
- ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
- return j;
- }
- }
- return 0; /* silence gcc, even though control never reaches here */
-}
-
-/* find an entry with matching function, matching index (if needed), and that
- * should be read next (if it's stateful) */
-static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
- u32 function, u32 index)
-{
- if (e->function != function)
- return 0;
- if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
- return 0;
- if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
- !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
- return 0;
- return 1;
-}
-
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
- u32 function, u32 index)
-{
- int i;
- struct kvm_cpuid_entry2 *best = NULL;
-
- for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
- struct kvm_cpuid_entry2 *e;
-
- e = &vcpu->arch.cpuid_entries[i];
- if (is_matching_cpuid_entry(e, function, index)) {
- if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
- move_to_next_stateful_cpuid_entry(vcpu, i);
- best = e;
- break;
- }
- }
- return best;
-}
-EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
-
-int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
- if (!best || best->eax < 0x80000008)
- goto not_found;
- best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
- if (best)
- return best->eax & 0xff;
-not_found:
- return 36;
-}
-
-/*
- * If no match is found, check whether we exceed the vCPU's limit
- * and return the content of the highest valid _standard_ leaf instead.
- * This is to satisfy the CPUID specification.
- */
-static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
- u32 function, u32 index)
-{
- struct kvm_cpuid_entry2 *maxlevel;
-
- maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
- if (!maxlevel || maxlevel->eax >= function)
- return NULL;
- if (function & 0x80000000) {
- maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
- if (!maxlevel)
- return NULL;
- }
- return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
-}
-
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
-{
- u32 function, index;
- struct kvm_cpuid_entry2 *best;
-
- function = kvm_register_read(vcpu, VCPU_REGS_RAX);
- index = kvm_register_read(vcpu, VCPU_REGS_RCX);
- kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
- kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
- kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
- kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
- best = kvm_find_cpuid_entry(vcpu, function, index);
-
- if (!best)
- best = check_cpuid_limit(vcpu, function, index);
-
- if (best) {
- kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
- kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
- kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
- kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
- }
- kvm_x86_ops->skip_emulated_instruction(vcpu);
- trace_kvm_cpuid(function,
- kvm_register_read(vcpu, VCPU_REGS_RAX),
- kvm_register_read(vcpu, VCPU_REGS_RBX),
- kvm_register_read(vcpu, VCPU_REGS_RCX),
- kvm_register_read(vcpu, VCPU_REGS_RDX));
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
diff --git a/ANDROID_3.4.5/arch/x86/kvm/cpuid.h b/ANDROID_3.4.5/arch/x86/kvm/cpuid.h
deleted file mode 100644
index 26d1fb43..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/cpuid.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef ARCH_X86_KVM_CPUID_H
-#define ARCH_X86_KVM_CPUID_H
-
-#include "x86.h"
-
-void kvm_update_cpuid(struct kvm_vcpu *vcpu);
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
- u32 function, u32 index);
-int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries);
-int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
- struct kvm_cpuid *cpuid,
- struct kvm_cpuid_entry __user *entries);
-int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
- struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries);
-int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
- struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries);
-
-
-static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 1, 0);
- return best && (best->ecx & bit(X86_FEATURE_XSAVE));
-}
-
-static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 7, 0);
- return best && (best->ebx & bit(X86_FEATURE_SMEP));
-}
-
-static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 7, 0);
- return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
-}
-
-static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- return best && (best->ecx & bit(X86_FEATURE_OSVW));
-}
-
-#endif
diff --git a/ANDROID_3.4.5/arch/x86/kvm/emulate.c b/ANDROID_3.4.5/arch/x86/kvm/emulate.c
deleted file mode 100644
index 83756223..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/emulate.c
+++ /dev/null
@@ -1,4360 +0,0 @@
-/******************************************************************************
- * emulate.c
- *
- * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
- *
- * Copyright (c) 2005 Keir Fraser
- *
- * Linux coding style, mod r/m decoder, segment base fixes, real-mode
- * privileged instructions:
- *
- * Copyright (C) 2006 Qumranet
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * Avi Kivity <avi@qumranet.com>
- * Yaniv Kamay <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
- */
-
-#include <linux/kvm_host.h>
-#include "kvm_cache_regs.h"
-#include <linux/module.h>
-#include <asm/kvm_emulate.h>
-
-#include "x86.h"
-#include "tss.h"
-
-/*
- * Operand types
- */
-#define OpNone 0ull
-#define OpImplicit 1ull /* No generic decode */
-#define OpReg 2ull /* Register */
-#define OpMem 3ull /* Memory */
-#define OpAcc 4ull /* Accumulator: AL/AX/EAX/RAX */
-#define OpDI 5ull /* ES:DI/EDI/RDI */
-#define OpMem64 6ull /* Memory, 64-bit */
-#define OpImmUByte 7ull /* Zero-extended 8-bit immediate */
-#define OpDX 8ull /* DX register */
-#define OpCL 9ull /* CL register (for shifts) */
-#define OpImmByte 10ull /* 8-bit sign extended immediate */
-#define OpOne 11ull /* Implied 1 */
-#define OpImm 12ull /* Sign extended immediate */
-#define OpMem16 13ull /* Memory operand (16-bit). */
-#define OpMem32 14ull /* Memory operand (32-bit). */
-#define OpImmU 15ull /* Immediate operand, zero extended */
-#define OpSI 16ull /* SI/ESI/RSI */
-#define OpImmFAddr 17ull /* Immediate far address */
-#define OpMemFAddr 18ull /* Far address in memory */
-#define OpImmU16 19ull /* Immediate operand, 16 bits, zero extended */
-#define OpES 20ull /* ES */
-#define OpCS 21ull /* CS */
-#define OpSS 22ull /* SS */
-#define OpDS 23ull /* DS */
-#define OpFS 24ull /* FS */
-#define OpGS 25ull /* GS */
-#define OpMem8 26ull /* 8-bit zero extended memory operand */
-
-#define OpBits 5 /* Width of operand field */
-#define OpMask ((1ull << OpBits) - 1)
-
-/*
- * Opcode effective-address decode tables.
- * Note that we only emulate instructions that have at least one memory
- * operand (excluding implicit stack references). We assume that stack
- * references and instruction fetches will never occur in special memory
- * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
- * not be handled.
- */
-
-/* Operand sizes: 8-bit operands or specified/overridden size. */
-#define ByteOp (1<<0) /* 8-bit operands. */
-/* Destination operand type. */
-#define DstShift 1
-#define ImplicitOps (OpImplicit << DstShift)
-#define DstReg (OpReg << DstShift)
-#define DstMem (OpMem << DstShift)
-#define DstAcc (OpAcc << DstShift)
-#define DstDI (OpDI << DstShift)
-#define DstMem64 (OpMem64 << DstShift)
-#define DstImmUByte (OpImmUByte << DstShift)
-#define DstDX (OpDX << DstShift)
-#define DstMask (OpMask << DstShift)
-/* Source operand type. */
-#define SrcShift 6
-#define SrcNone (OpNone << SrcShift)
-#define SrcReg (OpReg << SrcShift)
-#define SrcMem (OpMem << SrcShift)
-#define SrcMem16 (OpMem16 << SrcShift)
-#define SrcMem32 (OpMem32 << SrcShift)
-#define SrcImm (OpImm << SrcShift)
-#define SrcImmByte (OpImmByte << SrcShift)
-#define SrcOne (OpOne << SrcShift)
-#define SrcImmUByte (OpImmUByte << SrcShift)
-#define SrcImmU (OpImmU << SrcShift)
-#define SrcSI (OpSI << SrcShift)
-#define SrcImmFAddr (OpImmFAddr << SrcShift)
-#define SrcMemFAddr (OpMemFAddr << SrcShift)
-#define SrcAcc (OpAcc << SrcShift)
-#define SrcImmU16 (OpImmU16 << SrcShift)
-#define SrcDX (OpDX << SrcShift)
-#define SrcMem8 (OpMem8 << SrcShift)
-#define SrcMask (OpMask << SrcShift)
-#define BitOp (1<<11)
-#define MemAbs (1<<12) /* Memory operand is absolute displacement */
-#define String (1<<13) /* String instruction (rep capable) */
-#define Stack (1<<14) /* Stack instruction (push/pop) */
-#define GroupMask (7<<15) /* Opcode uses one of the group mechanisms */
-#define Group (1<<15) /* Bits 3:5 of modrm byte extend opcode */
-#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */
-#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
-#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
-#define Sse (1<<18) /* SSE Vector instruction */
-/* Generic ModRM decode. */
-#define ModRM (1<<19)
-/* Destination is only written; never read. */
-#define Mov (1<<20)
-/* Misc flags */
-#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */
-#define VendorSpecific (1<<22) /* Vendor specific instruction */
-#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
-#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
-#define Undefined (1<<25) /* No Such Instruction */
-#define Lock (1<<26) /* lock prefix is allowed for the instruction */
-#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
-#define No64 (1<<28)
-#define PageTable (1 << 29) /* instruction used to write page table */
-/* Source 2 operand type */
-#define Src2Shift (30)
-#define Src2None (OpNone << Src2Shift)
-#define Src2CL (OpCL << Src2Shift)
-#define Src2ImmByte (OpImmByte << Src2Shift)
-#define Src2One (OpOne << Src2Shift)
-#define Src2Imm (OpImm << Src2Shift)
-#define Src2ES (OpES << Src2Shift)
-#define Src2CS (OpCS << Src2Shift)
-#define Src2SS (OpSS << Src2Shift)
-#define Src2DS (OpDS << Src2Shift)
-#define Src2FS (OpFS << Src2Shift)
-#define Src2GS (OpGS << Src2Shift)
-#define Src2Mask (OpMask << Src2Shift)
-
-#define X2(x...) x, x
-#define X3(x...) X2(x), x
-#define X4(x...) X2(x), X2(x)
-#define X5(x...) X4(x), x
-#define X6(x...) X4(x), X2(x)
-#define X7(x...) X4(x), X3(x)
-#define X8(x...) X4(x), X4(x)
-#define X16(x...) X8(x), X8(x)
-
-struct opcode {
- u64 flags : 56;
- u64 intercept : 8;
- union {
- int (*execute)(struct x86_emulate_ctxt *ctxt);
- struct opcode *group;
- struct group_dual *gdual;
- struct gprefix *gprefix;
- } u;
- int (*check_perm)(struct x86_emulate_ctxt *ctxt);
-};
-
-struct group_dual {
- struct opcode mod012[8];
- struct opcode mod3[8];
-};
-
-struct gprefix {
- struct opcode pfx_no;
- struct opcode pfx_66;
- struct opcode pfx_f2;
- struct opcode pfx_f3;
-};
-
-/* EFLAGS bit definitions. */
-#define EFLG_ID (1<<21)
-#define EFLG_VIP (1<<20)
-#define EFLG_VIF (1<<19)
-#define EFLG_AC (1<<18)
-#define EFLG_VM (1<<17)
-#define EFLG_RF (1<<16)
-#define EFLG_IOPL (3<<12)
-#define EFLG_NT (1<<14)
-#define EFLG_OF (1<<11)
-#define EFLG_DF (1<<10)
-#define EFLG_IF (1<<9)
-#define EFLG_TF (1<<8)
-#define EFLG_SF (1<<7)
-#define EFLG_ZF (1<<6)
-#define EFLG_AF (1<<4)
-#define EFLG_PF (1<<2)
-#define EFLG_CF (1<<0)
-
-#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
-#define EFLG_RESERVED_ONE_MASK 2
-
-/*
- * Instruction emulation:
- * Most instructions are emulated directly via a fragment of inline assembly
- * code. This allows us to save/restore EFLAGS and thus very easily pick up
- * any modified flags.
- */
-
-#if defined(CONFIG_X86_64)
-#define _LO32 "k" /* force 32-bit operand */
-#define _STK "%%rsp" /* stack pointer */
-#elif defined(__i386__)
-#define _LO32 "" /* force 32-bit operand */
-#define _STK "%%esp" /* stack pointer */
-#endif
-
-/*
- * These EFLAGS bits are restored from saved value during emulation, and
- * any changes are written back to the saved value after emulation.
- */
-#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
-
-/* Before executing instruction: restore necessary bits in EFLAGS. */
-#define _PRE_EFLAGS(_sav, _msk, _tmp) \
- /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
- "movl %"_sav",%"_LO32 _tmp"; " \
- "push %"_tmp"; " \
- "push %"_tmp"; " \
- "movl %"_msk",%"_LO32 _tmp"; " \
- "andl %"_LO32 _tmp",("_STK"); " \
- "pushf; " \
- "notl %"_LO32 _tmp"; " \
- "andl %"_LO32 _tmp",("_STK"); " \
- "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \
- "pop %"_tmp"; " \
- "orl %"_LO32 _tmp",("_STK"); " \
- "popf; " \
- "pop %"_sav"; "
-
-/* After executing instruction: write-back necessary bits in EFLAGS. */
-#define _POST_EFLAGS(_sav, _msk, _tmp) \
- /* _sav |= EFLAGS & _msk; */ \
- "pushf; " \
- "pop %"_tmp"; " \
- "andl %"_msk",%"_LO32 _tmp"; " \
- "orl %"_LO32 _tmp",%"_sav"; "
-
-#ifdef CONFIG_X86_64
-#define ON64(x) x
-#else
-#define ON64(x)
-#endif
-
-#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype) \
- do { \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "4", "2") \
- _op _suffix " %"_x"3,%1; " \
- _POST_EFLAGS("0", "4", "2") \
- : "=m" ((ctxt)->eflags), \
- "+q" (*(_dsttype*)&(ctxt)->dst.val), \
- "=&r" (_tmp) \
- : _y ((ctxt)->src.val), "i" (EFLAGS_MASK)); \
- } while (0)
-
-
-/* Raw emulation: instruction has two explicit operands. */
-#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy) \
- do { \
- unsigned long _tmp; \
- \
- switch ((ctxt)->dst.bytes) { \
- case 2: \
- ____emulate_2op(ctxt,_op,_wx,_wy,"w",u16); \
- break; \
- case 4: \
- ____emulate_2op(ctxt,_op,_lx,_ly,"l",u32); \
- break; \
- case 8: \
- ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \
- break; \
- } \
- } while (0)
-
-#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
- do { \
- unsigned long _tmp; \
- switch ((ctxt)->dst.bytes) { \
- case 1: \
- ____emulate_2op(ctxt,_op,_bx,_by,"b",u8); \
- break; \
- default: \
- __emulate_2op_nobyte(ctxt, _op, \
- _wx, _wy, _lx, _ly, _qx, _qy); \
- break; \
- } \
- } while (0)
-
-/* Source operand is byte-sized and may be restricted to just %cl. */
-#define emulate_2op_SrcB(ctxt, _op) \
- __emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c")
-
-/* Source operand is byte, word, long or quad sized. */
-#define emulate_2op_SrcV(ctxt, _op) \
- __emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r")
-
-/* Source operand is word, long or quad sized. */
-#define emulate_2op_SrcV_nobyte(ctxt, _op) \
- __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r")
-
-/* Instruction has three operands and one operand is stored in ECX register */
-#define __emulate_2op_cl(ctxt, _op, _suffix, _type) \
- do { \
- unsigned long _tmp; \
- _type _clv = (ctxt)->src2.val; \
- _type _srcv = (ctxt)->src.val; \
- _type _dstv = (ctxt)->dst.val; \
- \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "5", "2") \
- _op _suffix " %4,%1 \n" \
- _POST_EFLAGS("0", "5", "2") \
- : "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \
- : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
- ); \
- \
- (ctxt)->src2.val = (unsigned long) _clv; \
- (ctxt)->src2.val = (unsigned long) _srcv; \
- (ctxt)->dst.val = (unsigned long) _dstv; \
- } while (0)
-
-#define emulate_2op_cl(ctxt, _op) \
- do { \
- switch ((ctxt)->dst.bytes) { \
- case 2: \
- __emulate_2op_cl(ctxt, _op, "w", u16); \
- break; \
- case 4: \
- __emulate_2op_cl(ctxt, _op, "l", u32); \
- break; \
- case 8: \
- ON64(__emulate_2op_cl(ctxt, _op, "q", ulong)); \
- break; \
- } \
- } while (0)
-
-#define __emulate_1op(ctxt, _op, _suffix) \
- do { \
- unsigned long _tmp; \
- \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "3", "2") \
- _op _suffix " %1; " \
- _POST_EFLAGS("0", "3", "2") \
- : "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \
- "=&r" (_tmp) \
- : "i" (EFLAGS_MASK)); \
- } while (0)
-
-/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(ctxt, _op) \
- do { \
- switch ((ctxt)->dst.bytes) { \
- case 1: __emulate_1op(ctxt, _op, "b"); break; \
- case 2: __emulate_1op(ctxt, _op, "w"); break; \
- case 4: __emulate_1op(ctxt, _op, "l"); break; \
- case 8: ON64(__emulate_1op(ctxt, _op, "q")); break; \
- } \
- } while (0)
-
-#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \
- do { \
- unsigned long _tmp; \
- ulong *rax = &(ctxt)->regs[VCPU_REGS_RAX]; \
- ulong *rdx = &(ctxt)->regs[VCPU_REGS_RDX]; \
- \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "5", "1") \
- "1: \n\t" \
- _op _suffix " %6; " \
- "2: \n\t" \
- _POST_EFLAGS("0", "5", "1") \
- ".pushsection .fixup,\"ax\" \n\t" \
- "3: movb $1, %4 \n\t" \
- "jmp 2b \n\t" \
- ".popsection \n\t" \
- _ASM_EXTABLE(1b, 3b) \
- : "=m" ((ctxt)->eflags), "=&r" (_tmp), \
- "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \
- : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val), \
- "a" (*rax), "d" (*rdx)); \
- } while (0)
-
-/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
-#define emulate_1op_rax_rdx(ctxt, _op, _ex) \
- do { \
- switch((ctxt)->src.bytes) { \
- case 1: \
- __emulate_1op_rax_rdx(ctxt, _op, "b", _ex); \
- break; \
- case 2: \
- __emulate_1op_rax_rdx(ctxt, _op, "w", _ex); \
- break; \
- case 4: \
- __emulate_1op_rax_rdx(ctxt, _op, "l", _ex); \
- break; \
- case 8: ON64( \
- __emulate_1op_rax_rdx(ctxt, _op, "q", _ex)); \
- break; \
- } \
- } while (0)
-
-static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
- enum x86_intercept intercept,
- enum x86_intercept_stage stage)
-{
- struct x86_instruction_info info = {
- .intercept = intercept,
- .rep_prefix = ctxt->rep_prefix,
- .modrm_mod = ctxt->modrm_mod,
- .modrm_reg = ctxt->modrm_reg,
- .modrm_rm = ctxt->modrm_rm,
- .src_val = ctxt->src.val64,
- .src_bytes = ctxt->src.bytes,
- .dst_bytes = ctxt->dst.bytes,
- .ad_bytes = ctxt->ad_bytes,
- .next_rip = ctxt->eip,
- };
-
- return ctxt->ops->intercept(ctxt, &info, stage);
-}
-
-static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
-{
- return (1UL << (ctxt->ad_bytes << 3)) - 1;
-}
-
-/* Access/update address held in a register, based on addressing mode. */
-static inline unsigned long
-address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
-{
- if (ctxt->ad_bytes == sizeof(unsigned long))
- return reg;
- else
- return reg & ad_mask(ctxt);
-}
-
-static inline unsigned long
-register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg)
-{
- return address_mask(ctxt, reg);
-}
-
-static inline void
-register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc)
-{
- if (ctxt->ad_bytes == sizeof(unsigned long))
- *reg += inc;
- else
- *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt));
-}
-
-static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
-{
- register_address_increment(ctxt, &ctxt->_eip, rel);
-}
-
-static u32 desc_limit_scaled(struct desc_struct *desc)
-{
- u32 limit = get_desc_limit(desc);
-
- return desc->g ? (limit << 12) | 0xfff : limit;
-}
-
-static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg)
-{
- ctxt->has_seg_override = true;
- ctxt->seg_override = seg;
-}
-
-static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
-{
- if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
- return 0;
-
- return ctxt->ops->get_cached_segment_base(ctxt, seg);
-}
-
-static unsigned seg_override(struct x86_emulate_ctxt *ctxt)
-{
- if (!ctxt->has_seg_override)
- return 0;
-
- return ctxt->seg_override;
-}
-
-static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
- u32 error, bool valid)
-{
- ctxt->exception.vector = vec;
- ctxt->exception.error_code = error;
- ctxt->exception.error_code_valid = valid;
- return X86EMUL_PROPAGATE_FAULT;
-}
-
-static int emulate_db(struct x86_emulate_ctxt *ctxt)
-{
- return emulate_exception(ctxt, DB_VECTOR, 0, false);
-}
-
-static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
-{
- return emulate_exception(ctxt, GP_VECTOR, err, true);
-}
-
-static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err)
-{
- return emulate_exception(ctxt, SS_VECTOR, err, true);
-}
-
-static int emulate_ud(struct x86_emulate_ctxt *ctxt)
-{
- return emulate_exception(ctxt, UD_VECTOR, 0, false);
-}
-
-static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
-{
- return emulate_exception(ctxt, TS_VECTOR, err, true);
-}
-
-static int emulate_de(struct x86_emulate_ctxt *ctxt)
-{
- return emulate_exception(ctxt, DE_VECTOR, 0, false);
-}
-
-static int emulate_nm(struct x86_emulate_ctxt *ctxt)
-{
- return emulate_exception(ctxt, NM_VECTOR, 0, false);
-}
-
-static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg)
-{
- u16 selector;
- struct desc_struct desc;
-
- ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg);
- return selector;
-}
-
-static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
- unsigned seg)
-{
- u16 dummy;
- u32 base3;
- struct desc_struct desc;
-
- ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg);
- ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
-}
-
-static int __linearize(struct x86_emulate_ctxt *ctxt,
- struct segmented_address addr,
- unsigned size, bool write, bool fetch,
- ulong *linear)
-{
- struct desc_struct desc;
- bool usable;
- ulong la;
- u32 lim;
- u16 sel;
- unsigned cpl, rpl;
-
- la = seg_base(ctxt, addr.seg) + addr.ea;
- switch (ctxt->mode) {
- case X86EMUL_MODE_REAL:
- break;
- case X86EMUL_MODE_PROT64:
- if (((signed long)la << 16) >> 16 != la)
- return emulate_gp(ctxt, 0);
- break;
- default:
- usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL,
- addr.seg);
- if (!usable)
- goto bad;
- /* code segment or read-only data segment */
- if (((desc.type & 8) || !(desc.type & 2)) && write)
- goto bad;
- /* unreadable code segment */
- if (!fetch && (desc.type & 8) && !(desc.type & 2))
- goto bad;
- lim = desc_limit_scaled(&desc);
- if ((desc.type & 8) || !(desc.type & 4)) {
- /* expand-up segment */
- if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
- goto bad;
- } else {
- /* exapand-down segment */
- if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim)
- goto bad;
- lim = desc.d ? 0xffffffff : 0xffff;
- if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
- goto bad;
- }
- cpl = ctxt->ops->cpl(ctxt);
- rpl = sel & 3;
- cpl = max(cpl, rpl);
- if (!(desc.type & 8)) {
- /* data segment */
- if (cpl > desc.dpl)
- goto bad;
- } else if ((desc.type & 8) && !(desc.type & 4)) {
- /* nonconforming code segment */
- if (cpl != desc.dpl)
- goto bad;
- } else if ((desc.type & 8) && (desc.type & 4)) {
- /* conforming code segment */
- if (cpl < desc.dpl)
- goto bad;
- }
- break;
- }
- if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8)
- la &= (u32)-1;
- *linear = la;
- return X86EMUL_CONTINUE;
-bad:
- if (addr.seg == VCPU_SREG_SS)
- return emulate_ss(ctxt, addr.seg);
- else
- return emulate_gp(ctxt, addr.seg);
-}
-
-static int linearize(struct x86_emulate_ctxt *ctxt,
- struct segmented_address addr,
- unsigned size, bool write,
- ulong *linear)
-{
- return __linearize(ctxt, addr, size, write, false, linear);
-}
-
-
-static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
- struct segmented_address addr,
- void *data,
- unsigned size)
-{
- int rc;
- ulong linear;
-
- rc = linearize(ctxt, addr, size, false, &linear);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
-}
-
-/*
- * Fetch the next byte of the instruction being emulated which is pointed to
- * by ctxt->_eip, then increment ctxt->_eip.
- *
- * Also prefetch the remaining bytes of the instruction without crossing page
- * boundary if they are not in fetch_cache yet.
- */
-static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, u8 *dest)
-{
- struct fetch_cache *fc = &ctxt->fetch;
- int rc;
- int size, cur_size;
-
- if (ctxt->_eip == fc->end) {
- unsigned long linear;
- struct segmented_address addr = { .seg = VCPU_SREG_CS,
- .ea = ctxt->_eip };
- cur_size = fc->end - fc->start;
- size = min(15UL - cur_size,
- PAGE_SIZE - offset_in_page(ctxt->_eip));
- rc = __linearize(ctxt, addr, size, false, true, &linear);
- if (unlikely(rc != X86EMUL_CONTINUE))
- return rc;
- rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size,
- size, &ctxt->exception);
- if (unlikely(rc != X86EMUL_CONTINUE))
- return rc;
- fc->end += size;
- }
- *dest = fc->data[ctxt->_eip - fc->start];
- ctxt->_eip++;
- return X86EMUL_CONTINUE;
-}
-
-static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
- void *dest, unsigned size)
-{
- int rc;
-
- /* x86 instructions are limited to 15 bytes. */
- if (unlikely(ctxt->_eip + size - ctxt->eip > 15))
- return X86EMUL_UNHANDLEABLE;
- while (size--) {
- rc = do_insn_fetch_byte(ctxt, dest++);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- }
- return X86EMUL_CONTINUE;
-}
-
-/* Fetch next part of the instruction being emulated. */
-#define insn_fetch(_type, _ctxt) \
-({ unsigned long _x; \
- rc = do_insn_fetch(_ctxt, &_x, sizeof(_type)); \
- if (rc != X86EMUL_CONTINUE) \
- goto done; \
- (_type)_x; \
-})
-
-#define insn_fetch_arr(_arr, _size, _ctxt) \
-({ rc = do_insn_fetch(_ctxt, _arr, (_size)); \
- if (rc != X86EMUL_CONTINUE) \
- goto done; \
-})
-
-/*
- * Given the 'reg' portion of a ModRM byte, and a register block, return a
- * pointer into the block that addresses the relevant register.
- * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
- */
-static void *decode_register(u8 modrm_reg, unsigned long *regs,
- int highbyte_regs)
-{
- void *p;
-
- p = &regs[modrm_reg];
- if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
- p = (unsigned char *)&regs[modrm_reg & 3] + 1;
- return p;
-}
-
-static int read_descriptor(struct x86_emulate_ctxt *ctxt,
- struct segmented_address addr,
- u16 *size, unsigned long *address, int op_bytes)
-{
- int rc;
-
- if (op_bytes == 2)
- op_bytes = 3;
- *address = 0;
- rc = segmented_read_std(ctxt, addr, size, 2);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- addr.ea += 2;
- rc = segmented_read_std(ctxt, addr, address, op_bytes);
- return rc;
-}
-
-static int test_cc(unsigned int condition, unsigned int flags)
-{
- int rc = 0;
-
- switch ((condition & 15) >> 1) {
- case 0: /* o */
- rc |= (flags & EFLG_OF);
- break;
- case 1: /* b/c/nae */
- rc |= (flags & EFLG_CF);
- break;
- case 2: /* z/e */
- rc |= (flags & EFLG_ZF);
- break;
- case 3: /* be/na */
- rc |= (flags & (EFLG_CF|EFLG_ZF));
- break;
- case 4: /* s */
- rc |= (flags & EFLG_SF);
- break;
- case 5: /* p/pe */
- rc |= (flags & EFLG_PF);
- break;
- case 7: /* le/ng */
- rc |= (flags & EFLG_ZF);
- /* fall through */
- case 6: /* l/nge */
- rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
- break;
- }
-
- /* Odd condition identifiers (lsb == 1) have inverted sense. */
- return (!!rc ^ (condition & 1));
-}
-
-static void fetch_register_operand(struct operand *op)
-{
- switch (op->bytes) {
- case 1:
- op->val = *(u8 *)op->addr.reg;
- break;
- case 2:
- op->val = *(u16 *)op->addr.reg;
- break;
- case 4:
- op->val = *(u32 *)op->addr.reg;
- break;
- case 8:
- op->val = *(u64 *)op->addr.reg;
- break;
- }
-}
-
-static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
-{
- ctxt->ops->get_fpu(ctxt);
- switch (reg) {
- case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break;
- case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break;
- case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break;
- case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break;
- case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break;
- case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break;
- case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break;
- case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break;
-#ifdef CONFIG_X86_64
- case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break;
- case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break;
- case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break;
- case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break;
- case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break;
- case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break;
- case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break;
- case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break;
-#endif
- default: BUG();
- }
- ctxt->ops->put_fpu(ctxt);
-}
-
-static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
- int reg)
-{
- ctxt->ops->get_fpu(ctxt);
- switch (reg) {
- case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break;
- case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break;
- case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break;
- case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break;
- case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break;
- case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break;
- case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break;
- case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break;
-#ifdef CONFIG_X86_64
- case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break;
- case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break;
- case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break;
- case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break;
- case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break;
- case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break;
- case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break;
- case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break;
-#endif
- default: BUG();
- }
- ctxt->ops->put_fpu(ctxt);
-}
-
-static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
- struct operand *op)
-{
- unsigned reg = ctxt->modrm_reg;
- int highbyte_regs = ctxt->rex_prefix == 0;
-
- if (!(ctxt->d & ModRM))
- reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
-
- if (ctxt->d & Sse) {
- op->type = OP_XMM;
- op->bytes = 16;
- op->addr.xmm = reg;
- read_sse_reg(ctxt, &op->vec_val, reg);
- return;
- }
-
- op->type = OP_REG;
- if (ctxt->d & ByteOp) {
- op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
- op->bytes = 1;
- } else {
- op->addr.reg = decode_register(reg, ctxt->regs, 0);
- op->bytes = ctxt->op_bytes;
- }
- fetch_register_operand(op);
- op->orig_val = op->val;
-}
-
-static int decode_modrm(struct x86_emulate_ctxt *ctxt,
- struct operand *op)
-{
- u8 sib;
- int index_reg = 0, base_reg = 0, scale;
- int rc = X86EMUL_CONTINUE;
- ulong modrm_ea = 0;
-
- if (ctxt->rex_prefix) {
- ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1; /* REX.R */
- index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */
- ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
- }
-
- ctxt->modrm = insn_fetch(u8, ctxt);
- ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
- ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
- ctxt->modrm_rm |= (ctxt->modrm & 0x07);
- ctxt->modrm_seg = VCPU_SREG_DS;
-
- if (ctxt->modrm_mod == 3) {
- op->type = OP_REG;
- op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
- op->addr.reg = decode_register(ctxt->modrm_rm,
- ctxt->regs, ctxt->d & ByteOp);
- if (ctxt->d & Sse) {
- op->type = OP_XMM;
- op->bytes = 16;
- op->addr.xmm = ctxt->modrm_rm;
- read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
- return rc;
- }
- fetch_register_operand(op);
- return rc;
- }
-
- op->type = OP_MEM;
-
- if (ctxt->ad_bytes == 2) {
- unsigned bx = ctxt->regs[VCPU_REGS_RBX];
- unsigned bp = ctxt->regs[VCPU_REGS_RBP];
- unsigned si = ctxt->regs[VCPU_REGS_RSI];
- unsigned di = ctxt->regs[VCPU_REGS_RDI];
-
- /* 16-bit ModR/M decode. */
- switch (ctxt->modrm_mod) {
- case 0:
- if (ctxt->modrm_rm == 6)
- modrm_ea += insn_fetch(u16, ctxt);
- break;
- case 1:
- modrm_ea += insn_fetch(s8, ctxt);
- break;
- case 2:
- modrm_ea += insn_fetch(u16, ctxt);
- break;
- }
- switch (ctxt->modrm_rm) {
- case 0:
- modrm_ea += bx + si;
- break;
- case 1:
- modrm_ea += bx + di;
- break;
- case 2:
- modrm_ea += bp + si;
- break;
- case 3:
- modrm_ea += bp + di;
- break;
- case 4:
- modrm_ea += si;
- break;
- case 5:
- modrm_ea += di;
- break;
- case 6:
- if (ctxt->modrm_mod != 0)
- modrm_ea += bp;
- break;
- case 7:
- modrm_ea += bx;
- break;
- }
- if (ctxt->modrm_rm == 2 || ctxt->modrm_rm == 3 ||
- (ctxt->modrm_rm == 6 && ctxt->modrm_mod != 0))
- ctxt->modrm_seg = VCPU_SREG_SS;
- modrm_ea = (u16)modrm_ea;
- } else {
- /* 32/64-bit ModR/M decode. */
- if ((ctxt->modrm_rm & 7) == 4) {
- sib = insn_fetch(u8, ctxt);
- index_reg |= (sib >> 3) & 7;
- base_reg |= sib & 7;
- scale = sib >> 6;
-
- if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
- modrm_ea += insn_fetch(s32, ctxt);
- else
- modrm_ea += ctxt->regs[base_reg];
- if (index_reg != 4)
- modrm_ea += ctxt->regs[index_reg] << scale;
- } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
- if (ctxt->mode == X86EMUL_MODE_PROT64)
- ctxt->rip_relative = 1;
- } else
- modrm_ea += ctxt->regs[ctxt->modrm_rm];
- switch (ctxt->modrm_mod) {
- case 0:
- if (ctxt->modrm_rm == 5)
- modrm_ea += insn_fetch(s32, ctxt);
- break;
- case 1:
- modrm_ea += insn_fetch(s8, ctxt);
- break;
- case 2:
- modrm_ea += insn_fetch(s32, ctxt);
- break;
- }
- }
- op->addr.mem.ea = modrm_ea;
-done:
- return rc;
-}
-
-static int decode_abs(struct x86_emulate_ctxt *ctxt,
- struct operand *op)
-{
- int rc = X86EMUL_CONTINUE;
-
- op->type = OP_MEM;
- switch (ctxt->ad_bytes) {
- case 2:
- op->addr.mem.ea = insn_fetch(u16, ctxt);
- break;
- case 4:
- op->addr.mem.ea = insn_fetch(u32, ctxt);
- break;
- case 8:
- op->addr.mem.ea = insn_fetch(u64, ctxt);
- break;
- }
-done:
- return rc;
-}
-
-static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
-{
- long sv = 0, mask;
-
- if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) {
- mask = ~(ctxt->dst.bytes * 8 - 1);
-
- if (ctxt->src.bytes == 2)
- sv = (s16)ctxt->src.val & (s16)mask;
- else if (ctxt->src.bytes == 4)
- sv = (s32)ctxt->src.val & (s32)mask;
-
- ctxt->dst.addr.mem.ea += (sv >> 3);
- }
-
- /* only subword offset */
- ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
-}
-
-static int read_emulated(struct x86_emulate_ctxt *ctxt,
- unsigned long addr, void *dest, unsigned size)
-{
- int rc;
- struct read_cache *mc = &ctxt->mem_read;
-
- while (size) {
- int n = min(size, 8u);
- size -= n;
- if (mc->pos < mc->end)
- goto read_cached;
-
- rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
- &ctxt->exception);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- mc->end += n;
-
- read_cached:
- memcpy(dest, mc->data + mc->pos, n);
- mc->pos += n;
- dest += n;
- addr += n;
- }
- return X86EMUL_CONTINUE;
-}
-
-static int segmented_read(struct x86_emulate_ctxt *ctxt,
- struct segmented_address addr,
- void *data,
- unsigned size)
-{
- int rc;
- ulong linear;
-
- rc = linearize(ctxt, addr, size, false, &linear);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- return read_emulated(ctxt, linear, data, size);
-}
-
-static int segmented_write(struct x86_emulate_ctxt *ctxt,
- struct segmented_address addr,
- const void *data,
- unsigned size)
-{
- int rc;
- ulong linear;
-
- rc = linearize(ctxt, addr, size, true, &linear);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- return ctxt->ops->write_emulated(ctxt, linear, data, size,
- &ctxt->exception);
-}
-
-static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt,
- struct segmented_address addr,
- const void *orig_data, const void *data,
- unsigned size)
-{
- int rc;
- ulong linear;
-
- rc = linearize(ctxt, addr, size, true, &linear);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data,
- size, &ctxt->exception);
-}
-
-static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
- unsigned int size, unsigned short port,
- void *dest)
-{
- struct read_cache *rc = &ctxt->io_read;
-
- if (rc->pos == rc->end) { /* refill pio read ahead */
- unsigned int in_page, n;
- unsigned int count = ctxt->rep_prefix ?
- address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1;
- in_page = (ctxt->eflags & EFLG_DF) ?
- offset_in_page(ctxt->regs[VCPU_REGS_RDI]) :
- PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]);
- n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
- count);
- if (n == 0)
- n = 1;
- rc->pos = rc->end = 0;
- if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n))
- return 0;
- rc->end = n * size;
- }
-
- memcpy(dest, rc->data + rc->pos, size);
- rc->pos += size;
- return 1;
-}
-
-static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt,
- u16 index, struct desc_struct *desc)
-{
- struct desc_ptr dt;
- ulong addr;
-
- ctxt->ops->get_idt(ctxt, &dt);
-
- if (dt.size < index * 8 + 7)
- return emulate_gp(ctxt, index << 3 | 0x2);
-
- addr = dt.address + index * 8;
- return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
- &ctxt->exception);
-}
-
-static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
- u16 selector, struct desc_ptr *dt)
-{
- struct x86_emulate_ops *ops = ctxt->ops;
-
- if (selector & 1 << 2) {
- struct desc_struct desc;
- u16 sel;
-
- memset (dt, 0, sizeof *dt);
- if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR))
- return;
-
- dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
- dt->address = get_desc_base(&desc);
- } else
- ops->get_gdt(ctxt, dt);
-}
-
-/* allowed just for 8 bytes segments */
-static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
- u16 selector, struct desc_struct *desc)
-{
- struct desc_ptr dt;
- u16 index = selector >> 3;
- ulong addr;
-
- get_descriptor_table_ptr(ctxt, selector, &dt);
-
- if (dt.size < index * 8 + 7)
- return emulate_gp(ctxt, selector & 0xfffc);
-
- addr = dt.address + index * 8;
- return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
- &ctxt->exception);
-}
-
-/* allowed just for 8 bytes segments */
-static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
- u16 selector, struct desc_struct *desc)
-{
- struct desc_ptr dt;
- u16 index = selector >> 3;
- ulong addr;
-
- get_descriptor_table_ptr(ctxt, selector, &dt);
-
- if (dt.size < index * 8 + 7)
- return emulate_gp(ctxt, selector & 0xfffc);
-
- addr = dt.address + index * 8;
- return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc,
- &ctxt->exception);
-}
-
-/* Does not support long mode */
-static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
- u16 selector, int seg)
-{
- struct desc_struct seg_desc;
- u8 dpl, rpl, cpl;
- unsigned err_vec = GP_VECTOR;
- u32 err_code = 0;
- bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
- int ret;
-
- memset(&seg_desc, 0, sizeof seg_desc);
-
- if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
- || ctxt->mode == X86EMUL_MODE_REAL) {
- /* set real mode segment descriptor */
- set_desc_base(&seg_desc, selector << 4);
- set_desc_limit(&seg_desc, 0xffff);
- seg_desc.type = 3;
- seg_desc.p = 1;
- seg_desc.s = 1;
- if (ctxt->mode == X86EMUL_MODE_VM86)
- seg_desc.dpl = 3;
- goto load;
- }
-
- /* NULL selector is not valid for TR, CS and SS */
- if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
- && null_selector)
- goto exception;
-
- /* TR should be in GDT only */
- if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
- goto exception;
-
- if (null_selector) /* for NULL selector skip all following checks */
- goto load;
-
- ret = read_segment_descriptor(ctxt, selector, &seg_desc);
- if (ret != X86EMUL_CONTINUE)
- return ret;
-
- err_code = selector & 0xfffc;
- err_vec = GP_VECTOR;
-
- /* can't load system descriptor into segment selecor */
- if (seg <= VCPU_SREG_GS && !seg_desc.s)
- goto exception;
-
- if (!seg_desc.p) {
- err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
- goto exception;
- }
-
- rpl = selector & 3;
- dpl = seg_desc.dpl;
- cpl = ctxt->ops->cpl(ctxt);
-
- switch (seg) {
- case VCPU_SREG_SS:
- /*
- * segment is not a writable data segment or segment
- * selector's RPL != CPL or segment selector's RPL != CPL
- */
- if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
- goto exception;
- break;
- case VCPU_SREG_CS:
- if (!(seg_desc.type & 8))
- goto exception;
-
- if (seg_desc.type & 4) {
- /* conforming */
- if (dpl > cpl)
- goto exception;
- } else {
- /* nonconforming */
- if (rpl > cpl || dpl != cpl)
- goto exception;
- }
- /* CS(RPL) <- CPL */
- selector = (selector & 0xfffc) | cpl;
- break;
- case VCPU_SREG_TR:
- if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
- goto exception;
- break;
- case VCPU_SREG_LDTR:
- if (seg_desc.s || seg_desc.type != 2)
- goto exception;
- break;
- default: /* DS, ES, FS, or GS */
- /*
- * segment is not a data or readable code segment or
- * ((segment is a data or nonconforming code segment)
- * and (both RPL and CPL > DPL))
- */
- if ((seg_desc.type & 0xa) == 0x8 ||
- (((seg_desc.type & 0xc) != 0xc) &&
- (rpl > dpl && cpl > dpl)))
- goto exception;
- break;
- }
-
- if (seg_desc.s) {
- /* mark segment as accessed */
- seg_desc.type |= 1;
- ret = write_segment_descriptor(ctxt, selector, &seg_desc);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- }
-load:
- ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
- return X86EMUL_CONTINUE;
-exception:
- emulate_exception(ctxt, err_vec, err_code, true);
- return X86EMUL_PROPAGATE_FAULT;
-}
-
-static void write_register_operand(struct operand *op)
-{
- /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
- switch (op->bytes) {
- case 1:
- *(u8 *)op->addr.reg = (u8)op->val;
- break;
- case 2:
- *(u16 *)op->addr.reg = (u16)op->val;
- break;
- case 4:
- *op->addr.reg = (u32)op->val;
- break; /* 64b: zero-extend */
- case 8:
- *op->addr.reg = op->val;
- break;
- }
-}
-
-static int writeback(struct x86_emulate_ctxt *ctxt)
-{
- int rc;
-
- switch (ctxt->dst.type) {
- case OP_REG:
- write_register_operand(&ctxt->dst);
- break;
- case OP_MEM:
- if (ctxt->lock_prefix)
- rc = segmented_cmpxchg(ctxt,
- ctxt->dst.addr.mem,
- &ctxt->dst.orig_val,
- &ctxt->dst.val,
- ctxt->dst.bytes);
- else
- rc = segmented_write(ctxt,
- ctxt->dst.addr.mem,
- &ctxt->dst.val,
- ctxt->dst.bytes);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- break;
- case OP_XMM:
- write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
- break;
- case OP_NONE:
- /* no writeback */
- break;
- default:
- break;
- }
- return X86EMUL_CONTINUE;
-}
-
-static int em_push(struct x86_emulate_ctxt *ctxt)
-{
- struct segmented_address addr;
-
- register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes);
- addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
- addr.seg = VCPU_SREG_SS;
-
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes);
-}
-
-static int emulate_pop(struct x86_emulate_ctxt *ctxt,
- void *dest, int len)
-{
- int rc;
- struct segmented_address addr;
-
- addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
- addr.seg = VCPU_SREG_SS;
- rc = segmented_read(ctxt, addr, dest, len);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len);
- return rc;
-}
-
-static int em_pop(struct x86_emulate_ctxt *ctxt)
-{
- return emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
-}
-
-static int emulate_popf(struct x86_emulate_ctxt *ctxt,
- void *dest, int len)
-{
- int rc;
- unsigned long val, change_mask;
- int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
- int cpl = ctxt->ops->cpl(ctxt);
-
- rc = emulate_pop(ctxt, &val, len);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
- | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID;
-
- switch(ctxt->mode) {
- case X86EMUL_MODE_PROT64:
- case X86EMUL_MODE_PROT32:
- case X86EMUL_MODE_PROT16:
- if (cpl == 0)
- change_mask |= EFLG_IOPL;
- if (cpl <= iopl)
- change_mask |= EFLG_IF;
- break;
- case X86EMUL_MODE_VM86:
- if (iopl < 3)
- return emulate_gp(ctxt, 0);
- change_mask |= EFLG_IF;
- break;
- default: /* real mode */
- change_mask |= (EFLG_IOPL | EFLG_IF);
- break;
- }
-
- *(unsigned long *)dest =
- (ctxt->eflags & ~change_mask) | (val & change_mask);
-
- return rc;
-}
-
-static int em_popf(struct x86_emulate_ctxt *ctxt)
-{
- ctxt->dst.type = OP_REG;
- ctxt->dst.addr.reg = &ctxt->eflags;
- ctxt->dst.bytes = ctxt->op_bytes;
- return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
-}
-
-static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
-{
- int seg = ctxt->src2.val;
-
- ctxt->src.val = get_segment_selector(ctxt, seg);
-
- return em_push(ctxt);
-}
-
-static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
-{
- int seg = ctxt->src2.val;
- unsigned long selector;
- int rc;
-
- rc = emulate_pop(ctxt, &selector, ctxt->op_bytes);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- rc = load_segment_descriptor(ctxt, (u16)selector, seg);
- return rc;
-}
-
-static int em_pusha(struct x86_emulate_ctxt *ctxt)
-{
- unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP];
- int rc = X86EMUL_CONTINUE;
- int reg = VCPU_REGS_RAX;
-
- while (reg <= VCPU_REGS_RDI) {
- (reg == VCPU_REGS_RSP) ?
- (ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]);
-
- rc = em_push(ctxt);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- ++reg;
- }
-
- return rc;
-}
-
-static int em_pushf(struct x86_emulate_ctxt *ctxt)
-{
- ctxt->src.val = (unsigned long)ctxt->eflags;
- return em_push(ctxt);
-}
-
-static int em_popa(struct x86_emulate_ctxt *ctxt)
-{
- int rc = X86EMUL_CONTINUE;
- int reg = VCPU_REGS_RDI;
-
- while (reg >= VCPU_REGS_RAX) {
- if (reg == VCPU_REGS_RSP) {
- register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP],
- ctxt->op_bytes);
- --reg;
- }
-
- rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes);
- if (rc != X86EMUL_CONTINUE)
- break;
- --reg;
- }
- return rc;
-}
-
-int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
-{
- struct x86_emulate_ops *ops = ctxt->ops;
- int rc;
- struct desc_ptr dt;
- gva_t cs_addr;
- gva_t eip_addr;
- u16 cs, eip;
-
- /* TODO: Add limit checks */
- ctxt->src.val = ctxt->eflags;
- rc = em_push(ctxt);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
-
- ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
- rc = em_push(ctxt);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- ctxt->src.val = ctxt->_eip;
- rc = em_push(ctxt);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- ops->get_idt(ctxt, &dt);
-
- eip_addr = dt.address + (irq << 2);
- cs_addr = dt.address + (irq << 2) + 2;
-
- rc = ops->read_std(ctxt, cs_addr, &cs, 2, &ctxt->exception);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- rc = ops->read_std(ctxt, eip_addr, &eip, 2, &ctxt->exception);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- rc = load_segment_descriptor(ctxt, cs, VCPU_SREG_CS);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- ctxt->_eip = eip;
-
- return rc;
-}
-
-static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
-{
- switch(ctxt->mode) {
- case X86EMUL_MODE_REAL:
- return emulate_int_real(ctxt, irq);
- case X86EMUL_MODE_VM86:
- case X86EMUL_MODE_PROT16:
- case X86EMUL_MODE_PROT32:
- case X86EMUL_MODE_PROT64:
- default:
- /* Protected mode interrupts unimplemented yet */
- return X86EMUL_UNHANDLEABLE;
- }
-}
-
-static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
-{
- int rc = X86EMUL_CONTINUE;
- unsigned long temp_eip = 0;
- unsigned long temp_eflags = 0;
- unsigned long cs = 0;
- unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF |
- EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF |
- EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */
- unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP;
-
- /* TODO: Add stack limit check */
-
- rc = emulate_pop(ctxt, &temp_eip, ctxt->op_bytes);
-
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- if (temp_eip & ~0xffff)
- return emulate_gp(ctxt, 0);
-
- rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
-
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- rc = emulate_pop(ctxt, &temp_eflags, ctxt->op_bytes);
-
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
-
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- ctxt->_eip = temp_eip;
-
-
- if (ctxt->op_bytes == 4)
- ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
- else if (ctxt->op_bytes == 2) {
- ctxt->eflags &= ~0xffff;
- ctxt->eflags |= temp_eflags;
- }
-
- ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
- ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
-
- return rc;
-}
-
-static int em_iret(struct x86_emulate_ctxt *ctxt)
-{
- switch(ctxt->mode) {
- case X86EMUL_MODE_REAL:
- return emulate_iret_real(ctxt);
- case X86EMUL_MODE_VM86:
- case X86EMUL_MODE_PROT16:
- case X86EMUL_MODE_PROT32:
- case X86EMUL_MODE_PROT64:
- default:
- /* iret from protected mode unimplemented yet */
- return X86EMUL_UNHANDLEABLE;
- }
-}
-
-static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
-{
- int rc;
- unsigned short sel;
-
- memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
-
- rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- ctxt->_eip = 0;
- memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes);
- return X86EMUL_CONTINUE;
-}
-
-static int em_grp2(struct x86_emulate_ctxt *ctxt)
-{
- switch (ctxt->modrm_reg) {
- case 0: /* rol */
- emulate_2op_SrcB(ctxt, "rol");
- break;
- case 1: /* ror */
- emulate_2op_SrcB(ctxt, "ror");
- break;
- case 2: /* rcl */
- emulate_2op_SrcB(ctxt, "rcl");
- break;
- case 3: /* rcr */
- emulate_2op_SrcB(ctxt, "rcr");
- break;
- case 4: /* sal/shl */
- case 6: /* sal/shl */
- emulate_2op_SrcB(ctxt, "sal");
- break;
- case 5: /* shr */
- emulate_2op_SrcB(ctxt, "shr");
- break;
- case 7: /* sar */
- emulate_2op_SrcB(ctxt, "sar");
- break;
- }
- return X86EMUL_CONTINUE;
-}
-
-static int em_not(struct x86_emulate_ctxt *ctxt)
-{
- ctxt->dst.val = ~ctxt->dst.val;
- return X86EMUL_CONTINUE;
-}
-
-static int em_neg(struct x86_emulate_ctxt *ctxt)
-{
- emulate_1op(ctxt, "neg");
- return X86EMUL_CONTINUE;
-}
-
-static int em_mul_ex(struct x86_emulate_ctxt *ctxt)
-{
- u8 ex = 0;
-
- emulate_1op_rax_rdx(ctxt, "mul", ex);
- return X86EMUL_CONTINUE;
-}
-
-static int em_imul_ex(struct x86_emulate_ctxt *ctxt)
-{
- u8 ex = 0;
-
- emulate_1op_rax_rdx(ctxt, "imul", ex);
- return X86EMUL_CONTINUE;
-}
-
-static int em_div_ex(struct x86_emulate_ctxt *ctxt)
-{
- u8 de = 0;
-
- emulate_1op_rax_rdx(ctxt, "div", de);
- if (de)
- return emulate_de(ctxt);
- return X86EMUL_CONTINUE;
-}
-
-static int em_idiv_ex(struct x86_emulate_ctxt *ctxt)
-{
- u8 de = 0;
-
- emulate_1op_rax_rdx(ctxt, "idiv", de);
- if (de)
- return emulate_de(ctxt);
- return X86EMUL_CONTINUE;
-}
-
-static int em_grp45(struct x86_emulate_ctxt *ctxt)
-{
- int rc = X86EMUL_CONTINUE;
-
- switch (ctxt->modrm_reg) {
- case 0: /* inc */
- emulate_1op(ctxt, "inc");
- break;
- case 1: /* dec */
- emulate_1op(ctxt, "dec");
- break;
- case 2: /* call near abs */ {
- long int old_eip;
- old_eip = ctxt->_eip;
- ctxt->_eip = ctxt->src.val;
- ctxt->src.val = old_eip;
- rc = em_push(ctxt);
- break;
- }
- case 4: /* jmp abs */
- ctxt->_eip = ctxt->src.val;
- break;
- case 5: /* jmp far */
- rc = em_jmp_far(ctxt);
- break;
- case 6: /* push */
- rc = em_push(ctxt);
- break;
- }
- return rc;
-}
-
-static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
-{
- u64 old = ctxt->dst.orig_val64;
-
- if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) ||
- ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) {
- ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
- ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
- ctxt->eflags &= ~EFLG_ZF;
- } else {
- ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) |
- (u32) ctxt->regs[VCPU_REGS_RBX];
-
- ctxt->eflags |= EFLG_ZF;
- }
- return X86EMUL_CONTINUE;
-}
-
-static int em_ret(struct x86_emulate_ctxt *ctxt)
-{
- ctxt->dst.type = OP_REG;
- ctxt->dst.addr.reg = &ctxt->_eip;
- ctxt->dst.bytes = ctxt->op_bytes;
- return em_pop(ctxt);
-}
-
-static int em_ret_far(struct x86_emulate_ctxt *ctxt)
-{
- int rc;
- unsigned long cs;
-
- rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- if (ctxt->op_bytes == 4)
- ctxt->_eip = (u32)ctxt->_eip;
- rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
- return rc;
-}
-
-static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
-{
- /* Save real source value, then compare EAX against destination. */
- ctxt->src.orig_val = ctxt->src.val;
- ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
- emulate_2op_SrcV(ctxt, "cmp");
-
- if (ctxt->eflags & EFLG_ZF) {
- /* Success: write back to memory. */
- ctxt->dst.val = ctxt->src.orig_val;
- } else {
- /* Failure: write the value we saw to EAX. */
- ctxt->dst.type = OP_REG;
- ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
- }
- return X86EMUL_CONTINUE;
-}
-
-static int em_lseg(struct x86_emulate_ctxt *ctxt)
-{
- int seg = ctxt->src2.val;
- unsigned short sel;
- int rc;
-
- memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
-
- rc = load_segment_descriptor(ctxt, sel, seg);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- ctxt->dst.val = ctxt->src.val;
- return rc;
-}
-
-static void
-setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
- struct desc_struct *cs, struct desc_struct *ss)
-{
- u16 selector;
-
- memset(cs, 0, sizeof(struct desc_struct));
- ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
- memset(ss, 0, sizeof(struct desc_struct));
-
- cs->l = 0; /* will be adjusted later */
- set_desc_base(cs, 0); /* flat segment */
- cs->g = 1; /* 4kb granularity */
- set_desc_limit(cs, 0xfffff); /* 4GB limit */
- cs->type = 0x0b; /* Read, Execute, Accessed */
- cs->s = 1;
- cs->dpl = 0; /* will be adjusted later */
- cs->p = 1;
- cs->d = 1;
-
- set_desc_base(ss, 0); /* flat segment */
- set_desc_limit(ss, 0xfffff); /* 4GB limit */
- ss->g = 1; /* 4kb granularity */
- ss->s = 1;
- ss->type = 0x03; /* Read/Write, Accessed */
- ss->d = 1; /* 32bit stack segment */
- ss->dpl = 0;
- ss->p = 1;
-}
-
-static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
-{
- u32 eax, ebx, ecx, edx;
-
- eax = ecx = 0;
- return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)
- && ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
- && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
- && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
-}
-
-static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
-{
- struct x86_emulate_ops *ops = ctxt->ops;
- u32 eax, ebx, ecx, edx;
-
- /*
- * syscall should always be enabled in longmode - so only become
- * vendor specific (cpuid) if other modes are active...
- */
- if (ctxt->mode == X86EMUL_MODE_PROT64)
- return true;
-
- eax = 0x00000000;
- ecx = 0x00000000;
- if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) {
- /*
- * Intel ("GenuineIntel")
- * remark: Intel CPUs only support "syscall" in 64bit
- * longmode. Also an 64bit guest with a
- * 32bit compat-app running will #UD !! While this
- * behaviour can be fixed (by emulating) into AMD
- * response - CPUs of AMD can't behave like Intel.
- */
- if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx &&
- ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx &&
- edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx)
- return false;
-
- /* AMD ("AuthenticAMD") */
- if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx &&
- ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx &&
- edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
- return true;
-
- /* AMD ("AMDisbetter!") */
- if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx &&
- ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx &&
- edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx)
- return true;
- }
-
- /* default: (not Intel, not AMD), apply Intel's stricter rules... */
- return false;
-}
-
-static int em_syscall(struct x86_emulate_ctxt *ctxt)
-{
- struct x86_emulate_ops *ops = ctxt->ops;
- struct desc_struct cs, ss;
- u64 msr_data;
- u16 cs_sel, ss_sel;
- u64 efer = 0;
-
- /* syscall is not available in real mode */
- if (ctxt->mode == X86EMUL_MODE_REAL ||
- ctxt->mode == X86EMUL_MODE_VM86)
- return emulate_ud(ctxt);
-
- if (!(em_syscall_is_enabled(ctxt)))
- return emulate_ud(ctxt);
-
- ops->get_msr(ctxt, MSR_EFER, &efer);
- setup_syscalls_segments(ctxt, &cs, &ss);
-
- if (!(efer & EFER_SCE))
- return emulate_ud(ctxt);
-
- ops->get_msr(ctxt, MSR_STAR, &msr_data);
- msr_data >>= 32;
- cs_sel = (u16)(msr_data & 0xfffc);
- ss_sel = (u16)(msr_data + 8);
-
- if (efer & EFER_LMA) {
- cs.d = 0;
- cs.l = 1;
- }
- ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
- ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
-
- ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip;
- if (efer & EFER_LMA) {
-#ifdef CONFIG_X86_64
- ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
-
- ops->get_msr(ctxt,
- ctxt->mode == X86EMUL_MODE_PROT64 ?
- MSR_LSTAR : MSR_CSTAR, &msr_data);
- ctxt->_eip = msr_data;
-
- ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
- ctxt->eflags &= ~(msr_data | EFLG_RF);
-#endif
- } else {
- /* legacy mode */
- ops->get_msr(ctxt, MSR_STAR, &msr_data);
- ctxt->_eip = (u32)msr_data;
-
- ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
- }
-
- return X86EMUL_CONTINUE;
-}
-
-static int em_sysenter(struct x86_emulate_ctxt *ctxt)
-{
- struct x86_emulate_ops *ops = ctxt->ops;
- struct desc_struct cs, ss;
- u64 msr_data;
- u16 cs_sel, ss_sel;
- u64 efer = 0;
-
- ops->get_msr(ctxt, MSR_EFER, &efer);
- /* inject #GP if in real mode */
- if (ctxt->mode == X86EMUL_MODE_REAL)
- return emulate_gp(ctxt, 0);
-
- /*
- * Not recognized on AMD in compat mode (but is recognized in legacy
- * mode).
- */
- if ((ctxt->mode == X86EMUL_MODE_PROT32) && (efer & EFER_LMA)
- && !vendor_intel(ctxt))
- return emulate_ud(ctxt);
-
- /* XXX sysenter/sysexit have not been tested in 64bit mode.
- * Therefore, we inject an #UD.
- */
- if (ctxt->mode == X86EMUL_MODE_PROT64)
- return emulate_ud(ctxt);
-
- setup_syscalls_segments(ctxt, &cs, &ss);
-
- ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
- switch (ctxt->mode) {
- case X86EMUL_MODE_PROT32:
- if ((msr_data & 0xfffc) == 0x0)
- return emulate_gp(ctxt, 0);
- break;
- case X86EMUL_MODE_PROT64:
- if (msr_data == 0x0)
- return emulate_gp(ctxt, 0);
- break;
- }
-
- ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
- cs_sel = (u16)msr_data;
- cs_sel &= ~SELECTOR_RPL_MASK;
- ss_sel = cs_sel + 8;
- ss_sel &= ~SELECTOR_RPL_MASK;
- if (ctxt->mode == X86EMUL_MODE_PROT64 || (efer & EFER_LMA)) {
- cs.d = 0;
- cs.l = 1;
- }
-
- ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
- ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
-
- ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
- ctxt->_eip = msr_data;
-
- ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
- ctxt->regs[VCPU_REGS_RSP] = msr_data;
-
- return X86EMUL_CONTINUE;
-}
-
-static int em_sysexit(struct x86_emulate_ctxt *ctxt)
-{
- struct x86_emulate_ops *ops = ctxt->ops;
- struct desc_struct cs, ss;
- u64 msr_data;
- int usermode;
- u16 cs_sel = 0, ss_sel = 0;
-
- /* inject #GP if in real mode or Virtual 8086 mode */
- if (ctxt->mode == X86EMUL_MODE_REAL ||
- ctxt->mode == X86EMUL_MODE_VM86)
- return emulate_gp(ctxt, 0);
-
- setup_syscalls_segments(ctxt, &cs, &ss);
-
- if ((ctxt->rex_prefix & 0x8) != 0x0)
- usermode = X86EMUL_MODE_PROT64;
- else
- usermode = X86EMUL_MODE_PROT32;
-
- cs.dpl = 3;
- ss.dpl = 3;
- ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
- switch (usermode) {
- case X86EMUL_MODE_PROT32:
- cs_sel = (u16)(msr_data + 16);
- if ((msr_data & 0xfffc) == 0x0)
- return emulate_gp(ctxt, 0);
- ss_sel = (u16)(msr_data + 24);
- break;
- case X86EMUL_MODE_PROT64:
- cs_sel = (u16)(msr_data + 32);
- if (msr_data == 0x0)
- return emulate_gp(ctxt, 0);
- ss_sel = cs_sel + 8;
- cs.d = 0;
- cs.l = 1;
- break;
- }
- cs_sel |= SELECTOR_RPL_MASK;
- ss_sel |= SELECTOR_RPL_MASK;
-
- ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
- ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
-
- ctxt->_eip = ctxt->regs[VCPU_REGS_RDX];
- ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX];
-
- return X86EMUL_CONTINUE;
-}
-
-static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
-{
- int iopl;
- if (ctxt->mode == X86EMUL_MODE_REAL)
- return false;
- if (ctxt->mode == X86EMUL_MODE_VM86)
- return true;
- iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
- return ctxt->ops->cpl(ctxt) > iopl;
-}
-
-static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
- u16 port, u16 len)
-{
- struct x86_emulate_ops *ops = ctxt->ops;
- struct desc_struct tr_seg;
- u32 base3;
- int r;
- u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7;
- unsigned mask = (1 << len) - 1;
- unsigned long base;
-
- ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR);
- if (!tr_seg.p)
- return false;
- if (desc_limit_scaled(&tr_seg) < 103)
- return false;
- base = get_desc_base(&tr_seg);
-#ifdef CONFIG_X86_64
- base |= ((u64)base3) << 32;
-#endif
- r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL);
- if (r != X86EMUL_CONTINUE)
- return false;
- if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
- return false;
- r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL);
- if (r != X86EMUL_CONTINUE)
- return false;
- if ((perm >> bit_idx) & mask)
- return false;
- return true;
-}
-
-static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
- u16 port, u16 len)
-{
- if (ctxt->perm_ok)
- return true;
-
- if (emulator_bad_iopl(ctxt))
- if (!emulator_io_port_access_allowed(ctxt, port, len))
- return false;
-
- ctxt->perm_ok = true;
-
- return true;
-}
-
-static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
- struct tss_segment_16 *tss)
-{
- tss->ip = ctxt->_eip;
- tss->flag = ctxt->eflags;
- tss->ax = ctxt->regs[VCPU_REGS_RAX];
- tss->cx = ctxt->regs[VCPU_REGS_RCX];
- tss->dx = ctxt->regs[VCPU_REGS_RDX];
- tss->bx = ctxt->regs[VCPU_REGS_RBX];
- tss->sp = ctxt->regs[VCPU_REGS_RSP];
- tss->bp = ctxt->regs[VCPU_REGS_RBP];
- tss->si = ctxt->regs[VCPU_REGS_RSI];
- tss->di = ctxt->regs[VCPU_REGS_RDI];
-
- tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
- tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
- tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
- tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
- tss->ldt = get_segment_selector(ctxt, VCPU_SREG_LDTR);
-}
-
-static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
- struct tss_segment_16 *tss)
-{
- int ret;
-
- ctxt->_eip = tss->ip;
- ctxt->eflags = tss->flag | 2;
- ctxt->regs[VCPU_REGS_RAX] = tss->ax;
- ctxt->regs[VCPU_REGS_RCX] = tss->cx;
- ctxt->regs[VCPU_REGS_RDX] = tss->dx;
- ctxt->regs[VCPU_REGS_RBX] = tss->bx;
- ctxt->regs[VCPU_REGS_RSP] = tss->sp;
- ctxt->regs[VCPU_REGS_RBP] = tss->bp;
- ctxt->regs[VCPU_REGS_RSI] = tss->si;
- ctxt->regs[VCPU_REGS_RDI] = tss->di;
-
- /*
- * SDM says that segment selectors are loaded before segment
- * descriptors
- */
- set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR);
- set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
- set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
- set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
- set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
-
- /*
- * Now load segment descriptors. If fault happenes at this stage
- * it is handled in a context of new task
- */
- ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
- if (ret != X86EMUL_CONTINUE)
- return ret;
-
- return X86EMUL_CONTINUE;
-}
-
-static int task_switch_16(struct x86_emulate_ctxt *ctxt,
- u16 tss_selector, u16 old_tss_sel,
- ulong old_tss_base, struct desc_struct *new_desc)
-{
- struct x86_emulate_ops *ops = ctxt->ops;
- struct tss_segment_16 tss_seg;
- int ret;
- u32 new_tss_base = get_desc_base(new_desc);
-
- ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
- &ctxt->exception);
- if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
- return ret;
-
- save_state_to_tss16(ctxt, &tss_seg);
-
- ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
- &ctxt->exception);
- if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
- return ret;
-
- ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
- &ctxt->exception);
- if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
- return ret;
-
- if (old_tss_sel != 0xffff) {
- tss_seg.prev_task_link = old_tss_sel;
-
- ret = ops->write_std(ctxt, new_tss_base,
- &tss_seg.prev_task_link,
- sizeof tss_seg.prev_task_link,
- &ctxt->exception);
- if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
- return ret;
- }
-
- return load_state_from_tss16(ctxt, &tss_seg);
-}
-
-static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
- struct tss_segment_32 *tss)
-{
- tss->cr3 = ctxt->ops->get_cr(ctxt, 3);
- tss->eip = ctxt->_eip;
- tss->eflags = ctxt->eflags;
- tss->eax = ctxt->regs[VCPU_REGS_RAX];
- tss->ecx = ctxt->regs[VCPU_REGS_RCX];
- tss->edx = ctxt->regs[VCPU_REGS_RDX];
- tss->ebx = ctxt->regs[VCPU_REGS_RBX];
- tss->esp = ctxt->regs[VCPU_REGS_RSP];
- tss->ebp = ctxt->regs[VCPU_REGS_RBP];
- tss->esi = ctxt->regs[VCPU_REGS_RSI];
- tss->edi = ctxt->regs[VCPU_REGS_RDI];
-
- tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
- tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
- tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
- tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
- tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS);
- tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS);
- tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR);
-}
-
-static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
- struct tss_segment_32 *tss)
-{
- int ret;
-
- if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
- return emulate_gp(ctxt, 0);
- ctxt->_eip = tss->eip;
- ctxt->eflags = tss->eflags | 2;
-
- /* General purpose registers */
- ctxt->regs[VCPU_REGS_RAX] = tss->eax;
- ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
- ctxt->regs[VCPU_REGS_RDX] = tss->edx;
- ctxt->regs[VCPU_REGS_RBX] = tss->ebx;
- ctxt->regs[VCPU_REGS_RSP] = tss->esp;
- ctxt->regs[VCPU_REGS_RBP] = tss->ebp;
- ctxt->regs[VCPU_REGS_RSI] = tss->esi;
- ctxt->regs[VCPU_REGS_RDI] = tss->edi;
-
- /*
- * SDM says that segment selectors are loaded before segment
- * descriptors
- */
- set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
- set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
- set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
- set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
- set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
- set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS);
- set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
-
- /*
- * If we're switching between Protected Mode and VM86, we need to make
- * sure to update the mode before loading the segment descriptors so
- * that the selectors are interpreted correctly.
- *
- * Need to get rflags to the vcpu struct immediately because it
- * influences the CPL which is checked at least when loading the segment
- * descriptors and when pushing an error code to the new kernel stack.
- *
- * TODO Introduce a separate ctxt->ops->set_cpl callback
- */
- if (ctxt->eflags & X86_EFLAGS_VM)
- ctxt->mode = X86EMUL_MODE_VM86;
- else
- ctxt->mode = X86EMUL_MODE_PROT32;
-
- ctxt->ops->set_rflags(ctxt, ctxt->eflags);
-
- /*
- * Now load segment descriptors. If fault happenes at this stage
- * it is handled in a context of new task
- */
- ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS);
- if (ret != X86EMUL_CONTINUE)
- return ret;
-
- return X86EMUL_CONTINUE;
-}
-
-static int task_switch_32(struct x86_emulate_ctxt *ctxt,
- u16 tss_selector, u16 old_tss_sel,
- ulong old_tss_base, struct desc_struct *new_desc)
-{
- struct x86_emulate_ops *ops = ctxt->ops;
- struct tss_segment_32 tss_seg;
- int ret;
- u32 new_tss_base = get_desc_base(new_desc);
-
- ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
- &ctxt->exception);
- if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
- return ret;
-
- save_state_to_tss32(ctxt, &tss_seg);
-
- ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
- &ctxt->exception);
- if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
- return ret;
-
- ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
- &ctxt->exception);
- if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
- return ret;
-
- if (old_tss_sel != 0xffff) {
- tss_seg.prev_task_link = old_tss_sel;
-
- ret = ops->write_std(ctxt, new_tss_base,
- &tss_seg.prev_task_link,
- sizeof tss_seg.prev_task_link,
- &ctxt->exception);
- if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
- return ret;
- }
-
- return load_state_from_tss32(ctxt, &tss_seg);
-}
-
-static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
- u16 tss_selector, int idt_index, int reason,
- bool has_error_code, u32 error_code)
-{
- struct x86_emulate_ops *ops = ctxt->ops;
- struct desc_struct curr_tss_desc, next_tss_desc;
- int ret;
- u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
- ulong old_tss_base =
- ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
- u32 desc_limit;
-
- /* FIXME: old_tss_base == ~0 ? */
-
- ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
- if (ret != X86EMUL_CONTINUE)
- return ret;
-
- /* FIXME: check that next_tss_desc is tss */
-
- /*
- * Check privileges. The three cases are task switch caused by...
- *
- * 1. jmp/call/int to task gate: Check against DPL of the task gate
- * 2. Exception/IRQ/iret: No check is performed
- * 3. jmp/call to TSS: Check agains DPL of the TSS
- */
- if (reason == TASK_SWITCH_GATE) {
- if (idt_index != -1) {
- /* Software interrupts */
- struct desc_struct task_gate_desc;
- int dpl;
-
- ret = read_interrupt_descriptor(ctxt, idt_index,
- &task_gate_desc);
- if (ret != X86EMUL_CONTINUE)
- return ret;
-
- dpl = task_gate_desc.dpl;
- if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
- return emulate_gp(ctxt, (idt_index << 3) | 0x2);
- }
- } else if (reason != TASK_SWITCH_IRET) {
- int dpl = next_tss_desc.dpl;
- if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
- return emulate_gp(ctxt, tss_selector);
- }
-
-
- desc_limit = desc_limit_scaled(&next_tss_desc);
- if (!next_tss_desc.p ||
- ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
- desc_limit < 0x2b)) {
- emulate_ts(ctxt, tss_selector & 0xfffc);
- return X86EMUL_PROPAGATE_FAULT;
- }
-
- if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
- curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
- write_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
- }
-
- if (reason == TASK_SWITCH_IRET)
- ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
-
- /* set back link to prev task only if NT bit is set in eflags
- note that old_tss_sel is not used afetr this point */
- if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
- old_tss_sel = 0xffff;
-
- if (next_tss_desc.type & 8)
- ret = task_switch_32(ctxt, tss_selector, old_tss_sel,
- old_tss_base, &next_tss_desc);
- else
- ret = task_switch_16(ctxt, tss_selector, old_tss_sel,
- old_tss_base, &next_tss_desc);
- if (ret != X86EMUL_CONTINUE)
- return ret;
-
- if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
- ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT;
-
- if (reason != TASK_SWITCH_IRET) {
- next_tss_desc.type |= (1 << 1); /* set busy flag */
- write_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
- }
-
- ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS);
- ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
-
- if (has_error_code) {
- ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
- ctxt->lock_prefix = 0;
- ctxt->src.val = (unsigned long) error_code;
- ret = em_push(ctxt);
- }
-
- return ret;
-}
-
-int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
- u16 tss_selector, int idt_index, int reason,
- bool has_error_code, u32 error_code)
-{
- int rc;
-
- ctxt->_eip = ctxt->eip;
- ctxt->dst.type = OP_NONE;
-
- rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason,
- has_error_code, error_code);
-
- if (rc == X86EMUL_CONTINUE)
- ctxt->eip = ctxt->_eip;
-
- return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
-}
-
-static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
- int reg, struct operand *op)
-{
- int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
-
- register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes);
- op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]);
- op->addr.mem.seg = seg;
-}
-
-static int em_das(struct x86_emulate_ctxt *ctxt)
-{
- u8 al, old_al;
- bool af, cf, old_cf;
-
- cf = ctxt->eflags & X86_EFLAGS_CF;
- al = ctxt->dst.val;
-
- old_al = al;
- old_cf = cf;
- cf = false;
- af = ctxt->eflags & X86_EFLAGS_AF;
- if ((al & 0x0f) > 9 || af) {
- al -= 6;
- cf = old_cf | (al >= 250);
- af = true;
- } else {
- af = false;
- }
- if (old_al > 0x99 || old_cf) {
- al -= 0x60;
- cf = true;
- }
-
- ctxt->dst.val = al;
- /* Set PF, ZF, SF */
- ctxt->src.type = OP_IMM;
- ctxt->src.val = 0;
- ctxt->src.bytes = 1;
- emulate_2op_SrcV(ctxt, "or");
- ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
- if (cf)
- ctxt->eflags |= X86_EFLAGS_CF;
- if (af)
- ctxt->eflags |= X86_EFLAGS_AF;
- return X86EMUL_CONTINUE;
-}
-
-static int em_call(struct x86_emulate_ctxt *ctxt)
-{
- long rel = ctxt->src.val;
-
- ctxt->src.val = (unsigned long)ctxt->_eip;
- jmp_rel(ctxt, rel);
- return em_push(ctxt);
-}
-
-static int em_call_far(struct x86_emulate_ctxt *ctxt)
-{
- u16 sel, old_cs;
- ulong old_eip;
- int rc;
-
- old_cs = get_segment_selector(ctxt, VCPU_SREG_CS);
- old_eip = ctxt->_eip;
-
- memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
- if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS))
- return X86EMUL_CONTINUE;
-
- ctxt->_eip = 0;
- memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes);
-
- ctxt->src.val = old_cs;
- rc = em_push(ctxt);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- ctxt->src.val = old_eip;
- return em_push(ctxt);
-}
-
-static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
-{
- int rc;
-
- ctxt->dst.type = OP_REG;
- ctxt->dst.addr.reg = &ctxt->_eip;
- ctxt->dst.bytes = ctxt->op_bytes;
- rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val);
- return X86EMUL_CONTINUE;
-}
-
-static int em_add(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "add");
- return X86EMUL_CONTINUE;
-}
-
-static int em_or(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "or");
- return X86EMUL_CONTINUE;
-}
-
-static int em_adc(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "adc");
- return X86EMUL_CONTINUE;
-}
-
-static int em_sbb(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "sbb");
- return X86EMUL_CONTINUE;
-}
-
-static int em_and(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "and");
- return X86EMUL_CONTINUE;
-}
-
-static int em_sub(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "sub");
- return X86EMUL_CONTINUE;
-}
-
-static int em_xor(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "xor");
- return X86EMUL_CONTINUE;
-}
-
-static int em_cmp(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "cmp");
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- return X86EMUL_CONTINUE;
-}
-
-static int em_test(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "test");
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- return X86EMUL_CONTINUE;
-}
-
-static int em_xchg(struct x86_emulate_ctxt *ctxt)
-{
- /* Write back the register source. */
- ctxt->src.val = ctxt->dst.val;
- write_register_operand(&ctxt->src);
-
- /* Write back the memory destination with implicit LOCK prefix. */
- ctxt->dst.val = ctxt->src.orig_val;
- ctxt->lock_prefix = 1;
- return X86EMUL_CONTINUE;
-}
-
-static int em_imul(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV_nobyte(ctxt, "imul");
- return X86EMUL_CONTINUE;
-}
-
-static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
-{
- ctxt->dst.val = ctxt->src2.val;
- return em_imul(ctxt);
-}
-
-static int em_cwd(struct x86_emulate_ctxt *ctxt)
-{
- ctxt->dst.type = OP_REG;
- ctxt->dst.bytes = ctxt->src.bytes;
- ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
- ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1);
-
- return X86EMUL_CONTINUE;
-}
-
-static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
-{
- u64 tsc = 0;
-
- ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
- ctxt->regs[VCPU_REGS_RAX] = (u32)tsc;
- ctxt->regs[VCPU_REGS_RDX] = tsc >> 32;
- return X86EMUL_CONTINUE;
-}
-
-static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
-{
- u64 pmc;
-
- if (ctxt->ops->read_pmc(ctxt, ctxt->regs[VCPU_REGS_RCX], &pmc))
- return emulate_gp(ctxt, 0);
- ctxt->regs[VCPU_REGS_RAX] = (u32)pmc;
- ctxt->regs[VCPU_REGS_RDX] = pmc >> 32;
- return X86EMUL_CONTINUE;
-}
-
-static int em_mov(struct x86_emulate_ctxt *ctxt)
-{
- ctxt->dst.val = ctxt->src.val;
- return X86EMUL_CONTINUE;
-}
-
-static int em_cr_write(struct x86_emulate_ctxt *ctxt)
-{
- if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
- return emulate_gp(ctxt, 0);
-
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- return X86EMUL_CONTINUE;
-}
-
-static int em_dr_write(struct x86_emulate_ctxt *ctxt)
-{
- unsigned long val;
-
- if (ctxt->mode == X86EMUL_MODE_PROT64)
- val = ctxt->src.val & ~0ULL;
- else
- val = ctxt->src.val & ~0U;
-
- /* #UD condition is already handled. */
- if (ctxt->ops->set_dr(ctxt, ctxt->modrm_reg, val) < 0)
- return emulate_gp(ctxt, 0);
-
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- return X86EMUL_CONTINUE;
-}
-
-static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
-{
- u64 msr_data;
-
- msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
- | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
- if (ctxt->ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data))
- return emulate_gp(ctxt, 0);
-
- return X86EMUL_CONTINUE;
-}
-
-static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
-{
- u64 msr_data;
-
- if (ctxt->ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data))
- return emulate_gp(ctxt, 0);
-
- ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
- ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
- return X86EMUL_CONTINUE;
-}
-
-static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
-{
- if (ctxt->modrm_reg > VCPU_SREG_GS)
- return emulate_ud(ctxt);
-
- ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg);
- return X86EMUL_CONTINUE;
-}
-
-static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
-{
- u16 sel = ctxt->src.val;
-
- if (ctxt->modrm_reg == VCPU_SREG_CS || ctxt->modrm_reg > VCPU_SREG_GS)
- return emulate_ud(ctxt);
-
- if (ctxt->modrm_reg == VCPU_SREG_SS)
- ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
-
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
-}
-
-static int em_movdqu(struct x86_emulate_ctxt *ctxt)
-{
- memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes);
- return X86EMUL_CONTINUE;
-}
-
-static int em_invlpg(struct x86_emulate_ctxt *ctxt)
-{
- int rc;
- ulong linear;
-
- rc = linearize(ctxt, ctxt->src.addr.mem, 1, false, &linear);
- if (rc == X86EMUL_CONTINUE)
- ctxt->ops->invlpg(ctxt, linear);
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- return X86EMUL_CONTINUE;
-}
-
-static int em_clts(struct x86_emulate_ctxt *ctxt)
-{
- ulong cr0;
-
- cr0 = ctxt->ops->get_cr(ctxt, 0);
- cr0 &= ~X86_CR0_TS;
- ctxt->ops->set_cr(ctxt, 0, cr0);
- return X86EMUL_CONTINUE;
-}
-
-static int em_vmcall(struct x86_emulate_ctxt *ctxt)
-{
- int rc;
-
- if (ctxt->modrm_mod != 3 || ctxt->modrm_rm != 1)
- return X86EMUL_UNHANDLEABLE;
-
- rc = ctxt->ops->fix_hypercall(ctxt);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- /* Let the processor re-execute the fixed hypercall */
- ctxt->_eip = ctxt->eip;
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- return X86EMUL_CONTINUE;
-}
-
-static int em_lgdt(struct x86_emulate_ctxt *ctxt)
-{
- struct desc_ptr desc_ptr;
- int rc;
-
- rc = read_descriptor(ctxt, ctxt->src.addr.mem,
- &desc_ptr.size, &desc_ptr.address,
- ctxt->op_bytes);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- ctxt->ops->set_gdt(ctxt, &desc_ptr);
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- return X86EMUL_CONTINUE;
-}
-
-static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
-{
- int rc;
-
- rc = ctxt->ops->fix_hypercall(ctxt);
-
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- return rc;
-}
-
-static int em_lidt(struct x86_emulate_ctxt *ctxt)
-{
- struct desc_ptr desc_ptr;
- int rc;
-
- rc = read_descriptor(ctxt, ctxt->src.addr.mem,
- &desc_ptr.size, &desc_ptr.address,
- ctxt->op_bytes);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- ctxt->ops->set_idt(ctxt, &desc_ptr);
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- return X86EMUL_CONTINUE;
-}
-
-static int em_smsw(struct x86_emulate_ctxt *ctxt)
-{
- ctxt->dst.bytes = 2;
- ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
- return X86EMUL_CONTINUE;
-}
-
-static int em_lmsw(struct x86_emulate_ctxt *ctxt)
-{
- ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
- | (ctxt->src.val & 0x0f));
- ctxt->dst.type = OP_NONE;
- return X86EMUL_CONTINUE;
-}
-
-static int em_loop(struct x86_emulate_ctxt *ctxt)
-{
- register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
- if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) &&
- (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
- jmp_rel(ctxt, ctxt->src.val);
-
- return X86EMUL_CONTINUE;
-}
-
-static int em_jcxz(struct x86_emulate_ctxt *ctxt)
-{
- if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0)
- jmp_rel(ctxt, ctxt->src.val);
-
- return X86EMUL_CONTINUE;
-}
-
-static int em_in(struct x86_emulate_ctxt *ctxt)
-{
- if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
- &ctxt->dst.val))
- return X86EMUL_IO_NEEDED;
-
- return X86EMUL_CONTINUE;
-}
-
-static int em_out(struct x86_emulate_ctxt *ctxt)
-{
- ctxt->ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
- &ctxt->src.val, 1);
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- return X86EMUL_CONTINUE;
-}
-
-static int em_cli(struct x86_emulate_ctxt *ctxt)
-{
- if (emulator_bad_iopl(ctxt))
- return emulate_gp(ctxt, 0);
-
- ctxt->eflags &= ~X86_EFLAGS_IF;
- return X86EMUL_CONTINUE;
-}
-
-static int em_sti(struct x86_emulate_ctxt *ctxt)
-{
- if (emulator_bad_iopl(ctxt))
- return emulate_gp(ctxt, 0);
-
- ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
- ctxt->eflags |= X86_EFLAGS_IF;
- return X86EMUL_CONTINUE;
-}
-
-static int em_bt(struct x86_emulate_ctxt *ctxt)
-{
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- /* only subword offset */
- ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
-
- emulate_2op_SrcV_nobyte(ctxt, "bt");
- return X86EMUL_CONTINUE;
-}
-
-static int em_bts(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV_nobyte(ctxt, "bts");
- return X86EMUL_CONTINUE;
-}
-
-static int em_btr(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV_nobyte(ctxt, "btr");
- return X86EMUL_CONTINUE;
-}
-
-static int em_btc(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV_nobyte(ctxt, "btc");
- return X86EMUL_CONTINUE;
-}
-
-static int em_bsf(struct x86_emulate_ctxt *ctxt)
-{
- u8 zf;
-
- __asm__ ("bsf %2, %0; setz %1"
- : "=r"(ctxt->dst.val), "=q"(zf)
- : "r"(ctxt->src.val));
-
- ctxt->eflags &= ~X86_EFLAGS_ZF;
- if (zf) {
- ctxt->eflags |= X86_EFLAGS_ZF;
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- }
- return X86EMUL_CONTINUE;
-}
-
-static int em_bsr(struct x86_emulate_ctxt *ctxt)
-{
- u8 zf;
-
- __asm__ ("bsr %2, %0; setz %1"
- : "=r"(ctxt->dst.val), "=q"(zf)
- : "r"(ctxt->src.val));
-
- ctxt->eflags &= ~X86_EFLAGS_ZF;
- if (zf) {
- ctxt->eflags |= X86_EFLAGS_ZF;
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- }
- return X86EMUL_CONTINUE;
-}
-
-static bool valid_cr(int nr)
-{
- switch (nr) {
- case 0:
- case 2 ... 4:
- case 8:
- return true;
- default:
- return false;
- }
-}
-
-static int check_cr_read(struct x86_emulate_ctxt *ctxt)
-{
- if (!valid_cr(ctxt->modrm_reg))
- return emulate_ud(ctxt);
-
- return X86EMUL_CONTINUE;
-}
-
-static int check_cr_write(struct x86_emulate_ctxt *ctxt)
-{
- u64 new_val = ctxt->src.val64;
- int cr = ctxt->modrm_reg;
- u64 efer = 0;
-
- static u64 cr_reserved_bits[] = {
- 0xffffffff00000000ULL,
- 0, 0, 0, /* CR3 checked later */
- CR4_RESERVED_BITS,
- 0, 0, 0,
- CR8_RESERVED_BITS,
- };
-
- if (!valid_cr(cr))
- return emulate_ud(ctxt);
-
- if (new_val & cr_reserved_bits[cr])
- return emulate_gp(ctxt, 0);
-
- switch (cr) {
- case 0: {
- u64 cr4;
- if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) ||
- ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD)))
- return emulate_gp(ctxt, 0);
-
- cr4 = ctxt->ops->get_cr(ctxt, 4);
- ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
-
- if ((new_val & X86_CR0_PG) && (efer & EFER_LME) &&
- !(cr4 & X86_CR4_PAE))
- return emulate_gp(ctxt, 0);
-
- break;
- }
- case 3: {
- u64 rsvd = 0;
-
- ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
- if (efer & EFER_LMA)
- rsvd = CR3_L_MODE_RESERVED_BITS;
- else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE)
- rsvd = CR3_PAE_RESERVED_BITS;
- else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG)
- rsvd = CR3_NONPAE_RESERVED_BITS;
-
- if (new_val & rsvd)
- return emulate_gp(ctxt, 0);
-
- break;
- }
- case 4: {
- ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
-
- if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE))
- return emulate_gp(ctxt, 0);
-
- break;
- }
- }
-
- return X86EMUL_CONTINUE;
-}
-
-static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
-{
- unsigned long dr7;
-
- ctxt->ops->get_dr(ctxt, 7, &dr7);
-
- /* Check if DR7.Global_Enable is set */
- return dr7 & (1 << 13);
-}
-
-static int check_dr_read(struct x86_emulate_ctxt *ctxt)
-{
- int dr = ctxt->modrm_reg;
- u64 cr4;
-
- if (dr > 7)
- return emulate_ud(ctxt);
-
- cr4 = ctxt->ops->get_cr(ctxt, 4);
- if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
- return emulate_ud(ctxt);
-
- if (check_dr7_gd(ctxt))
- return emulate_db(ctxt);
-
- return X86EMUL_CONTINUE;
-}
-
-static int check_dr_write(struct x86_emulate_ctxt *ctxt)
-{
- u64 new_val = ctxt->src.val64;
- int dr = ctxt->modrm_reg;
-
- if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL))
- return emulate_gp(ctxt, 0);
-
- return check_dr_read(ctxt);
-}
-
-static int check_svme(struct x86_emulate_ctxt *ctxt)
-{
- u64 efer;
-
- ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
-
- if (!(efer & EFER_SVME))
- return emulate_ud(ctxt);
-
- return X86EMUL_CONTINUE;
-}
-
-static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
-{
- u64 rax = ctxt->regs[VCPU_REGS_RAX];
-
- /* Valid physical address? */
- if (rax & 0xffff000000000000ULL)
- return emulate_gp(ctxt, 0);
-
- return check_svme(ctxt);
-}
-
-static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
-{
- u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
-
- if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt))
- return emulate_ud(ctxt);
-
- return X86EMUL_CONTINUE;
-}
-
-static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
-{
- u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
- u64 rcx = ctxt->regs[VCPU_REGS_RCX];
-
- if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
- (rcx > 3))
- return emulate_gp(ctxt, 0);
-
- return X86EMUL_CONTINUE;
-}
-
-static int check_perm_in(struct x86_emulate_ctxt *ctxt)
-{
- ctxt->dst.bytes = min(ctxt->dst.bytes, 4u);
- if (!emulator_io_permited(ctxt, ctxt->src.val, ctxt->dst.bytes))
- return emulate_gp(ctxt, 0);
-
- return X86EMUL_CONTINUE;
-}
-
-static int check_perm_out(struct x86_emulate_ctxt *ctxt)
-{
- ctxt->src.bytes = min(ctxt->src.bytes, 4u);
- if (!emulator_io_permited(ctxt, ctxt->dst.val, ctxt->src.bytes))
- return emulate_gp(ctxt, 0);
-
- return X86EMUL_CONTINUE;
-}
-
-#define D(_y) { .flags = (_y) }
-#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i }
-#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \
- .check_perm = (_p) }
-#define N D(0)
-#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
-#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
-#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) }
-#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
-#define II(_f, _e, _i) \
- { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
-#define IIP(_f, _e, _i, _p) \
- { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \
- .check_perm = (_p) }
-#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) }
-
-#define D2bv(_f) D((_f) | ByteOp), D(_f)
-#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
-#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
-#define I2bvIP(_f, _e, _i, _p) \
- IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p)
-
-#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \
- I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
- I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
-
-static struct opcode group7_rm1[] = {
- DI(SrcNone | ModRM | Priv, monitor),
- DI(SrcNone | ModRM | Priv, mwait),
- N, N, N, N, N, N,
-};
-
-static struct opcode group7_rm3[] = {
- DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa),
- II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall),
- DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa),
- DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa),
- DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme),
- DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme),
- DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme),
- DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme),
-};
-
-static struct opcode group7_rm7[] = {
- N,
- DIP(SrcNone | ModRM, rdtscp, check_rdtsc),
- N, N, N, N, N, N,
-};
-
-static struct opcode group1[] = {
- I(Lock, em_add),
- I(Lock | PageTable, em_or),
- I(Lock, em_adc),
- I(Lock, em_sbb),
- I(Lock | PageTable, em_and),
- I(Lock, em_sub),
- I(Lock, em_xor),
- I(0, em_cmp),
-};
-
-static struct opcode group1A[] = {
- I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N,
-};
-
-static struct opcode group3[] = {
- I(DstMem | SrcImm | ModRM, em_test),
- I(DstMem | SrcImm | ModRM, em_test),
- I(DstMem | SrcNone | ModRM | Lock, em_not),
- I(DstMem | SrcNone | ModRM | Lock, em_neg),
- I(SrcMem | ModRM, em_mul_ex),
- I(SrcMem | ModRM, em_imul_ex),
- I(SrcMem | ModRM, em_div_ex),
- I(SrcMem | ModRM, em_idiv_ex),
-};
-
-static struct opcode group4[] = {
- I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
- I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
- N, N, N, N, N, N,
-};
-
-static struct opcode group5[] = {
- I(DstMem | SrcNone | ModRM | Lock, em_grp45),
- I(DstMem | SrcNone | ModRM | Lock, em_grp45),
- I(SrcMem | ModRM | Stack, em_grp45),
- I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
- I(SrcMem | ModRM | Stack, em_grp45),
- I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45),
- I(SrcMem | ModRM | Stack, em_grp45), N,
-};
-
-static struct opcode group6[] = {
- DI(ModRM | Prot, sldt),
- DI(ModRM | Prot, str),
- DI(ModRM | Prot | Priv, lldt),
- DI(ModRM | Prot | Priv, ltr),
- N, N, N, N,
-};
-
-static struct group_dual group7 = { {
- DI(ModRM | Mov | DstMem | Priv, sgdt),
- DI(ModRM | Mov | DstMem | Priv, sidt),
- II(ModRM | SrcMem | Priv, em_lgdt, lgdt),
- II(ModRM | SrcMem | Priv, em_lidt, lidt),
- II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
- II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw),
- II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
-}, {
- I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall),
- EXT(0, group7_rm1),
- N, EXT(0, group7_rm3),
- II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
- II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7),
-} };
-
-static struct opcode group8[] = {
- N, N, N, N,
- I(DstMem | SrcImmByte | ModRM, em_bt),
- I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts),
- I(DstMem | SrcImmByte | ModRM | Lock, em_btr),
- I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc),
-};
-
-static struct group_dual group9 = { {
- N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
-}, {
- N, N, N, N, N, N, N, N,
-} };
-
-static struct opcode group11[] = {
- I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov),
- X7(D(Undefined)),
-};
-
-static struct gprefix pfx_0f_6f_0f_7f = {
- N, N, N, I(Sse, em_movdqu),
-};
-
-static struct opcode opcode_table[256] = {
- /* 0x00 - 0x07 */
- I6ALU(Lock, em_add),
- I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
- I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
- /* 0x08 - 0x0F */
- I6ALU(Lock | PageTable, em_or),
- I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
- N,
- /* 0x10 - 0x17 */
- I6ALU(Lock, em_adc),
- I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg),
- I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg),
- /* 0x18 - 0x1F */
- I6ALU(Lock, em_sbb),
- I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
- I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
- /* 0x20 - 0x27 */
- I6ALU(Lock | PageTable, em_and), N, N,
- /* 0x28 - 0x2F */
- I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
- /* 0x30 - 0x37 */
- I6ALU(Lock, em_xor), N, N,
- /* 0x38 - 0x3F */
- I6ALU(0, em_cmp), N, N,
- /* 0x40 - 0x4F */
- X16(D(DstReg)),
- /* 0x50 - 0x57 */
- X8(I(SrcReg | Stack, em_push)),
- /* 0x58 - 0x5F */
- X8(I(DstReg | Stack, em_pop)),
- /* 0x60 - 0x67 */
- I(ImplicitOps | Stack | No64, em_pusha),
- I(ImplicitOps | Stack | No64, em_popa),
- N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
- N, N, N, N,
- /* 0x68 - 0x6F */
- I(SrcImm | Mov | Stack, em_push),
- I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
- I(SrcImmByte | Mov | Stack, em_push),
- I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
- I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */
- I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
- /* 0x70 - 0x7F */
- X16(D(SrcImmByte)),
- /* 0x80 - 0x87 */
- G(ByteOp | DstMem | SrcImm | ModRM | Group, group1),
- G(DstMem | SrcImm | ModRM | Group, group1),
- G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
- G(DstMem | SrcImmByte | ModRM | Group, group1),
- I2bv(DstMem | SrcReg | ModRM, em_test),
- I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
- /* 0x88 - 0x8F */
- I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov),
- I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
- I(DstMem | SrcNone | ModRM | Mov | PageTable, em_mov_rm_sreg),
- D(ModRM | SrcMem | NoAccess | DstReg),
- I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm),
- G(0, group1A),
- /* 0x90 - 0x97 */
- DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
- /* 0x98 - 0x9F */
- D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
- I(SrcImmFAddr | No64, em_call_far), N,
- II(ImplicitOps | Stack, em_pushf, pushf),
- II(ImplicitOps | Stack, em_popf, popf), N, N,
- /* 0xA0 - 0xA7 */
- I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
- I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
- I2bv(SrcSI | DstDI | Mov | String, em_mov),
- I2bv(SrcSI | DstDI | String, em_cmp),
- /* 0xA8 - 0xAF */
- I2bv(DstAcc | SrcImm, em_test),
- I2bv(SrcAcc | DstDI | Mov | String, em_mov),
- I2bv(SrcSI | DstAcc | Mov | String, em_mov),
- I2bv(SrcAcc | DstDI | String, em_cmp),
- /* 0xB0 - 0xB7 */
- X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
- /* 0xB8 - 0xBF */
- X8(I(DstReg | SrcImm | Mov, em_mov)),
- /* 0xC0 - 0xC7 */
- D2bv(DstMem | SrcImmByte | ModRM),
- I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
- I(ImplicitOps | Stack, em_ret),
- I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
- I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
- G(ByteOp, group11), G(0, group11),
- /* 0xC8 - 0xCF */
- N, N, N, I(ImplicitOps | Stack, em_ret_far),
- D(ImplicitOps), DI(SrcImmByte, intn),
- D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
- /* 0xD0 - 0xD7 */
- D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
- N, N, N, N,
- /* 0xD8 - 0xDF */
- N, N, N, N, N, N, N, N,
- /* 0xE0 - 0xE7 */
- X3(I(SrcImmByte, em_loop)),
- I(SrcImmByte, em_jcxz),
- I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in),
- I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out),
- /* 0xE8 - 0xEF */
- I(SrcImm | Stack, em_call), D(SrcImm | ImplicitOps),
- I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps),
- I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in),
- I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out),
- /* 0xF0 - 0xF7 */
- N, DI(ImplicitOps, icebp), N, N,
- DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
- G(ByteOp, group3), G(0, group3),
- /* 0xF8 - 0xFF */
- D(ImplicitOps), D(ImplicitOps),
- I(ImplicitOps, em_cli), I(ImplicitOps, em_sti),
- D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
-};
-
-static struct opcode twobyte_table[256] = {
- /* 0x00 - 0x0F */
- G(0, group6), GD(0, &group7), N, N,
- N, I(ImplicitOps | VendorSpecific, em_syscall),
- II(ImplicitOps | Priv, em_clts, clts), N,
- DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
- N, D(ImplicitOps | ModRM), N, N,
- /* 0x10 - 0x1F */
- N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
- /* 0x20 - 0x2F */
- DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read),
- DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read),
- IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
- IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
- N, N, N, N,
- N, N, N, N, N, N, N, N,
- /* 0x30 - 0x3F */
- II(ImplicitOps | Priv, em_wrmsr, wrmsr),
- IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
- II(ImplicitOps | Priv, em_rdmsr, rdmsr),
- IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc),
- I(ImplicitOps | VendorSpecific, em_sysenter),
- I(ImplicitOps | Priv | VendorSpecific, em_sysexit),
- N, N,
- N, N, N, N, N, N, N, N,
- /* 0x40 - 0x4F */
- X16(D(DstReg | SrcMem | ModRM | Mov)),
- /* 0x50 - 0x5F */
- N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
- /* 0x60 - 0x6F */
- N, N, N, N,
- N, N, N, N,
- N, N, N, N,
- N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f),
- /* 0x70 - 0x7F */
- N, N, N, N,
- N, N, N, N,
- N, N, N, N,
- N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
- /* 0x80 - 0x8F */
- X16(D(SrcImm)),
- /* 0x90 - 0x9F */
- X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
- /* 0xA0 - 0xA7 */
- I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
- DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt),
- D(DstMem | SrcReg | Src2ImmByte | ModRM),
- D(DstMem | SrcReg | Src2CL | ModRM), N, N,
- /* 0xA8 - 0xAF */
- I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
- DI(ImplicitOps, rsm),
- I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
- D(DstMem | SrcReg | Src2ImmByte | ModRM),
- D(DstMem | SrcReg | Src2CL | ModRM),
- D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
- /* 0xB0 - 0xB7 */
- I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg),
- I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
- I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
- I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
- I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
- D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
- /* 0xB8 - 0xBF */
- N, N,
- G(BitOp, group8),
- I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
- I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
- D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
- /* 0xC0 - 0xCF */
- D2bv(DstMem | SrcReg | ModRM | Lock),
- N, D(DstMem | SrcReg | ModRM | Mov),
- N, N, N, GD(0, &group9),
- N, N, N, N, N, N, N, N,
- /* 0xD0 - 0xDF */
- N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
- /* 0xE0 - 0xEF */
- N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
- /* 0xF0 - 0xFF */
- N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
-};
-
-#undef D
-#undef N
-#undef G
-#undef GD
-#undef I
-#undef GP
-#undef EXT
-
-#undef D2bv
-#undef D2bvIP
-#undef I2bv
-#undef I2bvIP
-#undef I6ALU
-
-static unsigned imm_size(struct x86_emulate_ctxt *ctxt)
-{
- unsigned size;
-
- size = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
- if (size == 8)
- size = 4;
- return size;
-}
-
-static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
- unsigned size, bool sign_extension)
-{
- int rc = X86EMUL_CONTINUE;
-
- op->type = OP_IMM;
- op->bytes = size;
- op->addr.mem.ea = ctxt->_eip;
- /* NB. Immediates are sign-extended as necessary. */
- switch (op->bytes) {
- case 1:
- op->val = insn_fetch(s8, ctxt);
- break;
- case 2:
- op->val = insn_fetch(s16, ctxt);
- break;
- case 4:
- op->val = insn_fetch(s32, ctxt);
- break;
- }
- if (!sign_extension) {
- switch (op->bytes) {
- case 1:
- op->val &= 0xff;
- break;
- case 2:
- op->val &= 0xffff;
- break;
- case 4:
- op->val &= 0xffffffff;
- break;
- }
- }
-done:
- return rc;
-}
-
-static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
- unsigned d)
-{
- int rc = X86EMUL_CONTINUE;
-
- switch (d) {
- case OpReg:
- decode_register_operand(ctxt, op);
- break;
- case OpImmUByte:
- rc = decode_imm(ctxt, op, 1, false);
- break;
- case OpMem:
- ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
- mem_common:
- *op = ctxt->memop;
- ctxt->memopp = op;
- if ((ctxt->d & BitOp) && op == &ctxt->dst)
- fetch_bit_operand(ctxt);
- op->orig_val = op->val;
- break;
- case OpMem64:
- ctxt->memop.bytes = 8;
- goto mem_common;
- case OpAcc:
- op->type = OP_REG;
- op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
- op->addr.reg = &ctxt->regs[VCPU_REGS_RAX];
- fetch_register_operand(op);
- op->orig_val = op->val;
- break;
- case OpDI:
- op->type = OP_MEM;
- op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
- op->addr.mem.ea =
- register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
- op->addr.mem.seg = VCPU_SREG_ES;
- op->val = 0;
- break;
- case OpDX:
- op->type = OP_REG;
- op->bytes = 2;
- op->addr.reg = &ctxt->regs[VCPU_REGS_RDX];
- fetch_register_operand(op);
- break;
- case OpCL:
- op->bytes = 1;
- op->val = ctxt->regs[VCPU_REGS_RCX] & 0xff;
- break;
- case OpImmByte:
- rc = decode_imm(ctxt, op, 1, true);
- break;
- case OpOne:
- op->bytes = 1;
- op->val = 1;
- break;
- case OpImm:
- rc = decode_imm(ctxt, op, imm_size(ctxt), true);
- break;
- case OpMem8:
- ctxt->memop.bytes = 1;
- goto mem_common;
- case OpMem16:
- ctxt->memop.bytes = 2;
- goto mem_common;
- case OpMem32:
- ctxt->memop.bytes = 4;
- goto mem_common;
- case OpImmU16:
- rc = decode_imm(ctxt, op, 2, false);
- break;
- case OpImmU:
- rc = decode_imm(ctxt, op, imm_size(ctxt), false);
- break;
- case OpSI:
- op->type = OP_MEM;
- op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
- op->addr.mem.ea =
- register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
- op->addr.mem.seg = seg_override(ctxt);
- op->val = 0;
- break;
- case OpImmFAddr:
- op->type = OP_IMM;
- op->addr.mem.ea = ctxt->_eip;
- op->bytes = ctxt->op_bytes + 2;
- insn_fetch_arr(op->valptr, op->bytes, ctxt);
- break;
- case OpMemFAddr:
- ctxt->memop.bytes = ctxt->op_bytes + 2;
- goto mem_common;
- case OpES:
- op->val = VCPU_SREG_ES;
- break;
- case OpCS:
- op->val = VCPU_SREG_CS;
- break;
- case OpSS:
- op->val = VCPU_SREG_SS;
- break;
- case OpDS:
- op->val = VCPU_SREG_DS;
- break;
- case OpFS:
- op->val = VCPU_SREG_FS;
- break;
- case OpGS:
- op->val = VCPU_SREG_GS;
- break;
- case OpImplicit:
- /* Special instructions do their own operand decoding. */
- default:
- op->type = OP_NONE; /* Disable writeback. */
- break;
- }
-
-done:
- return rc;
-}
-
-int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
-{
- int rc = X86EMUL_CONTINUE;
- int mode = ctxt->mode;
- int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
- bool op_prefix = false;
- struct opcode opcode;
-
- ctxt->memop.type = OP_NONE;
- ctxt->memopp = NULL;
- ctxt->_eip = ctxt->eip;
- ctxt->fetch.start = ctxt->_eip;
- ctxt->fetch.end = ctxt->fetch.start + insn_len;
- if (insn_len > 0)
- memcpy(ctxt->fetch.data, insn, insn_len);
-
- switch (mode) {
- case X86EMUL_MODE_REAL:
- case X86EMUL_MODE_VM86:
- case X86EMUL_MODE_PROT16:
- def_op_bytes = def_ad_bytes = 2;
- break;
- case X86EMUL_MODE_PROT32:
- def_op_bytes = def_ad_bytes = 4;
- break;
-#ifdef CONFIG_X86_64
- case X86EMUL_MODE_PROT64:
- def_op_bytes = 4;
- def_ad_bytes = 8;
- break;
-#endif
- default:
- return EMULATION_FAILED;
- }
-
- ctxt->op_bytes = def_op_bytes;
- ctxt->ad_bytes = def_ad_bytes;
-
- /* Legacy prefixes. */
- for (;;) {
- switch (ctxt->b = insn_fetch(u8, ctxt)) {
- case 0x66: /* operand-size override */
- op_prefix = true;
- /* switch between 2/4 bytes */
- ctxt->op_bytes = def_op_bytes ^ 6;
- break;
- case 0x67: /* address-size override */
- if (mode == X86EMUL_MODE_PROT64)
- /* switch between 4/8 bytes */
- ctxt->ad_bytes = def_ad_bytes ^ 12;
- else
- /* switch between 2/4 bytes */
- ctxt->ad_bytes = def_ad_bytes ^ 6;
- break;
- case 0x26: /* ES override */
- case 0x2e: /* CS override */
- case 0x36: /* SS override */
- case 0x3e: /* DS override */
- set_seg_override(ctxt, (ctxt->b >> 3) & 3);
- break;
- case 0x64: /* FS override */
- case 0x65: /* GS override */
- set_seg_override(ctxt, ctxt->b & 7);
- break;
- case 0x40 ... 0x4f: /* REX */
- if (mode != X86EMUL_MODE_PROT64)
- goto done_prefixes;
- ctxt->rex_prefix = ctxt->b;
- continue;
- case 0xf0: /* LOCK */
- ctxt->lock_prefix = 1;
- break;
- case 0xf2: /* REPNE/REPNZ */
- case 0xf3: /* REP/REPE/REPZ */
- ctxt->rep_prefix = ctxt->b;
- break;
- default:
- goto done_prefixes;
- }
-
- /* Any legacy prefix after a REX prefix nullifies its effect. */
-
- ctxt->rex_prefix = 0;
- }
-
-done_prefixes:
-
- /* REX prefix. */
- if (ctxt->rex_prefix & 8)
- ctxt->op_bytes = 8; /* REX.W */
-
- /* Opcode byte(s). */
- opcode = opcode_table[ctxt->b];
- /* Two-byte opcode? */
- if (ctxt->b == 0x0f) {
- ctxt->twobyte = 1;
- ctxt->b = insn_fetch(u8, ctxt);
- opcode = twobyte_table[ctxt->b];
- }
- ctxt->d = opcode.flags;
-
- while (ctxt->d & GroupMask) {
- switch (ctxt->d & GroupMask) {
- case Group:
- ctxt->modrm = insn_fetch(u8, ctxt);
- --ctxt->_eip;
- goffset = (ctxt->modrm >> 3) & 7;
- opcode = opcode.u.group[goffset];
- break;
- case GroupDual:
- ctxt->modrm = insn_fetch(u8, ctxt);
- --ctxt->_eip;
- goffset = (ctxt->modrm >> 3) & 7;
- if ((ctxt->modrm >> 6) == 3)
- opcode = opcode.u.gdual->mod3[goffset];
- else
- opcode = opcode.u.gdual->mod012[goffset];
- break;
- case RMExt:
- goffset = ctxt->modrm & 7;
- opcode = opcode.u.group[goffset];
- break;
- case Prefix:
- if (ctxt->rep_prefix && op_prefix)
- return EMULATION_FAILED;
- simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix;
- switch (simd_prefix) {
- case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
- case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
- case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break;
- case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break;
- }
- break;
- default:
- return EMULATION_FAILED;
- }
-
- ctxt->d &= ~(u64)GroupMask;
- ctxt->d |= opcode.flags;
- }
-
- ctxt->execute = opcode.u.execute;
- ctxt->check_perm = opcode.check_perm;
- ctxt->intercept = opcode.intercept;
-
- /* Unrecognised? */
- if (ctxt->d == 0 || (ctxt->d & Undefined))
- return EMULATION_FAILED;
-
- if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
- return EMULATION_FAILED;
-
- if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
- ctxt->op_bytes = 8;
-
- if (ctxt->d & Op3264) {
- if (mode == X86EMUL_MODE_PROT64)
- ctxt->op_bytes = 8;
- else
- ctxt->op_bytes = 4;
- }
-
- if (ctxt->d & Sse)
- ctxt->op_bytes = 16;
-
- /* ModRM and SIB bytes. */
- if (ctxt->d & ModRM) {
- rc = decode_modrm(ctxt, &ctxt->memop);
- if (!ctxt->has_seg_override)
- set_seg_override(ctxt, ctxt->modrm_seg);
- } else if (ctxt->d & MemAbs)
- rc = decode_abs(ctxt, &ctxt->memop);
- if (rc != X86EMUL_CONTINUE)
- goto done;
-
- if (!ctxt->has_seg_override)
- set_seg_override(ctxt, VCPU_SREG_DS);
-
- ctxt->memop.addr.mem.seg = seg_override(ctxt);
-
- if (ctxt->memop.type == OP_MEM && ctxt->ad_bytes != 8)
- ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea;
-
- /*
- * Decode and fetch the source operand: register, memory
- * or immediate.
- */
- rc = decode_operand(ctxt, &ctxt->src, (ctxt->d >> SrcShift) & OpMask);
- if (rc != X86EMUL_CONTINUE)
- goto done;
-
- /*
- * Decode and fetch the second source operand: register, memory
- * or immediate.
- */
- rc = decode_operand(ctxt, &ctxt->src2, (ctxt->d >> Src2Shift) & OpMask);
- if (rc != X86EMUL_CONTINUE)
- goto done;
-
- /* Decode and fetch the destination operand: register or memory. */
- rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask);
-
-done:
- if (ctxt->memopp && ctxt->memopp->type == OP_MEM && ctxt->rip_relative)
- ctxt->memopp->addr.mem.ea += ctxt->_eip;
-
- return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
-}
-
-bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt)
-{
- return ctxt->d & PageTable;
-}
-
-static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
-{
- /* The second termination condition only applies for REPE
- * and REPNE. Test if the repeat string operation prefix is
- * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
- * corresponding termination condition according to:
- * - if REPE/REPZ and ZF = 0 then done
- * - if REPNE/REPNZ and ZF = 1 then done
- */
- if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) ||
- (ctxt->b == 0xae) || (ctxt->b == 0xaf))
- && (((ctxt->rep_prefix == REPE_PREFIX) &&
- ((ctxt->eflags & EFLG_ZF) == 0))
- || ((ctxt->rep_prefix == REPNE_PREFIX) &&
- ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
- return true;
-
- return false;
-}
-
-int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
-{
- struct x86_emulate_ops *ops = ctxt->ops;
- int rc = X86EMUL_CONTINUE;
- int saved_dst_type = ctxt->dst.type;
-
- ctxt->mem_read.pos = 0;
-
- if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) {
- rc = emulate_ud(ctxt);
- goto done;
- }
-
- /* LOCK prefix is allowed only with some instructions */
- if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) {
- rc = emulate_ud(ctxt);
- goto done;
- }
-
- if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) {
- rc = emulate_ud(ctxt);
- goto done;
- }
-
- if ((ctxt->d & Sse)
- && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)
- || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
- rc = emulate_ud(ctxt);
- goto done;
- }
-
- if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
- rc = emulate_nm(ctxt);
- goto done;
- }
-
- if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
- rc = emulator_check_intercept(ctxt, ctxt->intercept,
- X86_ICPT_PRE_EXCEPT);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- }
-
- /* Privileged instruction can be executed only in CPL=0 */
- if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
- rc = emulate_gp(ctxt, 0);
- goto done;
- }
-
- /* Instruction can only be executed in protected mode */
- if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) {
- rc = emulate_ud(ctxt);
- goto done;
- }
-
- /* Do instruction specific permission checks */
- if (ctxt->check_perm) {
- rc = ctxt->check_perm(ctxt);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- }
-
- if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
- rc = emulator_check_intercept(ctxt, ctxt->intercept,
- X86_ICPT_POST_EXCEPT);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- }
-
- if (ctxt->rep_prefix && (ctxt->d & String)) {
- /* All REP prefixes have the same first termination condition */
- if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) {
- ctxt->eip = ctxt->_eip;
- goto done;
- }
- }
-
- if ((ctxt->src.type == OP_MEM) && !(ctxt->d & NoAccess)) {
- rc = segmented_read(ctxt, ctxt->src.addr.mem,
- ctxt->src.valptr, ctxt->src.bytes);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- ctxt->src.orig_val64 = ctxt->src.val64;
- }
-
- if (ctxt->src2.type == OP_MEM) {
- rc = segmented_read(ctxt, ctxt->src2.addr.mem,
- &ctxt->src2.val, ctxt->src2.bytes);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- }
-
- if ((ctxt->d & DstMask) == ImplicitOps)
- goto special_insn;
-
-
- if ((ctxt->dst.type == OP_MEM) && !(ctxt->d & Mov)) {
- /* optimisation - avoid slow emulated read if Mov */
- rc = segmented_read(ctxt, ctxt->dst.addr.mem,
- &ctxt->dst.val, ctxt->dst.bytes);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- }
- ctxt->dst.orig_val = ctxt->dst.val;
-
-special_insn:
-
- if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
- rc = emulator_check_intercept(ctxt, ctxt->intercept,
- X86_ICPT_POST_MEMACCESS);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- }
-
- if (ctxt->execute) {
- rc = ctxt->execute(ctxt);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- goto writeback;
- }
-
- if (ctxt->twobyte)
- goto twobyte_insn;
-
- switch (ctxt->b) {
- case 0x40 ... 0x47: /* inc r16/r32 */
- emulate_1op(ctxt, "inc");
- break;
- case 0x48 ... 0x4f: /* dec r16/r32 */
- emulate_1op(ctxt, "dec");
- break;
- case 0x63: /* movsxd */
- if (ctxt->mode != X86EMUL_MODE_PROT64)
- goto cannot_emulate;
- ctxt->dst.val = (s32) ctxt->src.val;
- break;
- case 0x70 ... 0x7f: /* jcc (short) */
- if (test_cc(ctxt->b, ctxt->eflags))
- jmp_rel(ctxt, ctxt->src.val);
- break;
- case 0x8d: /* lea r16/r32, m */
- ctxt->dst.val = ctxt->src.addr.mem.ea;
- break;
- case 0x90 ... 0x97: /* nop / xchg reg, rax */
- if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX])
- break;
- rc = em_xchg(ctxt);
- break;
- case 0x98: /* cbw/cwde/cdqe */
- switch (ctxt->op_bytes) {
- case 2: ctxt->dst.val = (s8)ctxt->dst.val; break;
- case 4: ctxt->dst.val = (s16)ctxt->dst.val; break;
- case 8: ctxt->dst.val = (s32)ctxt->dst.val; break;
- }
- break;
- case 0xc0 ... 0xc1:
- rc = em_grp2(ctxt);
- break;
- case 0xcc: /* int3 */
- rc = emulate_int(ctxt, 3);
- break;
- case 0xcd: /* int n */
- rc = emulate_int(ctxt, ctxt->src.val);
- break;
- case 0xce: /* into */
- if (ctxt->eflags & EFLG_OF)
- rc = emulate_int(ctxt, 4);
- break;
- case 0xd0 ... 0xd1: /* Grp2 */
- rc = em_grp2(ctxt);
- break;
- case 0xd2 ... 0xd3: /* Grp2 */
- ctxt->src.val = ctxt->regs[VCPU_REGS_RCX];
- rc = em_grp2(ctxt);
- break;
- case 0xe9: /* jmp rel */
- case 0xeb: /* jmp rel short */
- jmp_rel(ctxt, ctxt->src.val);
- ctxt->dst.type = OP_NONE; /* Disable writeback. */
- break;
- case 0xf4: /* hlt */
- ctxt->ops->halt(ctxt);
- break;
- case 0xf5: /* cmc */
- /* complement carry flag from eflags reg */
- ctxt->eflags ^= EFLG_CF;
- break;
- case 0xf8: /* clc */
- ctxt->eflags &= ~EFLG_CF;
- break;
- case 0xf9: /* stc */
- ctxt->eflags |= EFLG_CF;
- break;
- case 0xfc: /* cld */
- ctxt->eflags &= ~EFLG_DF;
- break;
- case 0xfd: /* std */
- ctxt->eflags |= EFLG_DF;
- break;
- default:
- goto cannot_emulate;
- }
-
- if (rc != X86EMUL_CONTINUE)
- goto done;
-
-writeback:
- rc = writeback(ctxt);
- if (rc != X86EMUL_CONTINUE)
- goto done;
-
- /*
- * restore dst type in case the decoding will be reused
- * (happens for string instruction )
- */
- ctxt->dst.type = saved_dst_type;
-
- if ((ctxt->d & SrcMask) == SrcSI)
- string_addr_inc(ctxt, seg_override(ctxt),
- VCPU_REGS_RSI, &ctxt->src);
-
- if ((ctxt->d & DstMask) == DstDI)
- string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
- &ctxt->dst);
-
- if (ctxt->rep_prefix && (ctxt->d & String)) {
- struct read_cache *r = &ctxt->io_read;
- register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
-
- if (!string_insn_completed(ctxt)) {
- /*
- * Re-enter guest when pio read ahead buffer is empty
- * or, if it is not used, after each 1024 iteration.
- */
- if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) &&
- (r->end == 0 || r->end != r->pos)) {
- /*
- * Reset read cache. Usually happens before
- * decode, but since instruction is restarted
- * we have to do it here.
- */
- ctxt->mem_read.end = 0;
- return EMULATION_RESTART;
- }
- goto done; /* skip rip writeback */
- }
- }
-
- ctxt->eip = ctxt->_eip;
-
-done:
- if (rc == X86EMUL_PROPAGATE_FAULT)
- ctxt->have_exception = true;
- if (rc == X86EMUL_INTERCEPTED)
- return EMULATION_INTERCEPTED;
-
- return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
-
-twobyte_insn:
- switch (ctxt->b) {
- case 0x09: /* wbinvd */
- (ctxt->ops->wbinvd)(ctxt);
- break;
- case 0x08: /* invd */
- case 0x0d: /* GrpP (prefetch) */
- case 0x18: /* Grp16 (prefetch/nop) */
- break;
- case 0x20: /* mov cr, reg */
- ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg);
- break;
- case 0x21: /* mov from dr to reg */
- ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
- break;
- case 0x40 ... 0x4f: /* cmov */
- ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val;
- if (!test_cc(ctxt->b, ctxt->eflags))
- ctxt->dst.type = OP_NONE; /* no writeback */
- break;
- case 0x80 ... 0x8f: /* jnz rel, etc*/
- if (test_cc(ctxt->b, ctxt->eflags))
- jmp_rel(ctxt, ctxt->src.val);
- break;
- case 0x90 ... 0x9f: /* setcc r/m8 */
- ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
- break;
- case 0xa4: /* shld imm8, r, r/m */
- case 0xa5: /* shld cl, r, r/m */
- emulate_2op_cl(ctxt, "shld");
- break;
- case 0xac: /* shrd imm8, r, r/m */
- case 0xad: /* shrd cl, r, r/m */
- emulate_2op_cl(ctxt, "shrd");
- break;
- case 0xae: /* clflush */
- break;
- case 0xb6 ... 0xb7: /* movzx */
- ctxt->dst.bytes = ctxt->op_bytes;
- ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
- : (u16) ctxt->src.val;
- break;
- case 0xbe ... 0xbf: /* movsx */
- ctxt->dst.bytes = ctxt->op_bytes;
- ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val :
- (s16) ctxt->src.val;
- break;
- case 0xc0 ... 0xc1: /* xadd */
- emulate_2op_SrcV(ctxt, "add");
- /* Write back the register source. */
- ctxt->src.val = ctxt->dst.orig_val;
- write_register_operand(&ctxt->src);
- break;
- case 0xc3: /* movnti */
- ctxt->dst.bytes = ctxt->op_bytes;
- ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
- (u64) ctxt->src.val;
- break;
- default:
- goto cannot_emulate;
- }
-
- if (rc != X86EMUL_CONTINUE)
- goto done;
-
- goto writeback;
-
-cannot_emulate:
- return EMULATION_FAILED;
-}
diff --git a/ANDROID_3.4.5/arch/x86/kvm/i8254.c b/ANDROID_3.4.5/arch/x86/kvm/i8254.c
deleted file mode 100644
index d68f99df..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/i8254.c
+++ /dev/null
@@ -1,765 +0,0 @@
-/*
- * 8253/8254 interval timer emulation
- *
- * Copyright (c) 2003-2004 Fabrice Bellard
- * Copyright (c) 2006 Intel Corporation
- * Copyright (c) 2007 Keir Fraser, XenSource Inc
- * Copyright (c) 2008 Intel Corporation
- * Copyright 2009 Red Hat, Inc. and/or its affiliates.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- * Authors:
- * Sheng Yang <sheng.yang@intel.com>
- * Based on QEMU and Xen.
- */
-
-#define pr_fmt(fmt) "pit: " fmt
-
-#include <linux/kvm_host.h>
-#include <linux/slab.h>
-#include <linux/workqueue.h>
-
-#include "irq.h"
-#include "i8254.h"
-
-#ifndef CONFIG_X86_64
-#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
-#else
-#define mod_64(x, y) ((x) % (y))
-#endif
-
-#define RW_STATE_LSB 1
-#define RW_STATE_MSB 2
-#define RW_STATE_WORD0 3
-#define RW_STATE_WORD1 4
-
-/* Compute with 96 bit intermediate result: (a*b)/c */
-static u64 muldiv64(u64 a, u32 b, u32 c)
-{
- union {
- u64 ll;
- struct {
- u32 low, high;
- } l;
- } u, res;
- u64 rl, rh;
-
- u.ll = a;
- rl = (u64)u.l.low * (u64)b;
- rh = (u64)u.l.high * (u64)b;
- rh += (rl >> 32);
- res.l.high = div64_u64(rh, c);
- res.l.low = div64_u64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c);
- return res.ll;
-}
-
-static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
-{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
-
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
- switch (c->mode) {
- default:
- case 0:
- case 4:
- /* XXX: just disable/enable counting */
- break;
- case 1:
- case 2:
- case 3:
- case 5:
- /* Restart counting on rising edge. */
- if (c->gate < val)
- c->count_load_time = ktime_get();
- break;
- }
-
- c->gate = val;
-}
-
-static int pit_get_gate(struct kvm *kvm, int channel)
-{
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
- return kvm->arch.vpit->pit_state.channels[channel].gate;
-}
-
-static s64 __kpit_elapsed(struct kvm *kvm)
-{
- s64 elapsed;
- ktime_t remaining;
- struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
-
- if (!ps->pit_timer.period)
- return 0;
-
- /*
- * The Counter does not stop when it reaches zero. In
- * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
- * the highest count, either FFFF hex for binary counting
- * or 9999 for BCD counting, and continues counting.
- * Modes 2 and 3 are periodic; the Counter reloads
- * itself with the initial count and continues counting
- * from there.
- */
- remaining = hrtimer_get_remaining(&ps->pit_timer.timer);
- elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
- elapsed = mod_64(elapsed, ps->pit_timer.period);
-
- return elapsed;
-}
-
-static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c,
- int channel)
-{
- if (channel == 0)
- return __kpit_elapsed(kvm);
-
- return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
-}
-
-static int pit_get_count(struct kvm *kvm, int channel)
-{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
- s64 d, t;
- int counter;
-
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
- t = kpit_elapsed(kvm, c, channel);
- d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
-
- switch (c->mode) {
- case 0:
- case 1:
- case 4:
- case 5:
- counter = (c->count - d) & 0xffff;
- break;
- case 3:
- /* XXX: may be incorrect for odd counts */
- counter = c->count - (mod_64((2 * d), c->count));
- break;
- default:
- counter = c->count - mod_64(d, c->count);
- break;
- }
- return counter;
-}
-
-static int pit_get_out(struct kvm *kvm, int channel)
-{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
- s64 d, t;
- int out;
-
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
- t = kpit_elapsed(kvm, c, channel);
- d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
-
- switch (c->mode) {
- default:
- case 0:
- out = (d >= c->count);
- break;
- case 1:
- out = (d < c->count);
- break;
- case 2:
- out = ((mod_64(d, c->count) == 0) && (d != 0));
- break;
- case 3:
- out = (mod_64(d, c->count) < ((c->count + 1) >> 1));
- break;
- case 4:
- case 5:
- out = (d == c->count);
- break;
- }
-
- return out;
-}
-
-static void pit_latch_count(struct kvm *kvm, int channel)
-{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
-
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
- if (!c->count_latched) {
- c->latched_count = pit_get_count(kvm, channel);
- c->count_latched = c->rw_mode;
- }
-}
-
-static void pit_latch_status(struct kvm *kvm, int channel)
-{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
-
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
- if (!c->status_latched) {
- /* TODO: Return NULL COUNT (bit 6). */
- c->status = ((pit_get_out(kvm, channel) << 7) |
- (c->rw_mode << 4) |
- (c->mode << 1) |
- c->bcd);
- c->status_latched = 1;
- }
-}
-
-static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
-{
- struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
- irq_ack_notifier);
- int value;
-
- spin_lock(&ps->inject_lock);
- value = atomic_dec_return(&ps->pit_timer.pending);
- if (value < 0)
- /* spurious acks can be generated if, for example, the
- * PIC is being reset. Handle it gracefully here
- */
- atomic_inc(&ps->pit_timer.pending);
- else if (value > 0)
- /* in this case, we had multiple outstanding pit interrupts
- * that we needed to inject. Reinject
- */
- queue_work(ps->pit->wq, &ps->pit->expired);
- ps->irq_ack = 1;
- spin_unlock(&ps->inject_lock);
-}
-
-void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
-{
- struct kvm_pit *pit = vcpu->kvm->arch.vpit;
- struct hrtimer *timer;
-
- if (!kvm_vcpu_is_bsp(vcpu) || !pit)
- return;
-
- timer = &pit->pit_state.pit_timer.timer;
- if (hrtimer_cancel(timer))
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
-}
-
-static void destroy_pit_timer(struct kvm_pit *pit)
-{
- hrtimer_cancel(&pit->pit_state.pit_timer.timer);
- cancel_work_sync(&pit->expired);
-}
-
-static bool kpit_is_periodic(struct kvm_timer *ktimer)
-{
- struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
- pit_timer);
- return ps->is_periodic;
-}
-
-static struct kvm_timer_ops kpit_ops = {
- .is_periodic = kpit_is_periodic,
-};
-
-static void pit_do_work(struct work_struct *work)
-{
- struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
- struct kvm *kvm = pit->kvm;
- struct kvm_vcpu *vcpu;
- int i;
- struct kvm_kpit_state *ps = &pit->pit_state;
- int inject = 0;
-
- /* Try to inject pending interrupts when
- * last one has been acked.
- */
- spin_lock(&ps->inject_lock);
- if (ps->irq_ack) {
- ps->irq_ack = 0;
- inject = 1;
- }
- spin_unlock(&ps->inject_lock);
- if (inject) {
- kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
- kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
-
- /*
- * Provides NMI watchdog support via Virtual Wire mode.
- * The route is: PIT -> PIC -> LVT0 in NMI mode.
- *
- * Note: Our Virtual Wire implementation is simplified, only
- * propagating PIT interrupts to all VCPUs when they have set
- * LVT0 to NMI delivery. Other PIC interrupts are just sent to
- * VCPU0, and only if its LVT0 is in EXTINT mode.
- */
- if (kvm->arch.vapics_in_nmi_mode > 0)
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_apic_nmi_wd_deliver(vcpu);
- }
-}
-
-static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
-{
- struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
- struct kvm_pit *pt = ktimer->kvm->arch.vpit;
-
- if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
- atomic_inc(&ktimer->pending);
- queue_work(pt->wq, &pt->expired);
- }
-
- if (ktimer->t_ops->is_periodic(ktimer)) {
- hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
- return HRTIMER_RESTART;
- } else
- return HRTIMER_NORESTART;
-}
-
-static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
-{
- struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
- struct kvm_timer *pt = &ps->pit_timer;
- s64 interval;
-
- if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
- return;
-
- interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
-
- pr_debug("create pit timer, interval is %llu nsec\n", interval);
-
- /* TODO The new value only affected after the retriggered */
- hrtimer_cancel(&pt->timer);
- cancel_work_sync(&ps->pit->expired);
- pt->period = interval;
- ps->is_periodic = is_period;
-
- pt->timer.function = pit_timer_fn;
- pt->t_ops = &kpit_ops;
- pt->kvm = ps->pit->kvm;
-
- atomic_set(&pt->pending, 0);
- ps->irq_ack = 1;
-
- hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
- HRTIMER_MODE_ABS);
-}
-
-static void pit_load_count(struct kvm *kvm, int channel, u32 val)
-{
- struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
-
- WARN_ON(!mutex_is_locked(&ps->lock));
-
- pr_debug("load_count val is %d, channel is %d\n", val, channel);
-
- /*
- * The largest possible initial count is 0; this is equivalent
- * to 216 for binary counting and 104 for BCD counting.
- */
- if (val == 0)
- val = 0x10000;
-
- ps->channels[channel].count = val;
-
- if (channel != 0) {
- ps->channels[channel].count_load_time = ktime_get();
- return;
- }
-
- /* Two types of timer
- * mode 1 is one shot, mode 2 is period, otherwise del timer */
- switch (ps->channels[0].mode) {
- case 0:
- case 1:
- /* FIXME: enhance mode 4 precision */
- case 4:
- create_pit_timer(kvm, val, 0);
- break;
- case 2:
- case 3:
- create_pit_timer(kvm, val, 1);
- break;
- default:
- destroy_pit_timer(kvm->arch.vpit);
- }
-}
-
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start)
-{
- u8 saved_mode;
- if (hpet_legacy_start) {
- /* save existing mode for later reenablement */
- saved_mode = kvm->arch.vpit->pit_state.channels[0].mode;
- kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */
- pit_load_count(kvm, channel, val);
- kvm->arch.vpit->pit_state.channels[0].mode = saved_mode;
- } else {
- pit_load_count(kvm, channel, val);
- }
-}
-
-static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev)
-{
- return container_of(dev, struct kvm_pit, dev);
-}
-
-static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev)
-{
- return container_of(dev, struct kvm_pit, speaker_dev);
-}
-
-static inline int pit_in_range(gpa_t addr)
-{
- return ((addr >= KVM_PIT_BASE_ADDRESS) &&
- (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
-}
-
-static int pit_ioport_write(struct kvm_io_device *this,
- gpa_t addr, int len, const void *data)
-{
- struct kvm_pit *pit = dev_to_pit(this);
- struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
- int channel, access;
- struct kvm_kpit_channel_state *s;
- u32 val = *(u32 *) data;
- if (!pit_in_range(addr))
- return -EOPNOTSUPP;
-
- val &= 0xff;
- addr &= KVM_PIT_CHANNEL_MASK;
-
- mutex_lock(&pit_state->lock);
-
- if (val != 0)
- pr_debug("write addr is 0x%x, len is %d, val is 0x%x\n",
- (unsigned int)addr, len, val);
-
- if (addr == 3) {
- channel = val >> 6;
- if (channel == 3) {
- /* Read-Back Command. */
- for (channel = 0; channel < 3; channel++) {
- s = &pit_state->channels[channel];
- if (val & (2 << channel)) {
- if (!(val & 0x20))
- pit_latch_count(kvm, channel);
- if (!(val & 0x10))
- pit_latch_status(kvm, channel);
- }
- }
- } else {
- /* Select Counter <channel>. */
- s = &pit_state->channels[channel];
- access = (val >> 4) & KVM_PIT_CHANNEL_MASK;
- if (access == 0) {
- pit_latch_count(kvm, channel);
- } else {
- s->rw_mode = access;
- s->read_state = access;
- s->write_state = access;
- s->mode = (val >> 1) & 7;
- if (s->mode > 5)
- s->mode -= 4;
- s->bcd = val & 1;
- }
- }
- } else {
- /* Write Count. */
- s = &pit_state->channels[addr];
- switch (s->write_state) {
- default:
- case RW_STATE_LSB:
- pit_load_count(kvm, addr, val);
- break;
- case RW_STATE_MSB:
- pit_load_count(kvm, addr, val << 8);
- break;
- case RW_STATE_WORD0:
- s->write_latch = val;
- s->write_state = RW_STATE_WORD1;
- break;
- case RW_STATE_WORD1:
- pit_load_count(kvm, addr, s->write_latch | (val << 8));
- s->write_state = RW_STATE_WORD0;
- break;
- }
- }
-
- mutex_unlock(&pit_state->lock);
- return 0;
-}
-
-static int pit_ioport_read(struct kvm_io_device *this,
- gpa_t addr, int len, void *data)
-{
- struct kvm_pit *pit = dev_to_pit(this);
- struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
- int ret, count;
- struct kvm_kpit_channel_state *s;
- if (!pit_in_range(addr))
- return -EOPNOTSUPP;
-
- addr &= KVM_PIT_CHANNEL_MASK;
- if (addr == 3)
- return 0;
-
- s = &pit_state->channels[addr];
-
- mutex_lock(&pit_state->lock);
-
- if (s->status_latched) {
- s->status_latched = 0;
- ret = s->status;
- } else if (s->count_latched) {
- switch (s->count_latched) {
- default:
- case RW_STATE_LSB:
- ret = s->latched_count & 0xff;
- s->count_latched = 0;
- break;
- case RW_STATE_MSB:
- ret = s->latched_count >> 8;
- s->count_latched = 0;
- break;
- case RW_STATE_WORD0:
- ret = s->latched_count & 0xff;
- s->count_latched = RW_STATE_MSB;
- break;
- }
- } else {
- switch (s->read_state) {
- default:
- case RW_STATE_LSB:
- count = pit_get_count(kvm, addr);
- ret = count & 0xff;
- break;
- case RW_STATE_MSB:
- count = pit_get_count(kvm, addr);
- ret = (count >> 8) & 0xff;
- break;
- case RW_STATE_WORD0:
- count = pit_get_count(kvm, addr);
- ret = count & 0xff;
- s->read_state = RW_STATE_WORD1;
- break;
- case RW_STATE_WORD1:
- count = pit_get_count(kvm, addr);
- ret = (count >> 8) & 0xff;
- s->read_state = RW_STATE_WORD0;
- break;
- }
- }
-
- if (len > sizeof(ret))
- len = sizeof(ret);
- memcpy(data, (char *)&ret, len);
-
- mutex_unlock(&pit_state->lock);
- return 0;
-}
-
-static int speaker_ioport_write(struct kvm_io_device *this,
- gpa_t addr, int len, const void *data)
-{
- struct kvm_pit *pit = speaker_to_pit(this);
- struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
- u32 val = *(u32 *) data;
- if (addr != KVM_SPEAKER_BASE_ADDRESS)
- return -EOPNOTSUPP;
-
- mutex_lock(&pit_state->lock);
- pit_state->speaker_data_on = (val >> 1) & 1;
- pit_set_gate(kvm, 2, val & 1);
- mutex_unlock(&pit_state->lock);
- return 0;
-}
-
-static int speaker_ioport_read(struct kvm_io_device *this,
- gpa_t addr, int len, void *data)
-{
- struct kvm_pit *pit = speaker_to_pit(this);
- struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
- unsigned int refresh_clock;
- int ret;
- if (addr != KVM_SPEAKER_BASE_ADDRESS)
- return -EOPNOTSUPP;
-
- /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */
- refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
-
- mutex_lock(&pit_state->lock);
- ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(kvm, 2) |
- (pit_get_out(kvm, 2) << 5) | (refresh_clock << 4));
- if (len > sizeof(ret))
- len = sizeof(ret);
- memcpy(data, (char *)&ret, len);
- mutex_unlock(&pit_state->lock);
- return 0;
-}
-
-void kvm_pit_reset(struct kvm_pit *pit)
-{
- int i;
- struct kvm_kpit_channel_state *c;
-
- mutex_lock(&pit->pit_state.lock);
- pit->pit_state.flags = 0;
- for (i = 0; i < 3; i++) {
- c = &pit->pit_state.channels[i];
- c->mode = 0xff;
- c->gate = (i != 2);
- pit_load_count(pit->kvm, i, 0);
- }
- mutex_unlock(&pit->pit_state.lock);
-
- atomic_set(&pit->pit_state.pit_timer.pending, 0);
- pit->pit_state.irq_ack = 1;
-}
-
-static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
-{
- struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
-
- if (!mask) {
- atomic_set(&pit->pit_state.pit_timer.pending, 0);
- pit->pit_state.irq_ack = 1;
- }
-}
-
-static const struct kvm_io_device_ops pit_dev_ops = {
- .read = pit_ioport_read,
- .write = pit_ioport_write,
-};
-
-static const struct kvm_io_device_ops speaker_dev_ops = {
- .read = speaker_ioport_read,
- .write = speaker_ioport_write,
-};
-
-/* Caller must hold slots_lock */
-struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
-{
- struct kvm_pit *pit;
- struct kvm_kpit_state *pit_state;
- int ret;
-
- pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
- if (!pit)
- return NULL;
-
- pit->irq_source_id = kvm_request_irq_source_id(kvm);
- if (pit->irq_source_id < 0) {
- kfree(pit);
- return NULL;
- }
-
- mutex_init(&pit->pit_state.lock);
- mutex_lock(&pit->pit_state.lock);
- spin_lock_init(&pit->pit_state.inject_lock);
-
- pit->wq = create_singlethread_workqueue("kvm-pit-wq");
- if (!pit->wq) {
- mutex_unlock(&pit->pit_state.lock);
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
- kfree(pit);
- return NULL;
- }
- INIT_WORK(&pit->expired, pit_do_work);
-
- kvm->arch.vpit = pit;
- pit->kvm = kvm;
-
- pit_state = &pit->pit_state;
- pit_state->pit = pit;
- hrtimer_init(&pit_state->pit_timer.timer,
- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
- pit_state->irq_ack_notifier.gsi = 0;
- pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
- kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
- pit_state->pit_timer.reinject = true;
- mutex_unlock(&pit->pit_state.lock);
-
- kvm_pit_reset(pit);
-
- pit->mask_notifier.func = pit_mask_notifer;
- kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
-
- kvm_iodevice_init(&pit->dev, &pit_dev_ops);
- ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
- KVM_PIT_MEM_LENGTH, &pit->dev);
- if (ret < 0)
- goto fail;
-
- if (flags & KVM_PIT_SPEAKER_DUMMY) {
- kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
- ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
- KVM_SPEAKER_BASE_ADDRESS, 4,
- &pit->speaker_dev);
- if (ret < 0)
- goto fail_unregister;
- }
-
- return pit;
-
-fail_unregister:
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
-
-fail:
- kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
- kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
- destroy_workqueue(pit->wq);
- kfree(pit);
- return NULL;
-}
-
-void kvm_free_pit(struct kvm *kvm)
-{
- struct hrtimer *timer;
-
- if (kvm->arch.vpit) {
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev);
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
- &kvm->arch.vpit->speaker_dev);
- kvm_unregister_irq_mask_notifier(kvm, 0,
- &kvm->arch.vpit->mask_notifier);
- kvm_unregister_irq_ack_notifier(kvm,
- &kvm->arch.vpit->pit_state.irq_ack_notifier);
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
- hrtimer_cancel(timer);
- cancel_work_sync(&kvm->arch.vpit->expired);
- kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- destroy_workqueue(kvm->arch.vpit->wq);
- kfree(kvm->arch.vpit);
- }
-}
diff --git a/ANDROID_3.4.5/arch/x86/kvm/i8254.h b/ANDROID_3.4.5/arch/x86/kvm/i8254.h
deleted file mode 100644
index 51a97426..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/i8254.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#ifndef __I8254_H
-#define __I8254_H
-
-#include "iodev.h"
-
-struct kvm_kpit_channel_state {
- u32 count; /* can be 65536 */
- u16 latched_count;
- u8 count_latched;
- u8 status_latched;
- u8 status;
- u8 read_state;
- u8 write_state;
- u8 write_latch;
- u8 rw_mode;
- u8 mode;
- u8 bcd; /* not supported */
- u8 gate; /* timer start */
- ktime_t count_load_time;
-};
-
-struct kvm_kpit_state {
- struct kvm_kpit_channel_state channels[3];
- u32 flags;
- struct kvm_timer pit_timer;
- bool is_periodic;
- u32 speaker_data_on;
- struct mutex lock;
- struct kvm_pit *pit;
- spinlock_t inject_lock;
- unsigned long irq_ack;
- struct kvm_irq_ack_notifier irq_ack_notifier;
-};
-
-struct kvm_pit {
- struct kvm_io_device dev;
- struct kvm_io_device speaker_dev;
- struct kvm *kvm;
- struct kvm_kpit_state pit_state;
- int irq_source_id;
- struct kvm_irq_mask_notifier mask_notifier;
- struct workqueue_struct *wq;
- struct work_struct expired;
-};
-
-#define KVM_PIT_BASE_ADDRESS 0x40
-#define KVM_SPEAKER_BASE_ADDRESS 0x61
-#define KVM_PIT_MEM_LENGTH 4
-#define KVM_PIT_FREQ 1193181
-#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
-#define KVM_PIT_CHANNEL_MASK 0x3
-
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start);
-struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
-void kvm_free_pit(struct kvm *kvm);
-void kvm_pit_reset(struct kvm_pit *pit);
-
-#endif
diff --git a/ANDROID_3.4.5/arch/x86/kvm/i8259.c b/ANDROID_3.4.5/arch/x86/kvm/i8259.c
deleted file mode 100644
index 81cf4fa4..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/i8259.c
+++ /dev/null
@@ -1,664 +0,0 @@
-/*
- * 8259 interrupt controller emulation
- *
- * Copyright (c) 2003-2004 Fabrice Bellard
- * Copyright (c) 2007 Intel Corporation
- * Copyright 2009 Red Hat, Inc. and/or its affiliates.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- * Authors:
- * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- * Port from Qemu.
- */
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/bitops.h>
-#include "irq.h"
-
-#include <linux/kvm_host.h>
-#include "trace.h"
-
-#define pr_pic_unimpl(fmt, ...) \
- pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__)
-
-static void pic_irq_request(struct kvm *kvm, int level);
-
-static void pic_lock(struct kvm_pic *s)
- __acquires(&s->lock)
-{
- spin_lock(&s->lock);
-}
-
-static void pic_unlock(struct kvm_pic *s)
- __releases(&s->lock)
-{
- bool wakeup = s->wakeup_needed;
- struct kvm_vcpu *vcpu, *found = NULL;
- int i;
-
- s->wakeup_needed = false;
-
- spin_unlock(&s->lock);
-
- if (wakeup) {
- kvm_for_each_vcpu(i, vcpu, s->kvm) {
- if (kvm_apic_accept_pic_intr(vcpu)) {
- found = vcpu;
- break;
- }
- }
-
- if (!found)
- return;
-
- kvm_make_request(KVM_REQ_EVENT, found);
- kvm_vcpu_kick(found);
- }
-}
-
-static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
-{
- s->isr &= ~(1 << irq);
- if (s != &s->pics_state->pics[0])
- irq += 8;
- /*
- * We are dropping lock while calling ack notifiers since ack
- * notifier callbacks for assigned devices call into PIC recursively.
- * Other interrupt may be delivered to PIC while lock is dropped but
- * it should be safe since PIC state is already updated at this stage.
- */
- pic_unlock(s->pics_state);
- kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
- pic_lock(s->pics_state);
-}
-
-/*
- * set irq level. If an edge is detected, then the IRR is set to 1
- */
-static inline int pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
-{
- int mask, ret = 1;
- mask = 1 << irq;
- if (s->elcr & mask) /* level triggered */
- if (level) {
- ret = !(s->irr & mask);
- s->irr |= mask;
- s->last_irr |= mask;
- } else {
- s->irr &= ~mask;
- s->last_irr &= ~mask;
- }
- else /* edge triggered */
- if (level) {
- if ((s->last_irr & mask) == 0) {
- ret = !(s->irr & mask);
- s->irr |= mask;
- }
- s->last_irr |= mask;
- } else
- s->last_irr &= ~mask;
-
- return (s->imr & mask) ? -1 : ret;
-}
-
-/*
- * return the highest priority found in mask (highest = smallest
- * number). Return 8 if no irq
- */
-static inline int get_priority(struct kvm_kpic_state *s, int mask)
-{
- int priority;
- if (mask == 0)
- return 8;
- priority = 0;
- while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0)
- priority++;
- return priority;
-}
-
-/*
- * return the pic wanted interrupt. return -1 if none
- */
-static int pic_get_irq(struct kvm_kpic_state *s)
-{
- int mask, cur_priority, priority;
-
- mask = s->irr & ~s->imr;
- priority = get_priority(s, mask);
- if (priority == 8)
- return -1;
- /*
- * compute current priority. If special fully nested mode on the
- * master, the IRQ coming from the slave is not taken into account
- * for the priority computation.
- */
- mask = s->isr;
- if (s->special_fully_nested_mode && s == &s->pics_state->pics[0])
- mask &= ~(1 << 2);
- cur_priority = get_priority(s, mask);
- if (priority < cur_priority)
- /*
- * higher priority found: an irq should be generated
- */
- return (priority + s->priority_add) & 7;
- else
- return -1;
-}
-
-/*
- * raise irq to CPU if necessary. must be called every time the active
- * irq may change
- */
-static void pic_update_irq(struct kvm_pic *s)
-{
- int irq2, irq;
-
- irq2 = pic_get_irq(&s->pics[1]);
- if (irq2 >= 0) {
- /*
- * if irq request by slave pic, signal master PIC
- */
- pic_set_irq1(&s->pics[0], 2, 1);
- pic_set_irq1(&s->pics[0], 2, 0);
- }
- irq = pic_get_irq(&s->pics[0]);
- pic_irq_request(s->kvm, irq >= 0);
-}
-
-void kvm_pic_update_irq(struct kvm_pic *s)
-{
- pic_lock(s);
- pic_update_irq(s);
- pic_unlock(s);
-}
-
-int kvm_pic_set_irq(void *opaque, int irq, int level)
-{
- struct kvm_pic *s = opaque;
- int ret = -1;
-
- pic_lock(s);
- if (irq >= 0 && irq < PIC_NUM_PINS) {
- ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
- pic_update_irq(s);
- trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
- s->pics[irq >> 3].imr, ret == 0);
- }
- pic_unlock(s);
-
- return ret;
-}
-
-/*
- * acknowledge interrupt 'irq'
- */
-static inline void pic_intack(struct kvm_kpic_state *s, int irq)
-{
- s->isr |= 1 << irq;
- /*
- * We don't clear a level sensitive interrupt here
- */
- if (!(s->elcr & (1 << irq)))
- s->irr &= ~(1 << irq);
-
- if (s->auto_eoi) {
- if (s->rotate_on_auto_eoi)
- s->priority_add = (irq + 1) & 7;
- pic_clear_isr(s, irq);
- }
-
-}
-
-int kvm_pic_read_irq(struct kvm *kvm)
-{
- int irq, irq2, intno;
- struct kvm_pic *s = pic_irqchip(kvm);
-
- pic_lock(s);
- irq = pic_get_irq(&s->pics[0]);
- if (irq >= 0) {
- pic_intack(&s->pics[0], irq);
- if (irq == 2) {
- irq2 = pic_get_irq(&s->pics[1]);
- if (irq2 >= 0)
- pic_intack(&s->pics[1], irq2);
- else
- /*
- * spurious IRQ on slave controller
- */
- irq2 = 7;
- intno = s->pics[1].irq_base + irq2;
- irq = irq2 + 8;
- } else
- intno = s->pics[0].irq_base + irq;
- } else {
- /*
- * spurious IRQ on host controller
- */
- irq = 7;
- intno = s->pics[0].irq_base + irq;
- }
- pic_update_irq(s);
- pic_unlock(s);
-
- return intno;
-}
-
-void kvm_pic_reset(struct kvm_kpic_state *s)
-{
- int irq, i;
- struct kvm_vcpu *vcpu;
- u8 irr = s->irr, isr = s->imr;
- bool found = false;
-
- s->last_irr = 0;
- s->irr = 0;
- s->imr = 0;
- s->isr = 0;
- s->priority_add = 0;
- s->irq_base = 0;
- s->read_reg_select = 0;
- s->poll = 0;
- s->special_mask = 0;
- s->init_state = 0;
- s->auto_eoi = 0;
- s->rotate_on_auto_eoi = 0;
- s->special_fully_nested_mode = 0;
- s->init4 = 0;
-
- kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
- if (kvm_apic_accept_pic_intr(vcpu)) {
- found = true;
- break;
- }
-
-
- if (!found)
- return;
-
- for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
- if (irr & (1 << irq) || isr & (1 << irq))
- pic_clear_isr(s, irq);
-}
-
-static void pic_ioport_write(void *opaque, u32 addr, u32 val)
-{
- struct kvm_kpic_state *s = opaque;
- int priority, cmd, irq;
-
- addr &= 1;
- if (addr == 0) {
- if (val & 0x10) {
- s->init4 = val & 1;
- s->last_irr = 0;
- s->irr &= s->elcr;
- s->imr = 0;
- s->priority_add = 0;
- s->special_mask = 0;
- s->read_reg_select = 0;
- if (!s->init4) {
- s->special_fully_nested_mode = 0;
- s->auto_eoi = 0;
- }
- s->init_state = 1;
- if (val & 0x02)
- pr_pic_unimpl("single mode not supported");
- if (val & 0x08)
- pr_pic_unimpl(
- "level sensitive irq not supported");
- } else if (val & 0x08) {
- if (val & 0x04)
- s->poll = 1;
- if (val & 0x02)
- s->read_reg_select = val & 1;
- if (val & 0x40)
- s->special_mask = (val >> 5) & 1;
- } else {
- cmd = val >> 5;
- switch (cmd) {
- case 0:
- case 4:
- s->rotate_on_auto_eoi = cmd >> 2;
- break;
- case 1: /* end of interrupt */
- case 5:
- priority = get_priority(s, s->isr);
- if (priority != 8) {
- irq = (priority + s->priority_add) & 7;
- if (cmd == 5)
- s->priority_add = (irq + 1) & 7;
- pic_clear_isr(s, irq);
- pic_update_irq(s->pics_state);
- }
- break;
- case 3:
- irq = val & 7;
- pic_clear_isr(s, irq);
- pic_update_irq(s->pics_state);
- break;
- case 6:
- s->priority_add = (val + 1) & 7;
- pic_update_irq(s->pics_state);
- break;
- case 7:
- irq = val & 7;
- s->priority_add = (irq + 1) & 7;
- pic_clear_isr(s, irq);
- pic_update_irq(s->pics_state);
- break;
- default:
- break; /* no operation */
- }
- }
- } else
- switch (s->init_state) {
- case 0: { /* normal mode */
- u8 imr_diff = s->imr ^ val,
- off = (s == &s->pics_state->pics[0]) ? 0 : 8;
- s->imr = val;
- for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
- if (imr_diff & (1 << irq))
- kvm_fire_mask_notifiers(
- s->pics_state->kvm,
- SELECT_PIC(irq + off),
- irq + off,
- !!(s->imr & (1 << irq)));
- pic_update_irq(s->pics_state);
- break;
- }
- case 1:
- s->irq_base = val & 0xf8;
- s->init_state = 2;
- break;
- case 2:
- if (s->init4)
- s->init_state = 3;
- else
- s->init_state = 0;
- break;
- case 3:
- s->special_fully_nested_mode = (val >> 4) & 1;
- s->auto_eoi = (val >> 1) & 1;
- s->init_state = 0;
- break;
- }
-}
-
-static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
-{
- int ret;
-
- ret = pic_get_irq(s);
- if (ret >= 0) {
- if (addr1 >> 7) {
- s->pics_state->pics[0].isr &= ~(1 << 2);
- s->pics_state->pics[0].irr &= ~(1 << 2);
- }
- s->irr &= ~(1 << ret);
- pic_clear_isr(s, ret);
- if (addr1 >> 7 || ret != 2)
- pic_update_irq(s->pics_state);
- } else {
- ret = 0x07;
- pic_update_irq(s->pics_state);
- }
-
- return ret;
-}
-
-static u32 pic_ioport_read(void *opaque, u32 addr1)
-{
- struct kvm_kpic_state *s = opaque;
- unsigned int addr;
- int ret;
-
- addr = addr1;
- addr &= 1;
- if (s->poll) {
- ret = pic_poll_read(s, addr1);
- s->poll = 0;
- } else
- if (addr == 0)
- if (s->read_reg_select)
- ret = s->isr;
- else
- ret = s->irr;
- else
- ret = s->imr;
- return ret;
-}
-
-static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
-{
- struct kvm_kpic_state *s = opaque;
- s->elcr = val & s->elcr_mask;
-}
-
-static u32 elcr_ioport_read(void *opaque, u32 addr1)
-{
- struct kvm_kpic_state *s = opaque;
- return s->elcr;
-}
-
-static int picdev_in_range(gpa_t addr)
-{
- switch (addr) {
- case 0x20:
- case 0x21:
- case 0xa0:
- case 0xa1:
- case 0x4d0:
- case 0x4d1:
- return 1;
- default:
- return 0;
- }
-}
-
-static int picdev_write(struct kvm_pic *s,
- gpa_t addr, int len, const void *val)
-{
- unsigned char data = *(unsigned char *)val;
- if (!picdev_in_range(addr))
- return -EOPNOTSUPP;
-
- if (len != 1) {
- pr_pic_unimpl("non byte write\n");
- return 0;
- }
- pic_lock(s);
- switch (addr) {
- case 0x20:
- case 0x21:
- case 0xa0:
- case 0xa1:
- pic_ioport_write(&s->pics[addr >> 7], addr, data);
- break;
- case 0x4d0:
- case 0x4d1:
- elcr_ioport_write(&s->pics[addr & 1], addr, data);
- break;
- }
- pic_unlock(s);
- return 0;
-}
-
-static int picdev_read(struct kvm_pic *s,
- gpa_t addr, int len, void *val)
-{
- unsigned char data = 0;
- if (!picdev_in_range(addr))
- return -EOPNOTSUPP;
-
- if (len != 1) {
- pr_pic_unimpl("non byte read\n");
- return 0;
- }
- pic_lock(s);
- switch (addr) {
- case 0x20:
- case 0x21:
- case 0xa0:
- case 0xa1:
- data = pic_ioport_read(&s->pics[addr >> 7], addr);
- break;
- case 0x4d0:
- case 0x4d1:
- data = elcr_ioport_read(&s->pics[addr & 1], addr);
- break;
- }
- *(unsigned char *)val = data;
- pic_unlock(s);
- return 0;
-}
-
-static int picdev_master_write(struct kvm_io_device *dev,
- gpa_t addr, int len, const void *val)
-{
- return picdev_write(container_of(dev, struct kvm_pic, dev_master),
- addr, len, val);
-}
-
-static int picdev_master_read(struct kvm_io_device *dev,
- gpa_t addr, int len, void *val)
-{
- return picdev_read(container_of(dev, struct kvm_pic, dev_master),
- addr, len, val);
-}
-
-static int picdev_slave_write(struct kvm_io_device *dev,
- gpa_t addr, int len, const void *val)
-{
- return picdev_write(container_of(dev, struct kvm_pic, dev_slave),
- addr, len, val);
-}
-
-static int picdev_slave_read(struct kvm_io_device *dev,
- gpa_t addr, int len, void *val)
-{
- return picdev_read(container_of(dev, struct kvm_pic, dev_slave),
- addr, len, val);
-}
-
-static int picdev_eclr_write(struct kvm_io_device *dev,
- gpa_t addr, int len, const void *val)
-{
- return picdev_write(container_of(dev, struct kvm_pic, dev_eclr),
- addr, len, val);
-}
-
-static int picdev_eclr_read(struct kvm_io_device *dev,
- gpa_t addr, int len, void *val)
-{
- return picdev_read(container_of(dev, struct kvm_pic, dev_eclr),
- addr, len, val);
-}
-
-/*
- * callback when PIC0 irq status changed
- */
-static void pic_irq_request(struct kvm *kvm, int level)
-{
- struct kvm_pic *s = pic_irqchip(kvm);
-
- if (!s->output)
- s->wakeup_needed = true;
- s->output = level;
-}
-
-static const struct kvm_io_device_ops picdev_master_ops = {
- .read = picdev_master_read,
- .write = picdev_master_write,
-};
-
-static const struct kvm_io_device_ops picdev_slave_ops = {
- .read = picdev_slave_read,
- .write = picdev_slave_write,
-};
-
-static const struct kvm_io_device_ops picdev_eclr_ops = {
- .read = picdev_eclr_read,
- .write = picdev_eclr_write,
-};
-
-struct kvm_pic *kvm_create_pic(struct kvm *kvm)
-{
- struct kvm_pic *s;
- int ret;
-
- s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
- if (!s)
- return NULL;
- spin_lock_init(&s->lock);
- s->kvm = kvm;
- s->pics[0].elcr_mask = 0xf8;
- s->pics[1].elcr_mask = 0xde;
- s->pics[0].pics_state = s;
- s->pics[1].pics_state = s;
-
- /*
- * Initialize PIO device
- */
- kvm_iodevice_init(&s->dev_master, &picdev_master_ops);
- kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops);
- kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops);
- mutex_lock(&kvm->slots_lock);
- ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2,
- &s->dev_master);
- if (ret < 0)
- goto fail_unlock;
-
- ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave);
- if (ret < 0)
- goto fail_unreg_2;
-
- ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr);
- if (ret < 0)
- goto fail_unreg_1;
-
- mutex_unlock(&kvm->slots_lock);
-
- return s;
-
-fail_unreg_1:
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave);
-
-fail_unreg_2:
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master);
-
-fail_unlock:
- mutex_unlock(&kvm->slots_lock);
-
- kfree(s);
-
- return NULL;
-}
-
-void kvm_destroy_pic(struct kvm *kvm)
-{
- struct kvm_pic *vpic = kvm->arch.vpic;
-
- if (vpic) {
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_master);
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_slave);
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_eclr);
- kvm->arch.vpic = NULL;
- kfree(vpic);
- }
-}
diff --git a/ANDROID_3.4.5/arch/x86/kvm/irq.c b/ANDROID_3.4.5/arch/x86/kvm/irq.c
deleted file mode 100644
index 7e06ba16..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/irq.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * irq.c: API for in kernel interrupt controller
- * Copyright (c) 2007, Intel Corporation.
- * Copyright 2009 Red Hat, Inc. and/or its affiliates.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- * Authors:
- * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *
- */
-
-#include <linux/module.h>
-#include <linux/kvm_host.h>
-
-#include "irq.h"
-#include "i8254.h"
-#include "x86.h"
-
-/*
- * check if there are pending timer events
- * to be processed.
- */
-int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
-{
- return apic_has_pending_timer(vcpu);
-}
-EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
-
-/*
- * check if there is pending interrupt without
- * intack.
- */
-int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
-{
- struct kvm_pic *s;
-
- if (!irqchip_in_kernel(v->kvm))
- return v->arch.interrupt.pending;
-
- if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
- if (kvm_apic_accept_pic_intr(v)) {
- s = pic_irqchip(v->kvm); /* PIC */
- return s->output;
- } else
- return 0;
- }
- return 1;
-}
-EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
-
-/*
- * Read pending interrupt vector and intack.
- */
-int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
-{
- struct kvm_pic *s;
- int vector;
-
- if (!irqchip_in_kernel(v->kvm))
- return v->arch.interrupt.nr;
-
- vector = kvm_get_apic_interrupt(v); /* APIC */
- if (vector == -1) {
- if (kvm_apic_accept_pic_intr(v)) {
- s = pic_irqchip(v->kvm);
- s->output = 0; /* PIC */
- vector = kvm_pic_read_irq(v->kvm);
- }
- }
- return vector;
-}
-EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
-
-void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
-{
- kvm_inject_apic_timer_irqs(vcpu);
- /* TODO: PIT, RTC etc. */
-}
-EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
-
-void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
-{
- __kvm_migrate_apic_timer(vcpu);
- __kvm_migrate_pit_timer(vcpu);
-}
diff --git a/ANDROID_3.4.5/arch/x86/kvm/irq.h b/ANDROID_3.4.5/arch/x86/kvm/irq.h
deleted file mode 100644
index 2086f2bf..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/irq.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * irq.h: in kernel interrupt controller related definitions
- * Copyright (c) 2007, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- * Authors:
- * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *
- */
-
-#ifndef __IRQ_H
-#define __IRQ_H
-
-#include <linux/mm_types.h>
-#include <linux/hrtimer.h>
-#include <linux/kvm_host.h>
-#include <linux/spinlock.h>
-
-#include "iodev.h"
-#include "ioapic.h"
-#include "lapic.h"
-
-#define PIC_NUM_PINS 16
-#define SELECT_PIC(irq) \
- ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE)
-
-struct kvm;
-struct kvm_vcpu;
-
-struct kvm_kpic_state {
- u8 last_irr; /* edge detection */
- u8 irr; /* interrupt request register */
- u8 imr; /* interrupt mask register */
- u8 isr; /* interrupt service register */
- u8 priority_add; /* highest irq priority */
- u8 irq_base;
- u8 read_reg_select;
- u8 poll;
- u8 special_mask;
- u8 init_state;
- u8 auto_eoi;
- u8 rotate_on_auto_eoi;
- u8 special_fully_nested_mode;
- u8 init4; /* true if 4 byte init */
- u8 elcr; /* PIIX edge/trigger selection */
- u8 elcr_mask;
- u8 isr_ack; /* interrupt ack detection */
- struct kvm_pic *pics_state;
-};
-
-struct kvm_pic {
- spinlock_t lock;
- bool wakeup_needed;
- unsigned pending_acks;
- struct kvm *kvm;
- struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
- int output; /* intr from master PIC */
- struct kvm_io_device dev_master;
- struct kvm_io_device dev_slave;
- struct kvm_io_device dev_eclr;
- void (*ack_notifier)(void *opaque, int irq);
- unsigned long irq_states[16];
-};
-
-struct kvm_pic *kvm_create_pic(struct kvm *kvm);
-void kvm_destroy_pic(struct kvm *kvm);
-int kvm_pic_read_irq(struct kvm *kvm);
-void kvm_pic_update_irq(struct kvm_pic *s);
-
-static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
-{
- return kvm->arch.vpic;
-}
-
-static inline int irqchip_in_kernel(struct kvm *kvm)
-{
- int ret;
-
- ret = (pic_irqchip(kvm) != NULL);
- smp_rmb();
- return ret;
-}
-
-void kvm_pic_reset(struct kvm_kpic_state *s);
-
-void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
-void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
-void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
-void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
-
-int apic_has_pending_timer(struct kvm_vcpu *vcpu);
-
-#endif
diff --git a/ANDROID_3.4.5/arch/x86/kvm/kvm_cache_regs.h b/ANDROID_3.4.5/arch/x86/kvm/kvm_cache_regs.h
deleted file mode 100644
index 544076c4..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/kvm_cache_regs.h
+++ /dev/null
@@ -1,102 +0,0 @@
-#ifndef ASM_KVM_CACHE_REGS_H
-#define ASM_KVM_CACHE_REGS_H
-
-#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
-#define KVM_POSSIBLE_CR4_GUEST_BITS \
- (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXMMEXCPT | X86_CR4_PGE)
-
-static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
- enum kvm_reg reg)
-{
- if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
- kvm_x86_ops->cache_reg(vcpu, reg);
-
- return vcpu->arch.regs[reg];
-}
-
-static inline void kvm_register_write(struct kvm_vcpu *vcpu,
- enum kvm_reg reg,
- unsigned long val)
-{
- vcpu->arch.regs[reg] = val;
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
-}
-
-static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
-{
- return kvm_register_read(vcpu, VCPU_REGS_RIP);
-}
-
-static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
-{
- kvm_register_write(vcpu, VCPU_REGS_RIP, val);
-}
-
-static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
-{
- might_sleep(); /* on svm */
-
- if (!test_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_avail))
- kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
-
- return vcpu->arch.walk_mmu->pdptrs[index];
-}
-
-static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
-{
- ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
- if (tmask & vcpu->arch.cr0_guest_owned_bits)
- kvm_x86_ops->decache_cr0_guest_bits(vcpu);
- return vcpu->arch.cr0 & mask;
-}
-
-static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
-{
- return kvm_read_cr0_bits(vcpu, ~0UL);
-}
-
-static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
-{
- ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
- if (tmask & vcpu->arch.cr4_guest_owned_bits)
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
- return vcpu->arch.cr4 & mask;
-}
-
-static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
-{
- if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
- kvm_x86_ops->decache_cr3(vcpu);
- return vcpu->arch.cr3;
-}
-
-static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
-{
- return kvm_read_cr4_bits(vcpu, ~0UL);
-}
-
-static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
-{
- return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u)
- | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
-}
-
-static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.hflags |= HF_GUEST_MASK;
-}
-
-static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.hflags &= ~HF_GUEST_MASK;
-}
-
-static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.hflags & HF_GUEST_MASK;
-}
-
-#endif
diff --git a/ANDROID_3.4.5/arch/x86/kvm/kvm_timer.h b/ANDROID_3.4.5/arch/x86/kvm/kvm_timer.h
deleted file mode 100644
index 497dbaa3..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/kvm_timer.h
+++ /dev/null
@@ -1,18 +0,0 @@
-
-struct kvm_timer {
- struct hrtimer timer;
- s64 period; /* unit: ns */
- u32 timer_mode_mask;
- u64 tscdeadline;
- atomic_t pending; /* accumulated triggered timers */
- bool reinject;
- struct kvm_timer_ops *t_ops;
- struct kvm *kvm;
- struct kvm_vcpu *vcpu;
-};
-
-struct kvm_timer_ops {
- bool (*is_periodic)(struct kvm_timer *);
-};
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
diff --git a/ANDROID_3.4.5/arch/x86/kvm/lapic.c b/ANDROID_3.4.5/arch/x86/kvm/lapic.c
deleted file mode 100644
index 85843228..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/lapic.c
+++ /dev/null
@@ -1,1387 +0,0 @@
-
-/*
- * Local APIC virtualization
- *
- * Copyright (C) 2006 Qumranet, Inc.
- * Copyright (C) 2007 Novell
- * Copyright (C) 2007 Intel
- * Copyright 2009 Red Hat, Inc. and/or its affiliates.
- *
- * Authors:
- * Dor Laor <dor.laor@qumranet.com>
- * Gregory Haskins <ghaskins@novell.com>
- * Yaozu (Eddie) Dong <eddie.dong@intel.com>
- *
- * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- */
-
-#include <linux/kvm_host.h>
-#include <linux/kvm.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/smp.h>
-#include <linux/hrtimer.h>
-#include <linux/io.h>
-#include <linux/module.h>
-#include <linux/math64.h>
-#include <linux/slab.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/page.h>
-#include <asm/current.h>
-#include <asm/apicdef.h>
-#include <linux/atomic.h>
-#include "kvm_cache_regs.h"
-#include "irq.h"
-#include "trace.h"
-#include "x86.h"
-#include "cpuid.h"
-
-#ifndef CONFIG_X86_64
-#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
-#else
-#define mod_64(x, y) ((x) % (y))
-#endif
-
-#define PRId64 "d"
-#define PRIx64 "llx"
-#define PRIu64 "u"
-#define PRIo64 "o"
-
-#define APIC_BUS_CYCLE_NS 1
-
-/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
-#define apic_debug(fmt, arg...)
-
-#define APIC_LVT_NUM 6
-/* 14 is the version for Xeon and Pentium 8.4.8*/
-#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16))
-#define LAPIC_MMIO_LENGTH (1 << 12)
-/* followed define is not in apicdef.h */
-#define APIC_SHORT_MASK 0xc0000
-#define APIC_DEST_NOSHORT 0x0
-#define APIC_DEST_MASK 0x800
-#define MAX_APIC_VECTOR 256
-
-#define VEC_POS(v) ((v) & (32 - 1))
-#define REG_POS(v) (((v) >> 5) << 4)
-
-static unsigned int min_timer_period_us = 500;
-module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
-
-static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
-{
- return *((u32 *) (apic->regs + reg_off));
-}
-
-static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
-{
- *((u32 *) (apic->regs + reg_off)) = val;
-}
-
-static inline int apic_test_and_set_vector(int vec, void *bitmap)
-{
- return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline int apic_test_and_clear_vector(int vec, void *bitmap)
-{
- return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline void apic_set_vector(int vec, void *bitmap)
-{
- set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline void apic_clear_vector(int vec, void *bitmap)
-{
- clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline int apic_hw_enabled(struct kvm_lapic *apic)
-{
- return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
-}
-
-static inline int apic_sw_enabled(struct kvm_lapic *apic)
-{
- return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
-}
-
-static inline int apic_enabled(struct kvm_lapic *apic)
-{
- return apic_sw_enabled(apic) && apic_hw_enabled(apic);
-}
-
-#define LVT_MASK \
- (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
-
-#define LINT_MASK \
- (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
- APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
-
-static inline int kvm_apic_id(struct kvm_lapic *apic)
-{
- return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
-}
-
-static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
-{
- return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
-}
-
-static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
-{
- return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
-}
-
-static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
-{
- return ((apic_get_reg(apic, APIC_LVTT) &
- apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT);
-}
-
-static inline int apic_lvtt_period(struct kvm_lapic *apic)
-{
- return ((apic_get_reg(apic, APIC_LVTT) &
- apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC);
-}
-
-static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
-{
- return ((apic_get_reg(apic, APIC_LVTT) &
- apic->lapic_timer.timer_mode_mask) ==
- APIC_LVT_TIMER_TSCDEADLINE);
-}
-
-static inline int apic_lvt_nmi_mode(u32 lvt_val)
-{
- return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
-}
-
-void kvm_apic_set_version(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
- struct kvm_cpuid_entry2 *feat;
- u32 v = APIC_VERSION;
-
- if (!irqchip_in_kernel(vcpu->kvm))
- return;
-
- feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
- if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))))
- v |= APIC_LVR_DIRECTED_EOI;
- apic_set_reg(apic, APIC_LVR, v);
-}
-
-static inline int apic_x2apic_mode(struct kvm_lapic *apic)
-{
- return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
-}
-
-static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
- LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */
- LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
- LVT_MASK | APIC_MODE_MASK, /* LVTPC */
- LINT_MASK, LINT_MASK, /* LVT0-1 */
- LVT_MASK /* LVTERR */
-};
-
-static int find_highest_vector(void *bitmap)
-{
- u32 *word = bitmap;
- int word_offset = MAX_APIC_VECTOR >> 5;
-
- while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
- continue;
-
- if (likely(!word_offset && !word[0]))
- return -1;
- else
- return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
-}
-
-static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
-{
- apic->irr_pending = true;
- return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
-}
-
-static inline int apic_search_irr(struct kvm_lapic *apic)
-{
- return find_highest_vector(apic->regs + APIC_IRR);
-}
-
-static inline int apic_find_highest_irr(struct kvm_lapic *apic)
-{
- int result;
-
- if (!apic->irr_pending)
- return -1;
-
- result = apic_search_irr(apic);
- ASSERT(result == -1 || result >= 16);
-
- return result;
-}
-
-static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
-{
- apic->irr_pending = false;
- apic_clear_vector(vec, apic->regs + APIC_IRR);
- if (apic_search_irr(apic) != -1)
- apic->irr_pending = true;
-}
-
-int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
- int highest_irr;
-
- /* This may race with setting of irr in __apic_accept_irq() and
- * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq
- * will cause vmexit immediately and the value will be recalculated
- * on the next vmentry.
- */
- if (!apic)
- return 0;
- highest_irr = apic_find_highest_irr(apic);
-
- return highest_irr;
-}
-
-static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
- int vector, int level, int trig_mode);
-
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
- irq->level, irq->trig_mode);
-}
-
-static inline int apic_find_highest_isr(struct kvm_lapic *apic)
-{
- int result;
-
- result = find_highest_vector(apic->regs + APIC_ISR);
- ASSERT(result == -1 || result >= 16);
-
- return result;
-}
-
-static void apic_update_ppr(struct kvm_lapic *apic)
-{
- u32 tpr, isrv, ppr, old_ppr;
- int isr;
-
- old_ppr = apic_get_reg(apic, APIC_PROCPRI);
- tpr = apic_get_reg(apic, APIC_TASKPRI);
- isr = apic_find_highest_isr(apic);
- isrv = (isr != -1) ? isr : 0;
-
- if ((tpr & 0xf0) >= (isrv & 0xf0))
- ppr = tpr & 0xff;
- else
- ppr = isrv & 0xf0;
-
- apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
- apic, ppr, isr, isrv);
-
- if (old_ppr != ppr) {
- apic_set_reg(apic, APIC_PROCPRI, ppr);
- if (ppr < old_ppr)
- kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
- }
-}
-
-static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
-{
- apic_set_reg(apic, APIC_TASKPRI, tpr);
- apic_update_ppr(apic);
-}
-
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
-{
- return dest == 0xff || kvm_apic_id(apic) == dest;
-}
-
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
-{
- int result = 0;
- u32 logical_id;
-
- if (apic_x2apic_mode(apic)) {
- logical_id = apic_get_reg(apic, APIC_LDR);
- return logical_id & mda;
- }
-
- logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
-
- switch (apic_get_reg(apic, APIC_DFR)) {
- case APIC_DFR_FLAT:
- if (logical_id & mda)
- result = 1;
- break;
- case APIC_DFR_CLUSTER:
- if (((logical_id >> 4) == (mda >> 0x4))
- && (logical_id & mda & 0xf))
- result = 1;
- break;
- default:
- apic_debug("Bad DFR vcpu %d: %08x\n",
- apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
- break;
- }
-
- return result;
-}
-
-int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
- int short_hand, int dest, int dest_mode)
-{
- int result = 0;
- struct kvm_lapic *target = vcpu->arch.apic;
-
- apic_debug("target %p, source %p, dest 0x%x, "
- "dest_mode 0x%x, short_hand 0x%x\n",
- target, source, dest, dest_mode, short_hand);
-
- ASSERT(target);
- switch (short_hand) {
- case APIC_DEST_NOSHORT:
- if (dest_mode == 0)
- /* Physical mode. */
- result = kvm_apic_match_physical_addr(target, dest);
- else
- /* Logical mode. */
- result = kvm_apic_match_logical_addr(target, dest);
- break;
- case APIC_DEST_SELF:
- result = (target == source);
- break;
- case APIC_DEST_ALLINC:
- result = 1;
- break;
- case APIC_DEST_ALLBUT:
- result = (target != source);
- break;
- default:
- apic_debug("kvm: apic: Bad dest shorthand value %x\n",
- short_hand);
- break;
- }
-
- return result;
-}
-
-/*
- * Add a pending IRQ into lapic.
- * Return 1 if successfully added and 0 if discarded.
- */
-static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
- int vector, int level, int trig_mode)
-{
- int result = 0;
- struct kvm_vcpu *vcpu = apic->vcpu;
-
- switch (delivery_mode) {
- case APIC_DM_LOWEST:
- vcpu->arch.apic_arb_prio++;
- case APIC_DM_FIXED:
- /* FIXME add logic for vcpu on reset */
- if (unlikely(!apic_enabled(apic)))
- break;
-
- if (trig_mode) {
- apic_debug("level trig mode for vector %d", vector);
- apic_set_vector(vector, apic->regs + APIC_TMR);
- } else
- apic_clear_vector(vector, apic->regs + APIC_TMR);
-
- result = !apic_test_and_set_irr(vector, apic);
- trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
- trig_mode, vector, !result);
- if (!result) {
- if (trig_mode)
- apic_debug("level trig mode repeatedly for "
- "vector %d", vector);
- break;
- }
-
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- kvm_vcpu_kick(vcpu);
- break;
-
- case APIC_DM_REMRD:
- apic_debug("Ignoring delivery mode 3\n");
- break;
-
- case APIC_DM_SMI:
- apic_debug("Ignoring guest SMI\n");
- break;
-
- case APIC_DM_NMI:
- result = 1;
- kvm_inject_nmi(vcpu);
- kvm_vcpu_kick(vcpu);
- break;
-
- case APIC_DM_INIT:
- if (!trig_mode || level) {
- result = 1;
- vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- kvm_vcpu_kick(vcpu);
- } else {
- apic_debug("Ignoring de-assert INIT to vcpu %d\n",
- vcpu->vcpu_id);
- }
- break;
-
- case APIC_DM_STARTUP:
- apic_debug("SIPI to vcpu %d vector 0x%02x\n",
- vcpu->vcpu_id, vector);
- if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
- result = 1;
- vcpu->arch.sipi_vector = vector;
- vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- kvm_vcpu_kick(vcpu);
- }
- break;
-
- case APIC_DM_EXTINT:
- /*
- * Should only be called by kvm_apic_local_deliver() with LVT0,
- * before NMI watchdog was enabled. Already handled by
- * kvm_apic_accept_pic_intr().
- */
- break;
-
- default:
- printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
- delivery_mode);
- break;
- }
- return result;
-}
-
-int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
-{
- return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
-}
-
-static void apic_set_eoi(struct kvm_lapic *apic)
-{
- int vector = apic_find_highest_isr(apic);
- int trigger_mode;
- /*
- * Not every write EOI will has corresponding ISR,
- * one example is when Kernel check timer on setup_IO_APIC
- */
- if (vector == -1)
- return;
-
- apic_clear_vector(vector, apic->regs + APIC_ISR);
- apic_update_ppr(apic);
-
- if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
- trigger_mode = IOAPIC_LEVEL_TRIG;
- else
- trigger_mode = IOAPIC_EDGE_TRIG;
- if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
- kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
- kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
-}
-
-static void apic_send_ipi(struct kvm_lapic *apic)
-{
- u32 icr_low = apic_get_reg(apic, APIC_ICR);
- u32 icr_high = apic_get_reg(apic, APIC_ICR2);
- struct kvm_lapic_irq irq;
-
- irq.vector = icr_low & APIC_VECTOR_MASK;
- irq.delivery_mode = icr_low & APIC_MODE_MASK;
- irq.dest_mode = icr_low & APIC_DEST_MASK;
- irq.level = icr_low & APIC_INT_ASSERT;
- irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
- irq.shorthand = icr_low & APIC_SHORT_MASK;
- if (apic_x2apic_mode(apic))
- irq.dest_id = icr_high;
- else
- irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
-
- trace_kvm_apic_ipi(icr_low, irq.dest_id);
-
- apic_debug("icr_high 0x%x, icr_low 0x%x, "
- "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
- "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
- icr_high, icr_low, irq.shorthand, irq.dest_id,
- irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
- irq.vector);
-
- kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
-}
-
-static u32 apic_get_tmcct(struct kvm_lapic *apic)
-{
- ktime_t remaining;
- s64 ns;
- u32 tmcct;
-
- ASSERT(apic != NULL);
-
- /* if initial count is 0, current count should also be 0 */
- if (apic_get_reg(apic, APIC_TMICT) == 0)
- return 0;
-
- remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
- if (ktime_to_ns(remaining) < 0)
- remaining = ktime_set(0, 0);
-
- ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
- tmcct = div64_u64(ns,
- (APIC_BUS_CYCLE_NS * apic->divide_count));
-
- return tmcct;
-}
-
-static void __report_tpr_access(struct kvm_lapic *apic, bool write)
-{
- struct kvm_vcpu *vcpu = apic->vcpu;
- struct kvm_run *run = vcpu->run;
-
- kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
- run->tpr_access.rip = kvm_rip_read(vcpu);
- run->tpr_access.is_write = write;
-}
-
-static inline void report_tpr_access(struct kvm_lapic *apic, bool write)
-{
- if (apic->vcpu->arch.tpr_access_reporting)
- __report_tpr_access(apic, write);
-}
-
-static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
-{
- u32 val = 0;
-
- if (offset >= LAPIC_MMIO_LENGTH)
- return 0;
-
- switch (offset) {
- case APIC_ID:
- if (apic_x2apic_mode(apic))
- val = kvm_apic_id(apic);
- else
- val = kvm_apic_id(apic) << 24;
- break;
- case APIC_ARBPRI:
- apic_debug("Access APIC ARBPRI register which is for P6\n");
- break;
-
- case APIC_TMCCT: /* Timer CCR */
- if (apic_lvtt_tscdeadline(apic))
- return 0;
-
- val = apic_get_tmcct(apic);
- break;
-
- case APIC_TASKPRI:
- report_tpr_access(apic, false);
- /* fall thru */
- default:
- apic_update_ppr(apic);
- val = apic_get_reg(apic, offset);
- break;
- }
-
- return val;
-}
-
-static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
-{
- return container_of(dev, struct kvm_lapic, dev);
-}
-
-static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
- void *data)
-{
- unsigned char alignment = offset & 0xf;
- u32 result;
- /* this bitmask has a bit cleared for each reserver register */
- static const u64 rmask = 0x43ff01ffffffe70cULL;
-
- if ((alignment + len) > 4) {
- apic_debug("KVM_APIC_READ: alignment error %x %d\n",
- offset, len);
- return 1;
- }
-
- if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) {
- apic_debug("KVM_APIC_READ: read reserved register %x\n",
- offset);
- return 1;
- }
-
- result = __apic_read(apic, offset & ~0xf);
-
- trace_kvm_apic_read(offset, result);
-
- switch (len) {
- case 1:
- case 2:
- case 4:
- memcpy(data, (char *)&result + alignment, len);
- break;
- default:
- printk(KERN_ERR "Local APIC read with len = %x, "
- "should be 1,2, or 4 instead\n", len);
- break;
- }
- return 0;
-}
-
-static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
-{
- return apic_hw_enabled(apic) &&
- addr >= apic->base_address &&
- addr < apic->base_address + LAPIC_MMIO_LENGTH;
-}
-
-static int apic_mmio_read(struct kvm_io_device *this,
- gpa_t address, int len, void *data)
-{
- struct kvm_lapic *apic = to_lapic(this);
- u32 offset = address - apic->base_address;
-
- if (!apic_mmio_in_range(apic, address))
- return -EOPNOTSUPP;
-
- apic_reg_read(apic, offset, len, data);
-
- return 0;
-}
-
-static void update_divide_count(struct kvm_lapic *apic)
-{
- u32 tmp1, tmp2, tdcr;
-
- tdcr = apic_get_reg(apic, APIC_TDCR);
- tmp1 = tdcr & 0xf;
- tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
- apic->divide_count = 0x1 << (tmp2 & 0x7);
-
- apic_debug("timer divide count is 0x%x\n",
- apic->divide_count);
-}
-
-static void start_apic_timer(struct kvm_lapic *apic)
-{
- ktime_t now;
- atomic_set(&apic->lapic_timer.pending, 0);
-
- if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
- /* lapic timer in oneshot or peroidic mode */
- now = apic->lapic_timer.timer.base->get_time();
- apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
- * APIC_BUS_CYCLE_NS * apic->divide_count;
-
- if (!apic->lapic_timer.period)
- return;
- /*
- * Do not allow the guest to program periodic timers with small
- * interval, since the hrtimers are not throttled by the host
- * scheduler.
- */
- if (apic_lvtt_period(apic)) {
- s64 min_period = min_timer_period_us * 1000LL;
-
- if (apic->lapic_timer.period < min_period) {
- pr_info_ratelimited(
- "kvm: vcpu %i: requested %lld ns "
- "lapic timer period limited to %lld ns\n",
- apic->vcpu->vcpu_id,
- apic->lapic_timer.period, min_period);
- apic->lapic_timer.period = min_period;
- }
- }
-
- hrtimer_start(&apic->lapic_timer.timer,
- ktime_add_ns(now, apic->lapic_timer.period),
- HRTIMER_MODE_ABS);
-
- apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
- PRIx64 ", "
- "timer initial count 0x%x, period %lldns, "
- "expire @ 0x%016" PRIx64 ".\n", __func__,
- APIC_BUS_CYCLE_NS, ktime_to_ns(now),
- apic_get_reg(apic, APIC_TMICT),
- apic->lapic_timer.period,
- ktime_to_ns(ktime_add_ns(now,
- apic->lapic_timer.period)));
- } else if (apic_lvtt_tscdeadline(apic)) {
- /* lapic timer in tsc deadline mode */
- u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
- u64 ns = 0;
- struct kvm_vcpu *vcpu = apic->vcpu;
- unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
- unsigned long flags;
-
- if (unlikely(!tscdeadline || !this_tsc_khz))
- return;
-
- local_irq_save(flags);
-
- now = apic->lapic_timer.timer.base->get_time();
- guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
- if (likely(tscdeadline > guest_tsc)) {
- ns = (tscdeadline - guest_tsc) * 1000000ULL;
- do_div(ns, this_tsc_khz);
- }
- hrtimer_start(&apic->lapic_timer.timer,
- ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
-
- local_irq_restore(flags);
- }
-}
-
-static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
-{
- int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0));
-
- if (apic_lvt_nmi_mode(lvt0_val)) {
- if (!nmi_wd_enabled) {
- apic_debug("Receive NMI setting on APIC_LVT0 "
- "for cpu %d\n", apic->vcpu->vcpu_id);
- apic->vcpu->kvm->arch.vapics_in_nmi_mode++;
- }
- } else if (nmi_wd_enabled)
- apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
-}
-
-static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
-{
- int ret = 0;
-
- trace_kvm_apic_write(reg, val);
-
- switch (reg) {
- case APIC_ID: /* Local APIC ID */
- if (!apic_x2apic_mode(apic))
- apic_set_reg(apic, APIC_ID, val);
- else
- ret = 1;
- break;
-
- case APIC_TASKPRI:
- report_tpr_access(apic, true);
- apic_set_tpr(apic, val & 0xff);
- break;
-
- case APIC_EOI:
- apic_set_eoi(apic);
- break;
-
- case APIC_LDR:
- if (!apic_x2apic_mode(apic))
- apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
- else
- ret = 1;
- break;
-
- case APIC_DFR:
- if (!apic_x2apic_mode(apic))
- apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
- else
- ret = 1;
- break;
-
- case APIC_SPIV: {
- u32 mask = 0x3ff;
- if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
- mask |= APIC_SPIV_DIRECTED_EOI;
- apic_set_reg(apic, APIC_SPIV, val & mask);
- if (!(val & APIC_SPIV_APIC_ENABLED)) {
- int i;
- u32 lvt_val;
-
- for (i = 0; i < APIC_LVT_NUM; i++) {
- lvt_val = apic_get_reg(apic,
- APIC_LVTT + 0x10 * i);
- apic_set_reg(apic, APIC_LVTT + 0x10 * i,
- lvt_val | APIC_LVT_MASKED);
- }
- atomic_set(&apic->lapic_timer.pending, 0);
-
- }
- break;
- }
- case APIC_ICR:
- /* No delay here, so we always clear the pending bit */
- apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
- apic_send_ipi(apic);
- break;
-
- case APIC_ICR2:
- if (!apic_x2apic_mode(apic))
- val &= 0xff000000;
- apic_set_reg(apic, APIC_ICR2, val);
- break;
-
- case APIC_LVT0:
- apic_manage_nmi_watchdog(apic, val);
- case APIC_LVTTHMR:
- case APIC_LVTPC:
- case APIC_LVT1:
- case APIC_LVTERR:
- /* TODO: Check vector */
- if (!apic_sw_enabled(apic))
- val |= APIC_LVT_MASKED;
-
- val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
- apic_set_reg(apic, reg, val);
-
- break;
-
- case APIC_LVTT:
- if ((apic_get_reg(apic, APIC_LVTT) &
- apic->lapic_timer.timer_mode_mask) !=
- (val & apic->lapic_timer.timer_mode_mask))
- hrtimer_cancel(&apic->lapic_timer.timer);
-
- if (!apic_sw_enabled(apic))
- val |= APIC_LVT_MASKED;
- val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
- apic_set_reg(apic, APIC_LVTT, val);
- break;
-
- case APIC_TMICT:
- if (apic_lvtt_tscdeadline(apic))
- break;
-
- hrtimer_cancel(&apic->lapic_timer.timer);
- apic_set_reg(apic, APIC_TMICT, val);
- start_apic_timer(apic);
- break;
-
- case APIC_TDCR:
- if (val & 4)
- apic_debug("KVM_WRITE:TDCR %x\n", val);
- apic_set_reg(apic, APIC_TDCR, val);
- update_divide_count(apic);
- break;
-
- case APIC_ESR:
- if (apic_x2apic_mode(apic) && val != 0) {
- apic_debug("KVM_WRITE:ESR not zero %x\n", val);
- ret = 1;
- }
- break;
-
- case APIC_SELF_IPI:
- if (apic_x2apic_mode(apic)) {
- apic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff));
- } else
- ret = 1;
- break;
- default:
- ret = 1;
- break;
- }
- if (ret)
- apic_debug("Local APIC Write to read-only register %x\n", reg);
- return ret;
-}
-
-static int apic_mmio_write(struct kvm_io_device *this,
- gpa_t address, int len, const void *data)
-{
- struct kvm_lapic *apic = to_lapic(this);
- unsigned int offset = address - apic->base_address;
- u32 val;
-
- if (!apic_mmio_in_range(apic, address))
- return -EOPNOTSUPP;
-
- /*
- * APIC register must be aligned on 128-bits boundary.
- * 32/64/128 bits registers must be accessed thru 32 bits.
- * Refer SDM 8.4.1
- */
- if (len != 4 || (offset & 0xf)) {
- /* Don't shout loud, $infamous_os would cause only noise. */
- apic_debug("apic write: bad size=%d %lx\n", len, (long)address);
- return 0;
- }
-
- val = *(u32*)data;
-
- /* too common printing */
- if (offset != APIC_EOI)
- apic_debug("%s: offset 0x%x with length 0x%x, and value is "
- "0x%x\n", __func__, offset, len, val);
-
- apic_reg_write(apic, offset & 0xff0, val);
-
- return 0;
-}
-
-void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- if (apic)
- apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
-
-void kvm_free_lapic(struct kvm_vcpu *vcpu)
-{
- if (!vcpu->arch.apic)
- return;
-
- hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
-
- if (vcpu->arch.apic->regs)
- free_page((unsigned long)vcpu->arch.apic->regs);
-
- kfree(vcpu->arch.apic);
-}
-
-/*
- *----------------------------------------------------------------------
- * LAPIC interface
- *----------------------------------------------------------------------
- */
-
-u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic)
- return 0;
-
- if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
- return 0;
-
- return apic->lapic_timer.tscdeadline;
-}
-
-void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic)
- return;
-
- if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
- return;
-
- hrtimer_cancel(&apic->lapic_timer.timer);
- apic->lapic_timer.tscdeadline = data;
- start_apic_timer(apic);
-}
-
-void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- if (!apic)
- return;
- apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
- | (apic_get_reg(apic, APIC_TASKPRI) & 4));
-}
-
-u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
- u64 tpr;
-
- if (!apic)
- return 0;
- tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
-
- return (tpr & 0xf0) >> 4;
-}
-
-void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- if (!apic) {
- value |= MSR_IA32_APICBASE_BSP;
- vcpu->arch.apic_base = value;
- return;
- }
-
- if (!kvm_vcpu_is_bsp(apic->vcpu))
- value &= ~MSR_IA32_APICBASE_BSP;
-
- vcpu->arch.apic_base = value;
- if (apic_x2apic_mode(apic)) {
- u32 id = kvm_apic_id(apic);
- u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf));
- apic_set_reg(apic, APIC_LDR, ldr);
- }
- apic->base_address = apic->vcpu->arch.apic_base &
- MSR_IA32_APICBASE_BASE;
-
- /* with FSB delivery interrupt, we can restart APIC functionality */
- apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
- "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
-
-}
-
-void kvm_lapic_reset(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic;
- int i;
-
- apic_debug("%s\n", __func__);
-
- ASSERT(vcpu);
- apic = vcpu->arch.apic;
- ASSERT(apic != NULL);
-
- /* Stop the timer in case it's a reset to an active apic */
- hrtimer_cancel(&apic->lapic_timer.timer);
-
- apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
- kvm_apic_set_version(apic->vcpu);
-
- for (i = 0; i < APIC_LVT_NUM; i++)
- apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
- apic_set_reg(apic, APIC_LVT0,
- SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
-
- apic_set_reg(apic, APIC_DFR, 0xffffffffU);
- apic_set_reg(apic, APIC_SPIV, 0xff);
- apic_set_reg(apic, APIC_TASKPRI, 0);
- apic_set_reg(apic, APIC_LDR, 0);
- apic_set_reg(apic, APIC_ESR, 0);
- apic_set_reg(apic, APIC_ICR, 0);
- apic_set_reg(apic, APIC_ICR2, 0);
- apic_set_reg(apic, APIC_TDCR, 0);
- apic_set_reg(apic, APIC_TMICT, 0);
- for (i = 0; i < 8; i++) {
- apic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
- apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
- apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
- }
- apic->irr_pending = false;
- update_divide_count(apic);
- atomic_set(&apic->lapic_timer.pending, 0);
- if (kvm_vcpu_is_bsp(vcpu))
- vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
- apic_update_ppr(apic);
-
- vcpu->arch.apic_arb_prio = 0;
-
- apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
- "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
- vcpu, kvm_apic_id(apic),
- vcpu->arch.apic_base, apic->base_address);
-}
-
-bool kvm_apic_present(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic);
-}
-
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
-{
- return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
-}
-
-/*
- *----------------------------------------------------------------------
- * timer interface
- *----------------------------------------------------------------------
- */
-
-static bool lapic_is_periodic(struct kvm_timer *ktimer)
-{
- struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
- lapic_timer);
- return apic_lvtt_period(apic);
-}
-
-int apic_has_pending_timer(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *lapic = vcpu->arch.apic;
-
- if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
- return atomic_read(&lapic->lapic_timer.pending);
-
- return 0;
-}
-
-int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
-{
- u32 reg = apic_get_reg(apic, lvt_type);
- int vector, mode, trig_mode;
-
- if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
- vector = reg & APIC_VECTOR_MASK;
- mode = reg & APIC_MODE_MASK;
- trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
- return __apic_accept_irq(apic, mode, vector, 1, trig_mode);
- }
- return 0;
-}
-
-void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- if (apic)
- kvm_apic_local_deliver(apic, APIC_LVT0);
-}
-
-static struct kvm_timer_ops lapic_timer_ops = {
- .is_periodic = lapic_is_periodic,
-};
-
-static const struct kvm_io_device_ops apic_mmio_ops = {
- .read = apic_mmio_read,
- .write = apic_mmio_write,
-};
-
-int kvm_create_lapic(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic;
-
- ASSERT(vcpu != NULL);
- apic_debug("apic_init %d\n", vcpu->vcpu_id);
-
- apic = kzalloc(sizeof(*apic), GFP_KERNEL);
- if (!apic)
- goto nomem;
-
- vcpu->arch.apic = apic;
-
- apic->regs = (void *)get_zeroed_page(GFP_KERNEL);
- if (!apic->regs) {
- printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
- vcpu->vcpu_id);
- goto nomem_free_apic;
- }
- apic->vcpu = vcpu;
-
- hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- apic->lapic_timer.timer.function = kvm_timer_fn;
- apic->lapic_timer.t_ops = &lapic_timer_ops;
- apic->lapic_timer.kvm = vcpu->kvm;
- apic->lapic_timer.vcpu = vcpu;
-
- apic->base_address = APIC_DEFAULT_PHYS_BASE;
- vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
-
- kvm_lapic_reset(vcpu);
- kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
-
- return 0;
-nomem_free_apic:
- kfree(apic);
-nomem:
- return -ENOMEM;
-}
-
-int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
- int highest_irr;
-
- if (!apic || !apic_enabled(apic))
- return -1;
-
- apic_update_ppr(apic);
- highest_irr = apic_find_highest_irr(apic);
- if ((highest_irr == -1) ||
- ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
- return -1;
- return highest_irr;
-}
-
-int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
-{
- u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
- int r = 0;
-
- if (!apic_hw_enabled(vcpu->arch.apic))
- r = 1;
- if ((lvt0 & APIC_LVT_MASKED) == 0 &&
- GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
- r = 1;
- return r;
-}
-
-void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- if (apic && atomic_read(&apic->lapic_timer.pending) > 0) {
- if (kvm_apic_local_deliver(apic, APIC_LVTT))
- atomic_dec(&apic->lapic_timer.pending);
- }
-}
-
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
-{
- int vector = kvm_apic_has_interrupt(vcpu);
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- if (vector == -1)
- return -1;
-
- apic_set_vector(vector, apic->regs + APIC_ISR);
- apic_update_ppr(apic);
- apic_clear_irr(vector, apic);
- return vector;
-}
-
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- apic->base_address = vcpu->arch.apic_base &
- MSR_IA32_APICBASE_BASE;
- kvm_apic_set_version(vcpu);
-
- apic_update_ppr(apic);
- hrtimer_cancel(&apic->lapic_timer.timer);
- update_divide_count(apic);
- start_apic_timer(apic);
- apic->irr_pending = true;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-}
-
-void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
- struct hrtimer *timer;
-
- if (!apic)
- return;
-
- timer = &apic->lapic_timer.timer;
- if (hrtimer_cancel(timer))
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
-}
-
-void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
-{
- u32 data;
- void *vapic;
-
- if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
- return;
-
- vapic = kmap_atomic(vcpu->arch.apic->vapic_page);
- data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
- kunmap_atomic(vapic);
-
- apic_set_tpr(vcpu->arch.apic, data & 0xff);
-}
-
-void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
-{
- u32 data, tpr;
- int max_irr, max_isr;
- struct kvm_lapic *apic;
- void *vapic;
-
- if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
- return;
-
- apic = vcpu->arch.apic;
- tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
- max_irr = apic_find_highest_irr(apic);
- if (max_irr < 0)
- max_irr = 0;
- max_isr = apic_find_highest_isr(apic);
- if (max_isr < 0)
- max_isr = 0;
- data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
-
- vapic = kmap_atomic(vcpu->arch.apic->vapic_page);
- *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
- kunmap_atomic(vapic);
-}
-
-void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
-{
- if (!irqchip_in_kernel(vcpu->kvm))
- return;
-
- vcpu->arch.apic->vapic_addr = vapic_addr;
-}
-
-int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
- u32 reg = (msr - APIC_BASE_MSR) << 4;
-
- if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
- return 1;
-
- /* if this is ICR write vector before command */
- if (msr == 0x830)
- apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
- return apic_reg_write(apic, reg, (u32)data);
-}
-
-int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
- u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
-
- if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
- return 1;
-
- if (apic_reg_read(apic, reg, 4, &low))
- return 1;
- if (msr == 0x830)
- apic_reg_read(apic, APIC_ICR2, 4, &high);
-
- *data = (((u64)high) << 32) | low;
-
- return 0;
-}
-
-int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- if (!irqchip_in_kernel(vcpu->kvm))
- return 1;
-
- /* if this is ICR write vector before command */
- if (reg == APIC_ICR)
- apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
- return apic_reg_write(apic, reg, (u32)data);
-}
-
-int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
- u32 low, high = 0;
-
- if (!irqchip_in_kernel(vcpu->kvm))
- return 1;
-
- if (apic_reg_read(apic, reg, 4, &low))
- return 1;
- if (reg == APIC_ICR)
- apic_reg_read(apic, APIC_ICR2, 4, &high);
-
- *data = (((u64)high) << 32) | low;
-
- return 0;
-}
diff --git a/ANDROID_3.4.5/arch/x86/kvm/lapic.h b/ANDROID_3.4.5/arch/x86/kvm/lapic.h
deleted file mode 100644
index 6f4ce257..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/lapic.h
+++ /dev/null
@@ -1,63 +0,0 @@
-#ifndef __KVM_X86_LAPIC_H
-#define __KVM_X86_LAPIC_H
-
-#include "iodev.h"
-#include "kvm_timer.h"
-
-#include <linux/kvm_host.h>
-
-struct kvm_lapic {
- unsigned long base_address;
- struct kvm_io_device dev;
- struct kvm_timer lapic_timer;
- u32 divide_count;
- struct kvm_vcpu *vcpu;
- bool irr_pending;
- void *regs;
- gpa_t vapic_addr;
- struct page *vapic_page;
-};
-int kvm_create_lapic(struct kvm_vcpu *vcpu);
-void kvm_free_lapic(struct kvm_vcpu *vcpu);
-
-int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
-int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
-void kvm_lapic_reset(struct kvm_vcpu *vcpu);
-u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
-void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
-void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
-void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
-u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
-void kvm_apic_set_version(struct kvm_vcpu *vcpu);
-
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
-int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
-
-u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
-bool kvm_apic_present(struct kvm_vcpu *vcpu);
-int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
-
-u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
-void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
-
-void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
-void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
-void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
-
-int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
-int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
-
-int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
-int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
-
-static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
-}
-#endif
diff --git a/ANDROID_3.4.5/arch/x86/kvm/mmu.c b/ANDROID_3.4.5/arch/x86/kvm/mmu.c
deleted file mode 100644
index 4cb16426..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/mmu.c
+++ /dev/null
@@ -1,4027 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * MMU support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "irq.h"
-#include "mmu.h"
-#include "x86.h"
-#include "kvm_cache_regs.h"
-
-#include <linux/kvm_host.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-#include <linux/swap.h>
-#include <linux/hugetlb.h>
-#include <linux/compiler.h>
-#include <linux/srcu.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-
-#include <asm/page.h>
-#include <asm/cmpxchg.h>
-#include <asm/io.h>
-#include <asm/vmx.h>
-
-/*
- * When setting this variable to true it enables Two-Dimensional-Paging
- * where the hardware walks 2 page tables:
- * 1. the guest-virtual to guest-physical
- * 2. while doing 1. it walks guest-physical to host-physical
- * If the hardware supports that we don't need to do shadow paging.
- */
-bool tdp_enabled = false;
-
-enum {
- AUDIT_PRE_PAGE_FAULT,
- AUDIT_POST_PAGE_FAULT,
- AUDIT_PRE_PTE_WRITE,
- AUDIT_POST_PTE_WRITE,
- AUDIT_PRE_SYNC,
- AUDIT_POST_SYNC
-};
-
-#undef MMU_DEBUG
-
-#ifdef MMU_DEBUG
-
-#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
-#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
-
-#else
-
-#define pgprintk(x...) do { } while (0)
-#define rmap_printk(x...) do { } while (0)
-
-#endif
-
-#ifdef MMU_DEBUG
-static bool dbg = 0;
-module_param(dbg, bool, 0644);
-#endif
-
-#ifndef MMU_DEBUG
-#define ASSERT(x) do { } while (0)
-#else
-#define ASSERT(x) \
- if (!(x)) { \
- printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
- __FILE__, __LINE__, #x); \
- }
-#endif
-
-#define PTE_PREFETCH_NUM 8
-
-#define PT_FIRST_AVAIL_BITS_SHIFT 9
-#define PT64_SECOND_AVAIL_BITS_SHIFT 52
-
-#define PT64_LEVEL_BITS 9
-
-#define PT64_LEVEL_SHIFT(level) \
- (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
-
-#define PT64_INDEX(address, level)\
- (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
-
-
-#define PT32_LEVEL_BITS 10
-
-#define PT32_LEVEL_SHIFT(level) \
- (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
-
-#define PT32_LVL_OFFSET_MASK(level) \
- (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
- * PT32_LEVEL_BITS))) - 1))
-
-#define PT32_INDEX(address, level)\
- (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
-
-
-#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
-#define PT64_DIR_BASE_ADDR_MASK \
- (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
-#define PT64_LVL_ADDR_MASK(level) \
- (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
- * PT64_LEVEL_BITS))) - 1))
-#define PT64_LVL_OFFSET_MASK(level) \
- (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
- * PT64_LEVEL_BITS))) - 1))
-
-#define PT32_BASE_ADDR_MASK PAGE_MASK
-#define PT32_DIR_BASE_ADDR_MASK \
- (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
-#define PT32_LVL_ADDR_MASK(level) \
- (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
- * PT32_LEVEL_BITS))) - 1))
-
-#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
- | PT64_NX_MASK)
-
-#define PTE_LIST_EXT 4
-
-#define ACC_EXEC_MASK 1
-#define ACC_WRITE_MASK PT_WRITABLE_MASK
-#define ACC_USER_MASK PT_USER_MASK
-#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
-
-#include <trace/events/kvm.h>
-
-#define CREATE_TRACE_POINTS
-#include "mmutrace.h"
-
-#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-
-#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
-
-struct pte_list_desc {
- u64 *sptes[PTE_LIST_EXT];
- struct pte_list_desc *more;
-};
-
-struct kvm_shadow_walk_iterator {
- u64 addr;
- hpa_t shadow_addr;
- u64 *sptep;
- int level;
- unsigned index;
-};
-
-#define for_each_shadow_entry(_vcpu, _addr, _walker) \
- for (shadow_walk_init(&(_walker), _vcpu, _addr); \
- shadow_walk_okay(&(_walker)); \
- shadow_walk_next(&(_walker)))
-
-#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
- for (shadow_walk_init(&(_walker), _vcpu, _addr); \
- shadow_walk_okay(&(_walker)) && \
- ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
- __shadow_walk_next(&(_walker), spte))
-
-static struct kmem_cache *pte_list_desc_cache;
-static struct kmem_cache *mmu_page_header_cache;
-static struct percpu_counter kvm_total_used_mmu_pages;
-
-static u64 __read_mostly shadow_nx_mask;
-static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
-static u64 __read_mostly shadow_user_mask;
-static u64 __read_mostly shadow_accessed_mask;
-static u64 __read_mostly shadow_dirty_mask;
-static u64 __read_mostly shadow_mmio_mask;
-
-static void mmu_spte_set(u64 *sptep, u64 spte);
-
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
-{
- shadow_mmio_mask = mmio_mask;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
-
-static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
-{
- access &= ACC_WRITE_MASK | ACC_USER_MASK;
-
- trace_mark_mmio_spte(sptep, gfn, access);
- mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
-}
-
-static bool is_mmio_spte(u64 spte)
-{
- return (spte & shadow_mmio_mask) == shadow_mmio_mask;
-}
-
-static gfn_t get_mmio_spte_gfn(u64 spte)
-{
- return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
-}
-
-static unsigned get_mmio_spte_access(u64 spte)
-{
- return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
-}
-
-static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
-{
- if (unlikely(is_noslot_pfn(pfn))) {
- mark_mmio_spte(sptep, gfn, access);
- return true;
- }
-
- return false;
-}
-
-static inline u64 rsvd_bits(int s, int e)
-{
- return ((1ULL << (e - s + 1)) - 1) << s;
-}
-
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask)
-{
- shadow_user_mask = user_mask;
- shadow_accessed_mask = accessed_mask;
- shadow_dirty_mask = dirty_mask;
- shadow_nx_mask = nx_mask;
- shadow_x_mask = x_mask;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
-
-static int is_cpuid_PSE36(void)
-{
- return 1;
-}
-
-static int is_nx(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.efer & EFER_NX;
-}
-
-static int is_shadow_present_pte(u64 pte)
-{
- return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
-}
-
-static int is_large_pte(u64 pte)
-{
- return pte & PT_PAGE_SIZE_MASK;
-}
-
-static int is_dirty_gpte(unsigned long pte)
-{
- return pte & PT_DIRTY_MASK;
-}
-
-static int is_rmap_spte(u64 pte)
-{
- return is_shadow_present_pte(pte);
-}
-
-static int is_last_spte(u64 pte, int level)
-{
- if (level == PT_PAGE_TABLE_LEVEL)
- return 1;
- if (is_large_pte(pte))
- return 1;
- return 0;
-}
-
-static pfn_t spte_to_pfn(u64 pte)
-{
- return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
-}
-
-static gfn_t pse36_gfn_delta(u32 gpte)
-{
- int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
-
- return (gpte & PT32_DIR_PSE36_MASK) << shift;
-}
-
-#ifdef CONFIG_X86_64
-static void __set_spte(u64 *sptep, u64 spte)
-{
- *sptep = spte;
-}
-
-static void __update_clear_spte_fast(u64 *sptep, u64 spte)
-{
- *sptep = spte;
-}
-
-static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
-{
- return xchg(sptep, spte);
-}
-
-static u64 __get_spte_lockless(u64 *sptep)
-{
- return ACCESS_ONCE(*sptep);
-}
-
-static bool __check_direct_spte_mmio_pf(u64 spte)
-{
- /* It is valid if the spte is zapped. */
- return spte == 0ull;
-}
-#else
-union split_spte {
- struct {
- u32 spte_low;
- u32 spte_high;
- };
- u64 spte;
-};
-
-static void count_spte_clear(u64 *sptep, u64 spte)
-{
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
-
- if (is_shadow_present_pte(spte))
- return;
-
- /* Ensure the spte is completely set before we increase the count */
- smp_wmb();
- sp->clear_spte_count++;
-}
-
-static void __set_spte(u64 *sptep, u64 spte)
-{
- union split_spte *ssptep, sspte;
-
- ssptep = (union split_spte *)sptep;
- sspte = (union split_spte)spte;
-
- ssptep->spte_high = sspte.spte_high;
-
- /*
- * If we map the spte from nonpresent to present, We should store
- * the high bits firstly, then set present bit, so cpu can not
- * fetch this spte while we are setting the spte.
- */
- smp_wmb();
-
- ssptep->spte_low = sspte.spte_low;
-}
-
-static void __update_clear_spte_fast(u64 *sptep, u64 spte)
-{
- union split_spte *ssptep, sspte;
-
- ssptep = (union split_spte *)sptep;
- sspte = (union split_spte)spte;
-
- ssptep->spte_low = sspte.spte_low;
-
- /*
- * If we map the spte from present to nonpresent, we should clear
- * present bit firstly to avoid vcpu fetch the old high bits.
- */
- smp_wmb();
-
- ssptep->spte_high = sspte.spte_high;
- count_spte_clear(sptep, spte);
-}
-
-static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
-{
- union split_spte *ssptep, sspte, orig;
-
- ssptep = (union split_spte *)sptep;
- sspte = (union split_spte)spte;
-
- /* xchg acts as a barrier before the setting of the high bits */
- orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
- orig.spte_high = ssptep->spte_high;
- ssptep->spte_high = sspte.spte_high;
- count_spte_clear(sptep, spte);
-
- return orig.spte;
-}
-
-/*
- * The idea using the light way get the spte on x86_32 guest is from
- * gup_get_pte(arch/x86/mm/gup.c).
- * The difference is we can not catch the spte tlb flush if we leave
- * guest mode, so we emulate it by increase clear_spte_count when spte
- * is cleared.
- */
-static u64 __get_spte_lockless(u64 *sptep)
-{
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
- union split_spte spte, *orig = (union split_spte *)sptep;
- int count;
-
-retry:
- count = sp->clear_spte_count;
- smp_rmb();
-
- spte.spte_low = orig->spte_low;
- smp_rmb();
-
- spte.spte_high = orig->spte_high;
- smp_rmb();
-
- if (unlikely(spte.spte_low != orig->spte_low ||
- count != sp->clear_spte_count))
- goto retry;
-
- return spte.spte;
-}
-
-static bool __check_direct_spte_mmio_pf(u64 spte)
-{
- union split_spte sspte = (union split_spte)spte;
- u32 high_mmio_mask = shadow_mmio_mask >> 32;
-
- /* It is valid if the spte is zapped. */
- if (spte == 0ull)
- return true;
-
- /* It is valid if the spte is being zapped. */
- if (sspte.spte_low == 0ull &&
- (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
- return true;
-
- return false;
-}
-#endif
-
-static bool spte_has_volatile_bits(u64 spte)
-{
- if (!shadow_accessed_mask)
- return false;
-
- if (!is_shadow_present_pte(spte))
- return false;
-
- if ((spte & shadow_accessed_mask) &&
- (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
- return false;
-
- return true;
-}
-
-static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
-{
- return (old_spte & bit_mask) && !(new_spte & bit_mask);
-}
-
-/* Rules for using mmu_spte_set:
- * Set the sptep from nonpresent to present.
- * Note: the sptep being assigned *must* be either not present
- * or in a state where the hardware will not attempt to update
- * the spte.
- */
-static void mmu_spte_set(u64 *sptep, u64 new_spte)
-{
- WARN_ON(is_shadow_present_pte(*sptep));
- __set_spte(sptep, new_spte);
-}
-
-/* Rules for using mmu_spte_update:
- * Update the state bits, it means the mapped pfn is not changged.
- */
-static void mmu_spte_update(u64 *sptep, u64 new_spte)
-{
- u64 mask, old_spte = *sptep;
-
- WARN_ON(!is_rmap_spte(new_spte));
-
- if (!is_shadow_present_pte(old_spte))
- return mmu_spte_set(sptep, new_spte);
-
- new_spte |= old_spte & shadow_dirty_mask;
-
- mask = shadow_accessed_mask;
- if (is_writable_pte(old_spte))
- mask |= shadow_dirty_mask;
-
- if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
- __update_clear_spte_fast(sptep, new_spte);
- else
- old_spte = __update_clear_spte_slow(sptep, new_spte);
-
- if (!shadow_accessed_mask)
- return;
-
- if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
- kvm_set_pfn_accessed(spte_to_pfn(old_spte));
- if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
- kvm_set_pfn_dirty(spte_to_pfn(old_spte));
-}
-
-/*
- * Rules for using mmu_spte_clear_track_bits:
- * It sets the sptep from present to nonpresent, and track the
- * state bits, it is used to clear the last level sptep.
- */
-static int mmu_spte_clear_track_bits(u64 *sptep)
-{
- pfn_t pfn;
- u64 old_spte = *sptep;
-
- if (!spte_has_volatile_bits(old_spte))
- __update_clear_spte_fast(sptep, 0ull);
- else
- old_spte = __update_clear_spte_slow(sptep, 0ull);
-
- if (!is_rmap_spte(old_spte))
- return 0;
-
- pfn = spte_to_pfn(old_spte);
- if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
- kvm_set_pfn_accessed(pfn);
- if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
- kvm_set_pfn_dirty(pfn);
- return 1;
-}
-
-/*
- * Rules for using mmu_spte_clear_no_track:
- * Directly clear spte without caring the state bits of sptep,
- * it is used to set the upper level spte.
- */
-static void mmu_spte_clear_no_track(u64 *sptep)
-{
- __update_clear_spte_fast(sptep, 0ull);
-}
-
-static u64 mmu_spte_get_lockless(u64 *sptep)
-{
- return __get_spte_lockless(sptep);
-}
-
-static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
-{
- rcu_read_lock();
- atomic_inc(&vcpu->kvm->arch.reader_counter);
-
- /* Increase the counter before walking shadow page table */
- smp_mb__after_atomic_inc();
-}
-
-static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
-{
- /* Decrease the counter after walking shadow page table finished */
- smp_mb__before_atomic_dec();
- atomic_dec(&vcpu->kvm->arch.reader_counter);
- rcu_read_unlock();
-}
-
-static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
- struct kmem_cache *base_cache, int min)
-{
- void *obj;
-
- if (cache->nobjs >= min)
- return 0;
- while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
- if (!obj)
- return -ENOMEM;
- cache->objects[cache->nobjs++] = obj;
- }
- return 0;
-}
-
-static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
-{
- return cache->nobjs;
-}
-
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
- struct kmem_cache *cache)
-{
- while (mc->nobjs)
- kmem_cache_free(cache, mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
- int min)
-{
- void *page;
-
- if (cache->nobjs >= min)
- return 0;
- while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- page = (void *)__get_free_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
- cache->objects[cache->nobjs++] = page;
- }
- return 0;
-}
-
-static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
-{
- while (mc->nobjs)
- free_page((unsigned long)mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
-{
- int r;
-
- r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
- pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
- if (r)
- goto out;
- r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
- if (r)
- goto out;
- r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
- mmu_page_header_cache, 4);
-out:
- return r;
-}
-
-static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
- mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
- pte_list_desc_cache);
- mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
- mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
- mmu_page_header_cache);
-}
-
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
- size_t size)
-{
- void *p;
-
- BUG_ON(!mc->nobjs);
- p = mc->objects[--mc->nobjs];
- return p;
-}
-
-static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
-{
- return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache,
- sizeof(struct pte_list_desc));
-}
-
-static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
-{
- kmem_cache_free(pte_list_desc_cache, pte_list_desc);
-}
-
-static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
-{
- if (!sp->role.direct)
- return sp->gfns[index];
-
- return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
-}
-
-static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
-{
- if (sp->role.direct)
- BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
- else
- sp->gfns[index] = gfn;
-}
-
-/*
- * Return the pointer to the large page information for a given gfn,
- * handling slots that are not large page aligned.
- */
-static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
- struct kvm_memory_slot *slot,
- int level)
-{
- unsigned long idx;
-
- idx = gfn_to_index(gfn, slot->base_gfn, level);
- return &slot->arch.lpage_info[level - 2][idx];
-}
-
-static void account_shadowed(struct kvm *kvm, gfn_t gfn)
-{
- struct kvm_memory_slot *slot;
- struct kvm_lpage_info *linfo;
- int i;
-
- slot = gfn_to_memslot(kvm, gfn);
- for (i = PT_DIRECTORY_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- linfo = lpage_info_slot(gfn, slot, i);
- linfo->write_count += 1;
- }
- kvm->arch.indirect_shadow_pages++;
-}
-
-static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
-{
- struct kvm_memory_slot *slot;
- struct kvm_lpage_info *linfo;
- int i;
-
- slot = gfn_to_memslot(kvm, gfn);
- for (i = PT_DIRECTORY_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- linfo = lpage_info_slot(gfn, slot, i);
- linfo->write_count -= 1;
- WARN_ON(linfo->write_count < 0);
- }
- kvm->arch.indirect_shadow_pages--;
-}
-
-static int has_wrprotected_page(struct kvm *kvm,
- gfn_t gfn,
- int level)
-{
- struct kvm_memory_slot *slot;
- struct kvm_lpage_info *linfo;
-
- slot = gfn_to_memslot(kvm, gfn);
- if (slot) {
- linfo = lpage_info_slot(gfn, slot, level);
- return linfo->write_count;
- }
-
- return 1;
-}
-
-static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
-{
- unsigned long page_size;
- int i, ret = 0;
-
- page_size = kvm_host_page_size(kvm, gfn);
-
- for (i = PT_PAGE_TABLE_LEVEL;
- i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
- if (page_size >= KVM_HPAGE_SIZE(i))
- ret = i;
- else
- break;
- }
-
- return ret;
-}
-
-static struct kvm_memory_slot *
-gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool no_dirty_log)
-{
- struct kvm_memory_slot *slot;
-
- slot = gfn_to_memslot(vcpu->kvm, gfn);
- if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
- (no_dirty_log && slot->dirty_bitmap))
- slot = NULL;
-
- return slot;
-}
-
-static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
-{
- return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
-}
-
-static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
-{
- int host_level, level, max_level;
-
- host_level = host_mapping_level(vcpu->kvm, large_gfn);
-
- if (host_level == PT_PAGE_TABLE_LEVEL)
- return host_level;
-
- max_level = kvm_x86_ops->get_lpage_level() < host_level ?
- kvm_x86_ops->get_lpage_level() : host_level;
-
- for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
- if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
- break;
-
- return level - 1;
-}
-
-/*
- * Pte mapping structures:
- *
- * If pte_list bit zero is zero, then pte_list point to the spte.
- *
- * If pte_list bit zero is one, (then pte_list & ~1) points to a struct
- * pte_list_desc containing more mappings.
- *
- * Returns the number of pte entries before the spte was added or zero if
- * the spte was not added.
- *
- */
-static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
- unsigned long *pte_list)
-{
- struct pte_list_desc *desc;
- int i, count = 0;
-
- if (!*pte_list) {
- rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
- *pte_list = (unsigned long)spte;
- } else if (!(*pte_list & 1)) {
- rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
- desc = mmu_alloc_pte_list_desc(vcpu);
- desc->sptes[0] = (u64 *)*pte_list;
- desc->sptes[1] = spte;
- *pte_list = (unsigned long)desc | 1;
- ++count;
- } else {
- rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
- desc = (struct pte_list_desc *)(*pte_list & ~1ul);
- while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
- desc = desc->more;
- count += PTE_LIST_EXT;
- }
- if (desc->sptes[PTE_LIST_EXT-1]) {
- desc->more = mmu_alloc_pte_list_desc(vcpu);
- desc = desc->more;
- }
- for (i = 0; desc->sptes[i]; ++i)
- ++count;
- desc->sptes[i] = spte;
- }
- return count;
-}
-
-static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
-{
- struct pte_list_desc *desc;
- u64 *prev_spte;
- int i;
-
- if (!*pte_list)
- return NULL;
- else if (!(*pte_list & 1)) {
- if (!spte)
- return (u64 *)*pte_list;
- return NULL;
- }
- desc = (struct pte_list_desc *)(*pte_list & ~1ul);
- prev_spte = NULL;
- while (desc) {
- for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
- if (prev_spte == spte)
- return desc->sptes[i];
- prev_spte = desc->sptes[i];
- }
- desc = desc->more;
- }
- return NULL;
-}
-
-static void
-pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
- int i, struct pte_list_desc *prev_desc)
-{
- int j;
-
- for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
- ;
- desc->sptes[i] = desc->sptes[j];
- desc->sptes[j] = NULL;
- if (j != 0)
- return;
- if (!prev_desc && !desc->more)
- *pte_list = (unsigned long)desc->sptes[0];
- else
- if (prev_desc)
- prev_desc->more = desc->more;
- else
- *pte_list = (unsigned long)desc->more | 1;
- mmu_free_pte_list_desc(desc);
-}
-
-static void pte_list_remove(u64 *spte, unsigned long *pte_list)
-{
- struct pte_list_desc *desc;
- struct pte_list_desc *prev_desc;
- int i;
-
- if (!*pte_list) {
- printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
- BUG();
- } else if (!(*pte_list & 1)) {
- rmap_printk("pte_list_remove: %p 1->0\n", spte);
- if ((u64 *)*pte_list != spte) {
- printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
- BUG();
- }
- *pte_list = 0;
- } else {
- rmap_printk("pte_list_remove: %p many->many\n", spte);
- desc = (struct pte_list_desc *)(*pte_list & ~1ul);
- prev_desc = NULL;
- while (desc) {
- for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
- if (desc->sptes[i] == spte) {
- pte_list_desc_remove_entry(pte_list,
- desc, i,
- prev_desc);
- return;
- }
- prev_desc = desc;
- desc = desc->more;
- }
- pr_err("pte_list_remove: %p many->many\n", spte);
- BUG();
- }
-}
-
-typedef void (*pte_list_walk_fn) (u64 *spte);
-static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
-{
- struct pte_list_desc *desc;
- int i;
-
- if (!*pte_list)
- return;
-
- if (!(*pte_list & 1))
- return fn((u64 *)*pte_list);
-
- desc = (struct pte_list_desc *)(*pte_list & ~1ul);
- while (desc) {
- for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
- fn(desc->sptes[i]);
- desc = desc->more;
- }
-}
-
-static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
- struct kvm_memory_slot *slot)
-{
- struct kvm_lpage_info *linfo;
-
- if (likely(level == PT_PAGE_TABLE_LEVEL))
- return &slot->rmap[gfn - slot->base_gfn];
-
- linfo = lpage_info_slot(gfn, slot, level);
- return &linfo->rmap_pde;
-}
-
-/*
- * Take gfn and return the reverse mapping to it.
- */
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
-{
- struct kvm_memory_slot *slot;
-
- slot = gfn_to_memslot(kvm, gfn);
- return __gfn_to_rmap(gfn, level, slot);
-}
-
-static bool rmap_can_add(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_memory_cache *cache;
-
- cache = &vcpu->arch.mmu_pte_list_desc_cache;
- return mmu_memory_cache_free_objects(cache);
-}
-
-static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
-{
- struct kvm_mmu_page *sp;
- unsigned long *rmapp;
-
- sp = page_header(__pa(spte));
- kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
- return pte_list_add(vcpu, spte, rmapp);
-}
-
-static u64 *rmap_next(unsigned long *rmapp, u64 *spte)
-{
- return pte_list_next(rmapp, spte);
-}
-
-static void rmap_remove(struct kvm *kvm, u64 *spte)
-{
- struct kvm_mmu_page *sp;
- gfn_t gfn;
- unsigned long *rmapp;
-
- sp = page_header(__pa(spte));
- gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
- rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
- pte_list_remove(spte, rmapp);
-}
-
-static void drop_spte(struct kvm *kvm, u64 *sptep)
-{
- if (mmu_spte_clear_track_bits(sptep))
- rmap_remove(kvm, sptep);
-}
-
-int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
- struct kvm_memory_slot *slot)
-{
- unsigned long *rmapp;
- u64 *spte;
- int i, write_protected = 0;
-
- rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot);
- spte = rmap_next(rmapp, NULL);
- while (spte) {
- BUG_ON(!(*spte & PT_PRESENT_MASK));
- rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
- if (is_writable_pte(*spte)) {
- mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
- write_protected = 1;
- }
- spte = rmap_next(rmapp, spte);
- }
-
- /* check for huge page mappings */
- for (i = PT_DIRECTORY_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- rmapp = __gfn_to_rmap(gfn, i, slot);
- spte = rmap_next(rmapp, NULL);
- while (spte) {
- BUG_ON(!(*spte & PT_PRESENT_MASK));
- BUG_ON(!is_large_pte(*spte));
- pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
- if (is_writable_pte(*spte)) {
- drop_spte(kvm, spte);
- --kvm->stat.lpages;
- spte = NULL;
- write_protected = 1;
- }
- spte = rmap_next(rmapp, spte);
- }
- }
-
- return write_protected;
-}
-
-static int rmap_write_protect(struct kvm *kvm, u64 gfn)
-{
- struct kvm_memory_slot *slot;
-
- slot = gfn_to_memslot(kvm, gfn);
- return kvm_mmu_rmap_write_protect(kvm, gfn, slot);
-}
-
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
-{
- u64 *spte;
- int need_tlb_flush = 0;
-
- while ((spte = rmap_next(rmapp, NULL))) {
- BUG_ON(!(*spte & PT_PRESENT_MASK));
- rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
- drop_spte(kvm, spte);
- need_tlb_flush = 1;
- }
- return need_tlb_flush;
-}
-
-static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
-{
- int need_flush = 0;
- u64 *spte, new_spte;
- pte_t *ptep = (pte_t *)data;
- pfn_t new_pfn;
-
- WARN_ON(pte_huge(*ptep));
- new_pfn = pte_pfn(*ptep);
- spte = rmap_next(rmapp, NULL);
- while (spte) {
- BUG_ON(!is_shadow_present_pte(*spte));
- rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
- need_flush = 1;
- if (pte_write(*ptep)) {
- drop_spte(kvm, spte);
- spte = rmap_next(rmapp, NULL);
- } else {
- new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
- new_spte |= (u64)new_pfn << PAGE_SHIFT;
-
- new_spte &= ~PT_WRITABLE_MASK;
- new_spte &= ~SPTE_HOST_WRITEABLE;
- new_spte &= ~shadow_accessed_mask;
- mmu_spte_clear_track_bits(spte);
- mmu_spte_set(spte, new_spte);
- spte = rmap_next(rmapp, spte);
- }
- }
- if (need_flush)
- kvm_flush_remote_tlbs(kvm);
-
- return 0;
-}
-
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- unsigned long data,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data))
-{
- int j;
- int ret;
- int retval = 0;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
-
- slots = kvm_memslots(kvm);
-
- kvm_for_each_memslot(memslot, slots) {
- unsigned long start = memslot->userspace_addr;
- unsigned long end;
-
- end = start + (memslot->npages << PAGE_SHIFT);
- if (hva >= start && hva < end) {
- gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
- gfn_t gfn = memslot->base_gfn + gfn_offset;
-
- ret = handler(kvm, &memslot->rmap[gfn_offset], data);
-
- for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
- struct kvm_lpage_info *linfo;
-
- linfo = lpage_info_slot(gfn, memslot,
- PT_DIRECTORY_LEVEL + j);
- ret |= handler(kvm, &linfo->rmap_pde, data);
- }
- trace_kvm_age_page(hva, memslot, ret);
- retval |= ret;
- }
- }
-
- return retval;
-}
-
-int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
-{
- return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
-}
-
-void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
-{
- kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
-}
-
-static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
-{
- u64 *spte;
- int young = 0;
-
- /*
- * Emulate the accessed bit for EPT, by checking if this page has
- * an EPT mapping, and clearing it if it does. On the next access,
- * a new EPT mapping will be established.
- * This has some overhead, but not as much as the cost of swapping
- * out actively used pages or breaking up actively used hugepages.
- */
- if (!shadow_accessed_mask)
- return kvm_unmap_rmapp(kvm, rmapp, data);
-
- spte = rmap_next(rmapp, NULL);
- while (spte) {
- int _young;
- u64 _spte = *spte;
- BUG_ON(!(_spte & PT_PRESENT_MASK));
- _young = _spte & PT_ACCESSED_MASK;
- if (_young) {
- young = 1;
- clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
- }
- spte = rmap_next(rmapp, spte);
- }
- return young;
-}
-
-static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
-{
- u64 *spte;
- int young = 0;
-
- /*
- * If there's no access bit in the secondary pte set by the
- * hardware it's up to gup-fast/gup to set the access bit in
- * the primary pte or in the page structure.
- */
- if (!shadow_accessed_mask)
- goto out;
-
- spte = rmap_next(rmapp, NULL);
- while (spte) {
- u64 _spte = *spte;
- BUG_ON(!(_spte & PT_PRESENT_MASK));
- young = _spte & PT_ACCESSED_MASK;
- if (young) {
- young = 1;
- break;
- }
- spte = rmap_next(rmapp, spte);
- }
-out:
- return young;
-}
-
-#define RMAP_RECYCLE_THRESHOLD 1000
-
-static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
-{
- unsigned long *rmapp;
- struct kvm_mmu_page *sp;
-
- sp = page_header(__pa(spte));
-
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
-
- kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
- kvm_flush_remote_tlbs(vcpu->kvm);
-}
-
-int kvm_age_hva(struct kvm *kvm, unsigned long hva)
-{
- return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
-}
-
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
- return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
-}
-
-#ifdef MMU_DEBUG
-static int is_empty_shadow_page(u64 *spt)
-{
- u64 *pos;
- u64 *end;
-
- for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
- if (is_shadow_present_pte(*pos)) {
- printk(KERN_ERR "%s: %p %llx\n", __func__,
- pos, *pos);
- return 0;
- }
- return 1;
-}
-#endif
-
-/*
- * This value is the sum of all of the kvm instances's
- * kvm->arch.n_used_mmu_pages values. We need a global,
- * aggregate version in order to make the slab shrinker
- * faster
- */
-static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
-{
- kvm->arch.n_used_mmu_pages += nr;
- percpu_counter_add(&kvm_total_used_mmu_pages, nr);
-}
-
-/*
- * Remove the sp from shadow page cache, after call it,
- * we can not find this sp from the cache, and the shadow
- * page table is still valid.
- * It should be under the protection of mmu lock.
- */
-static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
-{
- ASSERT(is_empty_shadow_page(sp->spt));
- hlist_del(&sp->hash_link);
- if (!sp->role.direct)
- free_page((unsigned long)sp->gfns);
-}
-
-/*
- * Free the shadow page table and the sp, we can do it
- * out of the protection of mmu lock.
- */
-static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
-{
- list_del(&sp->link);
- free_page((unsigned long)sp->spt);
- kmem_cache_free(mmu_page_header_cache, sp);
-}
-
-static unsigned kvm_page_table_hashfn(gfn_t gfn)
-{
- return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
-}
-
-static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *parent_pte)
-{
- if (!parent_pte)
- return;
-
- pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
-}
-
-static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
- u64 *parent_pte)
-{
- pte_list_remove(parent_pte, &sp->parent_ptes);
-}
-
-static void drop_parent_pte(struct kvm_mmu_page *sp,
- u64 *parent_pte)
-{
- mmu_page_remove_parent_pte(sp, parent_pte);
- mmu_spte_clear_no_track(parent_pte);
-}
-
-static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
- u64 *parent_pte, int direct)
-{
- struct kvm_mmu_page *sp;
- sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache,
- sizeof *sp);
- sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
- if (!direct)
- sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
- PAGE_SIZE);
- set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
- list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
- bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
- sp->parent_ptes = 0;
- mmu_page_add_parent_pte(vcpu, sp, parent_pte);
- kvm_mod_used_mmu_pages(vcpu->kvm, +1);
- return sp;
-}
-
-static void mark_unsync(u64 *spte);
-static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
-{
- pte_list_walk(&sp->parent_ptes, mark_unsync);
-}
-
-static void mark_unsync(u64 *spte)
-{
- struct kvm_mmu_page *sp;
- unsigned int index;
-
- sp = page_header(__pa(spte));
- index = spte - sp->spt;
- if (__test_and_set_bit(index, sp->unsync_child_bitmap))
- return;
- if (sp->unsync_children++)
- return;
- kvm_mmu_mark_parents_unsync(sp);
-}
-
-static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
-{
- return 1;
-}
-
-static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
-{
-}
-
-static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- const void *pte)
-{
- WARN_ON(1);
-}
-
-#define KVM_PAGE_ARRAY_NR 16
-
-struct kvm_mmu_pages {
- struct mmu_page_and_offset {
- struct kvm_mmu_page *sp;
- unsigned int idx;
- } page[KVM_PAGE_ARRAY_NR];
- unsigned int nr;
-};
-
-static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
- int idx)
-{
- int i;
-
- if (sp->unsync)
- for (i=0; i < pvec->nr; i++)
- if (pvec->page[i].sp == sp)
- return 0;
-
- pvec->page[pvec->nr].sp = sp;
- pvec->page[pvec->nr].idx = idx;
- pvec->nr++;
- return (pvec->nr == KVM_PAGE_ARRAY_NR);
-}
-
-static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
- struct kvm_mmu_pages *pvec)
-{
- int i, ret, nr_unsync_leaf = 0;
-
- for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
- struct kvm_mmu_page *child;
- u64 ent = sp->spt[i];
-
- if (!is_shadow_present_pte(ent) || is_large_pte(ent))
- goto clear_child_bitmap;
-
- child = page_header(ent & PT64_BASE_ADDR_MASK);
-
- if (child->unsync_children) {
- if (mmu_pages_add(pvec, child, i))
- return -ENOSPC;
-
- ret = __mmu_unsync_walk(child, pvec);
- if (!ret)
- goto clear_child_bitmap;
- else if (ret > 0)
- nr_unsync_leaf += ret;
- else
- return ret;
- } else if (child->unsync) {
- nr_unsync_leaf++;
- if (mmu_pages_add(pvec, child, i))
- return -ENOSPC;
- } else
- goto clear_child_bitmap;
-
- continue;
-
-clear_child_bitmap:
- __clear_bit(i, sp->unsync_child_bitmap);
- sp->unsync_children--;
- WARN_ON((int)sp->unsync_children < 0);
- }
-
-
- return nr_unsync_leaf;
-}
-
-static int mmu_unsync_walk(struct kvm_mmu_page *sp,
- struct kvm_mmu_pages *pvec)
-{
- if (!sp->unsync_children)
- return 0;
-
- mmu_pages_add(pvec, sp, 0);
- return __mmu_unsync_walk(sp, pvec);
-}
-
-static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- WARN_ON(!sp->unsync);
- trace_kvm_mmu_sync_page(sp);
- sp->unsync = 0;
- --kvm->stat.mmu_unsync;
-}
-
-static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
- struct list_head *invalid_list);
-static void kvm_mmu_commit_zap_page(struct kvm *kvm,
- struct list_head *invalid_list);
-
-#define for_each_gfn_sp(kvm, sp, gfn, pos) \
- hlist_for_each_entry(sp, pos, \
- &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
- if ((sp)->gfn != (gfn)) {} else
-
-#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \
- hlist_for_each_entry(sp, pos, \
- &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
- if ((sp)->gfn != (gfn) || (sp)->role.direct || \
- (sp)->role.invalid) {} else
-
-/* @sp->gfn should be write-protected at the call site */
-static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- struct list_head *invalid_list, bool clear_unsync)
-{
- if (sp->role.cr4_pae != !!is_pae(vcpu)) {
- kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
- return 1;
- }
-
- if (clear_unsync)
- kvm_unlink_unsync_page(vcpu->kvm, sp);
-
- if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
- kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
- return 1;
- }
-
- kvm_mmu_flush_tlb(vcpu);
- return 0;
-}
-
-static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
-{
- LIST_HEAD(invalid_list);
- int ret;
-
- ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
- if (ret)
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-
- return ret;
-}
-
-#ifdef CONFIG_KVM_MMU_AUDIT
-#include "mmu_audit.c"
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
-static void mmu_audit_disable(void) { }
-#endif
-
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- struct list_head *invalid_list)
-{
- return __kvm_sync_page(vcpu, sp, invalid_list, true);
-}
-
-/* @gfn should be write-protected at the call site */
-static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- struct kvm_mmu_page *s;
- struct hlist_node *node;
- LIST_HEAD(invalid_list);
- bool flush = false;
-
- for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
- if (!s->unsync)
- continue;
-
- WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
- kvm_unlink_unsync_page(vcpu->kvm, s);
- if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
- (vcpu->arch.mmu.sync_page(vcpu, s))) {
- kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
- continue;
- }
- flush = true;
- }
-
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- if (flush)
- kvm_mmu_flush_tlb(vcpu);
-}
-
-struct mmu_page_path {
- struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
- unsigned int idx[PT64_ROOT_LEVEL-1];
-};
-
-#define for_each_sp(pvec, sp, parents, i) \
- for (i = mmu_pages_next(&pvec, &parents, -1), \
- sp = pvec.page[i].sp; \
- i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
- i = mmu_pages_next(&pvec, &parents, i))
-
-static int mmu_pages_next(struct kvm_mmu_pages *pvec,
- struct mmu_page_path *parents,
- int i)
-{
- int n;
-
- for (n = i+1; n < pvec->nr; n++) {
- struct kvm_mmu_page *sp = pvec->page[n].sp;
-
- if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
- parents->idx[0] = pvec->page[n].idx;
- return n;
- }
-
- parents->parent[sp->role.level-2] = sp;
- parents->idx[sp->role.level-1] = pvec->page[n].idx;
- }
-
- return n;
-}
-
-static void mmu_pages_clear_parents(struct mmu_page_path *parents)
-{
- struct kvm_mmu_page *sp;
- unsigned int level = 0;
-
- do {
- unsigned int idx = parents->idx[level];
-
- sp = parents->parent[level];
- if (!sp)
- return;
-
- --sp->unsync_children;
- WARN_ON((int)sp->unsync_children < 0);
- __clear_bit(idx, sp->unsync_child_bitmap);
- level++;
- } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
-}
-
-static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
- struct mmu_page_path *parents,
- struct kvm_mmu_pages *pvec)
-{
- parents->parent[parent->role.level-1] = NULL;
- pvec->nr = 0;
-}
-
-static void mmu_sync_children(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *parent)
-{
- int i;
- struct kvm_mmu_page *sp;
- struct mmu_page_path parents;
- struct kvm_mmu_pages pages;
- LIST_HEAD(invalid_list);
-
- kvm_mmu_pages_init(parent, &parents, &pages);
- while (mmu_unsync_walk(parent, &pages)) {
- int protected = 0;
-
- for_each_sp(pages, sp, parents, i)
- protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
-
- if (protected)
- kvm_flush_remote_tlbs(vcpu->kvm);
-
- for_each_sp(pages, sp, parents, i) {
- kvm_sync_page(vcpu, sp, &invalid_list);
- mmu_pages_clear_parents(&parents);
- }
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- cond_resched_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_pages_init(parent, &parents, &pages);
- }
-}
-
-static void init_shadow_page_table(struct kvm_mmu_page *sp)
-{
- int i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
- sp->spt[i] = 0ull;
-}
-
-static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
-{
- sp->write_flooding_count = 0;
-}
-
-static void clear_sp_write_flooding_count(u64 *spte)
-{
- struct kvm_mmu_page *sp = page_header(__pa(spte));
-
- __clear_sp_write_flooding_count(sp);
-}
-
-static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
- gfn_t gfn,
- gva_t gaddr,
- unsigned level,
- int direct,
- unsigned access,
- u64 *parent_pte)
-{
- union kvm_mmu_page_role role;
- unsigned quadrant;
- struct kvm_mmu_page *sp;
- struct hlist_node *node;
- bool need_sync = false;
-
- role = vcpu->arch.mmu.base_role;
- role.level = level;
- role.direct = direct;
- if (role.direct)
- role.cr4_pae = 0;
- role.access = access;
- if (!vcpu->arch.mmu.direct_map
- && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
- quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
- quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
- role.quadrant = quadrant;
- }
- for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
- if (!need_sync && sp->unsync)
- need_sync = true;
-
- if (sp->role.word != role.word)
- continue;
-
- if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
- break;
-
- mmu_page_add_parent_pte(vcpu, sp, parent_pte);
- if (sp->unsync_children) {
- kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
- kvm_mmu_mark_parents_unsync(sp);
- } else if (sp->unsync)
- kvm_mmu_mark_parents_unsync(sp);
-
- __clear_sp_write_flooding_count(sp);
- trace_kvm_mmu_get_page(sp, false);
- return sp;
- }
- ++vcpu->kvm->stat.mmu_cache_miss;
- sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
- if (!sp)
- return sp;
- sp->gfn = gfn;
- sp->role = role;
- hlist_add_head(&sp->hash_link,
- &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
- if (!direct) {
- if (rmap_write_protect(vcpu->kvm, gfn))
- kvm_flush_remote_tlbs(vcpu->kvm);
- if (level > PT_PAGE_TABLE_LEVEL && need_sync)
- kvm_sync_pages(vcpu, gfn);
-
- account_shadowed(vcpu->kvm, gfn);
- }
- init_shadow_page_table(sp);
- trace_kvm_mmu_get_page(sp, true);
- return sp;
-}
-
-static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
- struct kvm_vcpu *vcpu, u64 addr)
-{
- iterator->addr = addr;
- iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
- iterator->level = vcpu->arch.mmu.shadow_root_level;
-
- if (iterator->level == PT64_ROOT_LEVEL &&
- vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
- !vcpu->arch.mmu.direct_map)
- --iterator->level;
-
- if (iterator->level == PT32E_ROOT_LEVEL) {
- iterator->shadow_addr
- = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
- iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
- --iterator->level;
- if (!iterator->shadow_addr)
- iterator->level = 0;
- }
-}
-
-static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
-{
- if (iterator->level < PT_PAGE_TABLE_LEVEL)
- return false;
-
- iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
- iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
- return true;
-}
-
-static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
- u64 spte)
-{
- if (is_last_spte(spte, iterator->level)) {
- iterator->level = 0;
- return;
- }
-
- iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
- --iterator->level;
-}
-
-static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
-{
- return __shadow_walk_next(iterator, *iterator->sptep);
-}
-
-static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
-{
- u64 spte;
-
- spte = __pa(sp->spt)
- | PT_PRESENT_MASK | PT_ACCESSED_MASK
- | PT_WRITABLE_MASK | PT_USER_MASK;
- mmu_spte_set(sptep, spte);
-}
-
-static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
-{
- if (is_large_pte(*sptep)) {
- drop_spte(vcpu->kvm, sptep);
- --vcpu->kvm->stat.lpages;
- kvm_flush_remote_tlbs(vcpu->kvm);
- }
-}
-
-static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned direct_access)
-{
- if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
- struct kvm_mmu_page *child;
-
- /*
- * For the direct sp, if the guest pte's dirty bit
- * changed form clean to dirty, it will corrupt the
- * sp's access: allow writable in the read-only sp,
- * so we should update the spte at this point to get
- * a new sp with the correct access.
- */
- child = page_header(*sptep & PT64_BASE_ADDR_MASK);
- if (child->role.access == direct_access)
- return;
-
- drop_parent_pte(child, sptep);
- kvm_flush_remote_tlbs(vcpu->kvm);
- }
-}
-
-static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
- u64 *spte)
-{
- u64 pte;
- struct kvm_mmu_page *child;
-
- pte = *spte;
- if (is_shadow_present_pte(pte)) {
- if (is_last_spte(pte, sp->role.level)) {
- drop_spte(kvm, spte);
- if (is_large_pte(pte))
- --kvm->stat.lpages;
- } else {
- child = page_header(pte & PT64_BASE_ADDR_MASK);
- drop_parent_pte(child, spte);
- }
- return true;
- }
-
- if (is_mmio_spte(pte))
- mmu_spte_clear_no_track(spte);
-
- return false;
-}
-
-static void kvm_mmu_page_unlink_children(struct kvm *kvm,
- struct kvm_mmu_page *sp)
-{
- unsigned i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
- mmu_page_zap_pte(kvm, sp, sp->spt + i);
-}
-
-static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
-{
- mmu_page_remove_parent_pte(sp, parent_pte);
-}
-
-static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- u64 *parent_pte;
-
- while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL)))
- drop_parent_pte(sp, parent_pte);
-}
-
-static int mmu_zap_unsync_children(struct kvm *kvm,
- struct kvm_mmu_page *parent,
- struct list_head *invalid_list)
-{
- int i, zapped = 0;
- struct mmu_page_path parents;
- struct kvm_mmu_pages pages;
-
- if (parent->role.level == PT_PAGE_TABLE_LEVEL)
- return 0;
-
- kvm_mmu_pages_init(parent, &parents, &pages);
- while (mmu_unsync_walk(parent, &pages)) {
- struct kvm_mmu_page *sp;
-
- for_each_sp(pages, sp, parents, i) {
- kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
- mmu_pages_clear_parents(&parents);
- zapped++;
- }
- kvm_mmu_pages_init(parent, &parents, &pages);
- }
-
- return zapped;
-}
-
-static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
- struct list_head *invalid_list)
-{
- int ret;
-
- trace_kvm_mmu_prepare_zap_page(sp);
- ++kvm->stat.mmu_shadow_zapped;
- ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
- kvm_mmu_page_unlink_children(kvm, sp);
- kvm_mmu_unlink_parents(kvm, sp);
- if (!sp->role.invalid && !sp->role.direct)
- unaccount_shadowed(kvm, sp->gfn);
- if (sp->unsync)
- kvm_unlink_unsync_page(kvm, sp);
- if (!sp->root_count) {
- /* Count self */
- ret++;
- list_move(&sp->link, invalid_list);
- kvm_mod_used_mmu_pages(kvm, -1);
- } else {
- list_move(&sp->link, &kvm->arch.active_mmu_pages);
- kvm_reload_remote_mmus(kvm);
- }
-
- sp->role.invalid = 1;
- return ret;
-}
-
-static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
-{
- struct kvm_mmu_page *sp;
-
- list_for_each_entry(sp, invalid_list, link)
- kvm_mmu_isolate_page(sp);
-}
-
-static void free_pages_rcu(struct rcu_head *head)
-{
- struct kvm_mmu_page *next, *sp;
-
- sp = container_of(head, struct kvm_mmu_page, rcu);
- while (sp) {
- if (!list_empty(&sp->link))
- next = list_first_entry(&sp->link,
- struct kvm_mmu_page, link);
- else
- next = NULL;
- kvm_mmu_free_page(sp);
- sp = next;
- }
-}
-
-static void kvm_mmu_commit_zap_page(struct kvm *kvm,
- struct list_head *invalid_list)
-{
- struct kvm_mmu_page *sp;
-
- if (list_empty(invalid_list))
- return;
-
- kvm_flush_remote_tlbs(kvm);
-
- if (atomic_read(&kvm->arch.reader_counter)) {
- kvm_mmu_isolate_pages(invalid_list);
- sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
- list_del_init(invalid_list);
-
- trace_kvm_mmu_delay_free_pages(sp);
- call_rcu(&sp->rcu, free_pages_rcu);
- return;
- }
-
- do {
- sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
- WARN_ON(!sp->role.invalid || sp->root_count);
- kvm_mmu_isolate_page(sp);
- kvm_mmu_free_page(sp);
- } while (!list_empty(invalid_list));
-
-}
-
-/*
- * Changing the number of mmu pages allocated to the vm
- * Note: if goal_nr_mmu_pages is too small, you will get dead lock
- */
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
-{
- LIST_HEAD(invalid_list);
- /*
- * If we set the number of mmu pages to be smaller be than the
- * number of actived pages , we must to free some mmu pages before we
- * change the value
- */
-
- if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
- while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
- !list_empty(&kvm->arch.active_mmu_pages)) {
- struct kvm_mmu_page *page;
-
- page = container_of(kvm->arch.active_mmu_pages.prev,
- struct kvm_mmu_page, link);
- kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
- }
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
- goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
- }
-
- kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
-}
-
-int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
-{
- struct kvm_mmu_page *sp;
- struct hlist_node *node;
- LIST_HEAD(invalid_list);
- int r;
-
- pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
- r = 0;
- spin_lock(&kvm->mmu_lock);
- for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
- pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
- sp->role.word);
- r = 1;
- kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
- }
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
- spin_unlock(&kvm->mmu_lock);
-
- return r;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
-
-static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
-{
- int slot = memslot_id(kvm, gfn);
- struct kvm_mmu_page *sp = page_header(__pa(pte));
-
- __set_bit(slot, sp->slot_bitmap);
-}
-
-/*
- * The function is based on mtrr_type_lookup() in
- * arch/x86/kernel/cpu/mtrr/generic.c
- */
-static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
- u64 start, u64 end)
-{
- int i;
- u64 base, mask;
- u8 prev_match, curr_match;
- int num_var_ranges = KVM_NR_VAR_MTRR;
-
- if (!mtrr_state->enabled)
- return 0xFF;
-
- /* Make end inclusive end, instead of exclusive */
- end--;
-
- /* Look in fixed ranges. Just return the type as per start */
- if (mtrr_state->have_fixed && (start < 0x100000)) {
- int idx;
-
- if (start < 0x80000) {
- idx = 0;
- idx += (start >> 16);
- return mtrr_state->fixed_ranges[idx];
- } else if (start < 0xC0000) {
- idx = 1 * 8;
- idx += ((start - 0x80000) >> 14);
- return mtrr_state->fixed_ranges[idx];
- } else if (start < 0x1000000) {
- idx = 3 * 8;
- idx += ((start - 0xC0000) >> 12);
- return mtrr_state->fixed_ranges[idx];
- }
- }
-
- /*
- * Look in variable ranges
- * Look of multiple ranges matching this address and pick type
- * as per MTRR precedence
- */
- if (!(mtrr_state->enabled & 2))
- return mtrr_state->def_type;
-
- prev_match = 0xFF;
- for (i = 0; i < num_var_ranges; ++i) {
- unsigned short start_state, end_state;
-
- if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
- continue;
-
- base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
- (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
- mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
- (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
-
- start_state = ((start & mask) == (base & mask));
- end_state = ((end & mask) == (base & mask));
- if (start_state != end_state)
- return 0xFE;
-
- if ((start & mask) != (base & mask))
- continue;
-
- curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
- if (prev_match == 0xFF) {
- prev_match = curr_match;
- continue;
- }
-
- if (prev_match == MTRR_TYPE_UNCACHABLE ||
- curr_match == MTRR_TYPE_UNCACHABLE)
- return MTRR_TYPE_UNCACHABLE;
-
- if ((prev_match == MTRR_TYPE_WRBACK &&
- curr_match == MTRR_TYPE_WRTHROUGH) ||
- (prev_match == MTRR_TYPE_WRTHROUGH &&
- curr_match == MTRR_TYPE_WRBACK)) {
- prev_match = MTRR_TYPE_WRTHROUGH;
- curr_match = MTRR_TYPE_WRTHROUGH;
- }
-
- if (prev_match != curr_match)
- return MTRR_TYPE_UNCACHABLE;
- }
-
- if (prev_match != 0xFF)
- return prev_match;
-
- return mtrr_state->def_type;
-}
-
-u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- u8 mtrr;
-
- mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
- (gfn << PAGE_SHIFT) + PAGE_SIZE);
- if (mtrr == 0xfe || mtrr == 0xff)
- mtrr = MTRR_TYPE_WRBACK;
- return mtrr;
-}
-EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
-
-static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
-{
- trace_kvm_mmu_unsync_page(sp);
- ++vcpu->kvm->stat.mmu_unsync;
- sp->unsync = 1;
-
- kvm_mmu_mark_parents_unsync(sp);
-}
-
-static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- struct kvm_mmu_page *s;
- struct hlist_node *node;
-
- for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
- if (s->unsync)
- continue;
- WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
- __kvm_unsync_page(vcpu, s);
- }
-}
-
-static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool can_unsync)
-{
- struct kvm_mmu_page *s;
- struct hlist_node *node;
- bool need_unsync = false;
-
- for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
- if (!can_unsync)
- return 1;
-
- if (s->role.level != PT_PAGE_TABLE_LEVEL)
- return 1;
-
- if (!need_unsync && !s->unsync) {
- need_unsync = true;
- }
- }
- if (need_unsync)
- kvm_unsync_pages(vcpu, gfn);
- return 0;
-}
-
-static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned pte_access, int user_fault,
- int write_fault, int level,
- gfn_t gfn, pfn_t pfn, bool speculative,
- bool can_unsync, bool host_writable)
-{
- u64 spte, entry = *sptep;
- int ret = 0;
-
- if (set_mmio_spte(sptep, gfn, pfn, pte_access))
- return 0;
-
- spte = PT_PRESENT_MASK;
- if (!speculative)
- spte |= shadow_accessed_mask;
-
- if (pte_access & ACC_EXEC_MASK)
- spte |= shadow_x_mask;
- else
- spte |= shadow_nx_mask;
- if (pte_access & ACC_USER_MASK)
- spte |= shadow_user_mask;
- if (level > PT_PAGE_TABLE_LEVEL)
- spte |= PT_PAGE_SIZE_MASK;
- if (tdp_enabled)
- spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
- kvm_is_mmio_pfn(pfn));
-
- if (host_writable)
- spte |= SPTE_HOST_WRITEABLE;
- else
- pte_access &= ~ACC_WRITE_MASK;
-
- spte |= (u64)pfn << PAGE_SHIFT;
-
- if ((pte_access & ACC_WRITE_MASK)
- || (!vcpu->arch.mmu.direct_map && write_fault
- && !is_write_protection(vcpu) && !user_fault)) {
-
- if (level > PT_PAGE_TABLE_LEVEL &&
- has_wrprotected_page(vcpu->kvm, gfn, level)) {
- ret = 1;
- drop_spte(vcpu->kvm, sptep);
- goto done;
- }
-
- spte |= PT_WRITABLE_MASK;
-
- if (!vcpu->arch.mmu.direct_map
- && !(pte_access & ACC_WRITE_MASK)) {
- spte &= ~PT_USER_MASK;
- /*
- * If we converted a user page to a kernel page,
- * so that the kernel can write to it when cr0.wp=0,
- * then we should prevent the kernel from executing it
- * if SMEP is enabled.
- */
- if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
- spte |= PT64_NX_MASK;
- }
-
- /*
- * Optimization: for pte sync, if spte was writable the hash
- * lookup is unnecessary (and expensive). Write protection
- * is responsibility of mmu_get_page / kvm_sync_page.
- * Same reasoning can be applied to dirty page accounting.
- */
- if (!can_unsync && is_writable_pte(*sptep))
- goto set_pte;
-
- if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
- pgprintk("%s: found shadow page for %llx, marking ro\n",
- __func__, gfn);
- ret = 1;
- pte_access &= ~ACC_WRITE_MASK;
- if (is_writable_pte(spte))
- spte &= ~PT_WRITABLE_MASK;
- }
- }
-
- if (pte_access & ACC_WRITE_MASK)
- mark_page_dirty(vcpu->kvm, gfn);
-
-set_pte:
- mmu_spte_update(sptep, spte);
- /*
- * If we overwrite a writable spte with a read-only one we
- * should flush remote TLBs. Otherwise rmap_write_protect
- * will find a read-only spte, even though the writable spte
- * might be cached on a CPU's TLB.
- */
- if (is_writable_pte(entry) && !is_writable_pte(*sptep))
- kvm_flush_remote_tlbs(vcpu->kvm);
-done:
- return ret;
-}
-
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned pt_access, unsigned pte_access,
- int user_fault, int write_fault,
- int *emulate, int level, gfn_t gfn,
- pfn_t pfn, bool speculative,
- bool host_writable)
-{
- int was_rmapped = 0;
- int rmap_count;
-
- pgprintk("%s: spte %llx access %x write_fault %d"
- " user_fault %d gfn %llx\n",
- __func__, *sptep, pt_access,
- write_fault, user_fault, gfn);
-
- if (is_rmap_spte(*sptep)) {
- /*
- * If we overwrite a PTE page pointer with a 2MB PMD, unlink
- * the parent of the now unreachable PTE.
- */
- if (level > PT_PAGE_TABLE_LEVEL &&
- !is_large_pte(*sptep)) {
- struct kvm_mmu_page *child;
- u64 pte = *sptep;
-
- child = page_header(pte & PT64_BASE_ADDR_MASK);
- drop_parent_pte(child, sptep);
- kvm_flush_remote_tlbs(vcpu->kvm);
- } else if (pfn != spte_to_pfn(*sptep)) {
- pgprintk("hfn old %llx new %llx\n",
- spte_to_pfn(*sptep), pfn);
- drop_spte(vcpu->kvm, sptep);
- kvm_flush_remote_tlbs(vcpu->kvm);
- } else
- was_rmapped = 1;
- }
-
- if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
- level, gfn, pfn, speculative, true,
- host_writable)) {
- if (write_fault)
- *emulate = 1;
- kvm_mmu_flush_tlb(vcpu);
- }
-
- if (unlikely(is_mmio_spte(*sptep) && emulate))
- *emulate = 1;
-
- pgprintk("%s: setting spte %llx\n", __func__, *sptep);
- pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
- is_large_pte(*sptep)? "2MB" : "4kB",
- *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
- *sptep, sptep);
- if (!was_rmapped && is_large_pte(*sptep))
- ++vcpu->kvm->stat.lpages;
-
- if (is_shadow_present_pte(*sptep)) {
- page_header_update_slot(vcpu->kvm, sptep, gfn);
- if (!was_rmapped) {
- rmap_count = rmap_add(vcpu, sptep, gfn);
- if (rmap_count > RMAP_RECYCLE_THRESHOLD)
- rmap_recycle(vcpu, sptep, gfn);
- }
- }
- kvm_release_pfn_clean(pfn);
-}
-
-static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
-{
-}
-
-static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool no_dirty_log)
-{
- struct kvm_memory_slot *slot;
- unsigned long hva;
-
- slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
- if (!slot) {
- get_page(fault_page);
- return page_to_pfn(fault_page);
- }
-
- hva = gfn_to_hva_memslot(slot, gfn);
-
- return hva_to_pfn_atomic(vcpu->kvm, hva);
-}
-
-static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp,
- u64 *start, u64 *end)
-{
- struct page *pages[PTE_PREFETCH_NUM];
- unsigned access = sp->role.access;
- int i, ret;
- gfn_t gfn;
-
- gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
- if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK))
- return -1;
-
- ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
- if (ret <= 0)
- return -1;
-
- for (i = 0; i < ret; i++, gfn++, start++)
- mmu_set_spte(vcpu, start, ACC_ALL,
- access, 0, 0, NULL,
- sp->role.level, gfn,
- page_to_pfn(pages[i]), true, true);
-
- return 0;
-}
-
-static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *sptep)
-{
- u64 *spte, *start = NULL;
- int i;
-
- WARN_ON(!sp->role.direct);
-
- i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
- spte = sp->spt + i;
-
- for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
- if (is_shadow_present_pte(*spte) || spte == sptep) {
- if (!start)
- continue;
- if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
- break;
- start = NULL;
- } else if (!start)
- start = spte;
- }
-}
-
-static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
-{
- struct kvm_mmu_page *sp;
-
- /*
- * Since it's no accessed bit on EPT, it's no way to
- * distinguish between actually accessed translations
- * and prefetched, so disable pte prefetch if EPT is
- * enabled.
- */
- if (!shadow_accessed_mask)
- return;
-
- sp = page_header(__pa(sptep));
- if (sp->role.level > PT_PAGE_TABLE_LEVEL)
- return;
-
- __direct_pte_prefetch(vcpu, sp, sptep);
-}
-
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
- int map_writable, int level, gfn_t gfn, pfn_t pfn,
- bool prefault)
-{
- struct kvm_shadow_walk_iterator iterator;
- struct kvm_mmu_page *sp;
- int emulate = 0;
- gfn_t pseudo_gfn;
-
- for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
- if (iterator.level == level) {
- unsigned pte_access = ACC_ALL;
-
- mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
- 0, write, &emulate,
- level, gfn, pfn, prefault, map_writable);
- direct_pte_prefetch(vcpu, iterator.sptep);
- ++vcpu->stat.pf_fixed;
- break;
- }
-
- if (!is_shadow_present_pte(*iterator.sptep)) {
- u64 base_addr = iterator.addr;
-
- base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
- pseudo_gfn = base_addr >> PAGE_SHIFT;
- sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
- iterator.level - 1,
- 1, ACC_ALL, iterator.sptep);
- if (!sp) {
- pgprintk("nonpaging_map: ENOMEM\n");
- kvm_release_pfn_clean(pfn);
- return -ENOMEM;
- }
-
- mmu_spte_set(iterator.sptep,
- __pa(sp->spt)
- | PT_PRESENT_MASK | PT_WRITABLE_MASK
- | shadow_user_mask | shadow_x_mask
- | shadow_accessed_mask);
- }
- }
- return emulate;
-}
-
-static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
-{
- siginfo_t info;
-
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_MCEERR_AR;
- info.si_addr = (void __user *)address;
- info.si_addr_lsb = PAGE_SHIFT;
-
- send_sig_info(SIGBUS, &info, tsk);
-}
-
-static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
-{
- kvm_release_pfn_clean(pfn);
- if (is_hwpoison_pfn(pfn)) {
- kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
- return 0;
- }
-
- return -EFAULT;
-}
-
-static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
- gfn_t *gfnp, pfn_t *pfnp, int *levelp)
-{
- pfn_t pfn = *pfnp;
- gfn_t gfn = *gfnp;
- int level = *levelp;
-
- /*
- * Check if it's a transparent hugepage. If this would be an
- * hugetlbfs page, level wouldn't be set to
- * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
- * here.
- */
- if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
- level == PT_PAGE_TABLE_LEVEL &&
- PageTransCompound(pfn_to_page(pfn)) &&
- !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
- unsigned long mask;
- /*
- * mmu_notifier_retry was successful and we hold the
- * mmu_lock here, so the pmd can't become splitting
- * from under us, and in turn
- * __split_huge_page_refcount() can't run from under
- * us and we can safely transfer the refcount from
- * PG_tail to PG_head as we switch the pfn to tail to
- * head.
- */
- *levelp = level = PT_DIRECTORY_LEVEL;
- mask = KVM_PAGES_PER_HPAGE(level) - 1;
- VM_BUG_ON((gfn & mask) != (pfn & mask));
- if (pfn & mask) {
- gfn &= ~mask;
- *gfnp = gfn;
- kvm_release_pfn_clean(pfn);
- pfn &= ~mask;
- if (!get_page_unless_zero(pfn_to_page(pfn)))
- BUG();
- *pfnp = pfn;
- }
- }
-}
-
-static bool mmu_invalid_pfn(pfn_t pfn)
-{
- return unlikely(is_invalid_pfn(pfn));
-}
-
-static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
- pfn_t pfn, unsigned access, int *ret_val)
-{
- bool ret = true;
-
- /* The pfn is invalid, report the error! */
- if (unlikely(is_invalid_pfn(pfn))) {
- *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
- goto exit;
- }
-
- if (unlikely(is_noslot_pfn(pfn)))
- vcpu_cache_mmio_info(vcpu, gva, gfn, access);
-
- ret = false;
-exit:
- return ret;
-}
-
-static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gva_t gva, pfn_t *pfn, bool write, bool *writable);
-
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
- bool prefault)
-{
- int r;
- int level;
- int force_pt_level;
- pfn_t pfn;
- unsigned long mmu_seq;
- bool map_writable;
-
- force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
- if (likely(!force_pt_level)) {
- level = mapping_level(vcpu, gfn);
- /*
- * This path builds a PAE pagetable - so we can map
- * 2mb pages at maximum. Therefore check if the level
- * is larger than that.
- */
- if (level > PT_DIRECTORY_LEVEL)
- level = PT_DIRECTORY_LEVEL;
-
- gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
- } else
- level = PT_PAGE_TABLE_LEVEL;
-
- mmu_seq = vcpu->kvm->mmu_notifier_seq;
- smp_rmb();
-
- if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
- return 0;
-
- if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
- return r;
-
- spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu, mmu_seq))
- goto out_unlock;
- kvm_mmu_free_some_pages(vcpu);
- if (likely(!force_pt_level))
- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
- r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
- prefault);
- spin_unlock(&vcpu->kvm->mmu_lock);
-
-
- return r;
-
-out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
- return 0;
-}
-
-
-static void mmu_free_roots(struct kvm_vcpu *vcpu)
-{
- int i;
- struct kvm_mmu_page *sp;
- LIST_HEAD(invalid_list);
-
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
- spin_lock(&vcpu->kvm->mmu_lock);
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
- (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
- vcpu->arch.mmu.direct_map)) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
-
- sp = page_header(root);
- --sp->root_count;
- if (!sp->root_count && sp->role.invalid) {
- kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- }
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- spin_unlock(&vcpu->kvm->mmu_lock);
- return;
- }
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- if (root) {
- root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
- --sp->root_count;
- if (!sp->root_count && sp->role.invalid)
- kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
- &invalid_list);
- }
- vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
- }
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-}
-
-static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
-{
- int ret = 0;
-
- if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
- ret = 1;
- }
-
- return ret;
-}
-
-static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *sp;
- unsigned i;
-
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
- sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
- 1, ACC_ALL, NULL);
- ++sp->root_count;
- spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.root_hpa = __pa(sp->spt);
- } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- ASSERT(!VALID_PAGE(root));
- spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
- sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
- i << 30,
- PT32_ROOT_LEVEL, 1, ACC_ALL,
- NULL);
- root = __pa(sp->spt);
- ++sp->root_count;
- spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
- }
- vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
- } else
- BUG();
-
- return 0;
-}
-
-static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *sp;
- u64 pdptr, pm_mask;
- gfn_t root_gfn;
- int i;
-
- root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
-
- if (mmu_check_root(vcpu, root_gfn))
- return 1;
-
- /*
- * Do we shadow a long mode page table? If so we need to
- * write-protect the guests page table root.
- */
- if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
-
- ASSERT(!VALID_PAGE(root));
-
- spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
- sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
- 0, ACC_ALL, NULL);
- root = __pa(sp->spt);
- ++sp->root_count;
- spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.root_hpa = root;
- return 0;
- }
-
- /*
- * We shadow a 32 bit page table. This may be a legacy 2-level
- * or a PAE 3-level page table. In either case we need to be aware that
- * the shadow page table may be a PAE or a long mode page table.
- */
- pm_mask = PT_PRESENT_MASK;
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
- pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
-
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- ASSERT(!VALID_PAGE(root));
- if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
- pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
- if (!is_present_gpte(pdptr)) {
- vcpu->arch.mmu.pae_root[i] = 0;
- continue;
- }
- root_gfn = pdptr >> PAGE_SHIFT;
- if (mmu_check_root(vcpu, root_gfn))
- return 1;
- }
- spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
- sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
- PT32_ROOT_LEVEL, 0,
- ACC_ALL, NULL);
- root = __pa(sp->spt);
- ++sp->root_count;
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- vcpu->arch.mmu.pae_root[i] = root | pm_mask;
- }
- vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
-
- /*
- * If we shadow a 32 bit page table with a long mode page
- * table we enter this path.
- */
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- if (vcpu->arch.mmu.lm_root == NULL) {
- /*
- * The additional page necessary for this is only
- * allocated on demand.
- */
-
- u64 *lm_root;
-
- lm_root = (void*)get_zeroed_page(GFP_KERNEL);
- if (lm_root == NULL)
- return 1;
-
- lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
-
- vcpu->arch.mmu.lm_root = lm_root;
- }
-
- vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
- }
-
- return 0;
-}
-
-static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
-{
- if (vcpu->arch.mmu.direct_map)
- return mmu_alloc_direct_roots(vcpu);
- else
- return mmu_alloc_shadow_roots(vcpu);
-}
-
-static void mmu_sync_roots(struct kvm_vcpu *vcpu)
-{
- int i;
- struct kvm_mmu_page *sp;
-
- if (vcpu->arch.mmu.direct_map)
- return;
-
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
-
- vcpu_clear_mmio_info(vcpu, ~0ul);
- kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
- if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
- sp = page_header(root);
- mmu_sync_children(vcpu, sp);
- kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
- return;
- }
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- if (root && VALID_PAGE(root)) {
- root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
- mmu_sync_children(vcpu, sp);
- }
- }
- kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
-}
-
-void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
-{
- spin_lock(&vcpu->kvm->mmu_lock);
- mmu_sync_roots(vcpu);
- spin_unlock(&vcpu->kvm->mmu_lock);
-}
-
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
- u32 access, struct x86_exception *exception)
-{
- if (exception)
- exception->error_code = 0;
- return vaddr;
-}
-
-static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
- u32 access,
- struct x86_exception *exception)
-{
- if (exception)
- exception->error_code = 0;
- return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
-}
-
-static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
-{
- if (direct)
- return vcpu_match_mmio_gpa(vcpu, addr);
-
- return vcpu_match_mmio_gva(vcpu, addr);
-}
-
-
-/*
- * On direct hosts, the last spte is only allows two states
- * for mmio page fault:
- * - It is the mmio spte
- * - It is zapped or it is being zapped.
- *
- * This function completely checks the spte when the last spte
- * is not the mmio spte.
- */
-static bool check_direct_spte_mmio_pf(u64 spte)
-{
- return __check_direct_spte_mmio_pf(spte);
-}
-
-static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
-{
- struct kvm_shadow_walk_iterator iterator;
- u64 spte = 0ull;
-
- walk_shadow_page_lockless_begin(vcpu);
- for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
- if (!is_shadow_present_pte(spte))
- break;
- walk_shadow_page_lockless_end(vcpu);
-
- return spte;
-}
-
-/*
- * If it is a real mmio page fault, return 1 and emulat the instruction
- * directly, return 0 to let CPU fault again on the address, -1 is
- * returned if bug is detected.
- */
-int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
-{
- u64 spte;
-
- if (quickly_check_mmio_pf(vcpu, addr, direct))
- return 1;
-
- spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
-
- if (is_mmio_spte(spte)) {
- gfn_t gfn = get_mmio_spte_gfn(spte);
- unsigned access = get_mmio_spte_access(spte);
-
- if (direct)
- addr = 0;
-
- trace_handle_mmio_page_fault(addr, gfn, access);
- vcpu_cache_mmio_info(vcpu, addr, gfn, access);
- return 1;
- }
-
- /*
- * It's ok if the gva is remapped by other cpus on shadow guest,
- * it's a BUG if the gfn is not a mmio page.
- */
- if (direct && !check_direct_spte_mmio_pf(spte))
- return -1;
-
- /*
- * If the page table is zapped by other cpus, let CPU fault again on
- * the address.
- */
- return 0;
-}
-EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
-
-static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
- u32 error_code, bool direct)
-{
- int ret;
-
- ret = handle_mmio_page_fault_common(vcpu, addr, direct);
- WARN_ON(ret < 0);
- return ret;
-}
-
-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
- u32 error_code, bool prefault)
-{
- gfn_t gfn;
- int r;
-
- pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
-
- if (unlikely(error_code & PFERR_RSVD_MASK))
- return handle_mmio_page_fault(vcpu, gva, error_code, true);
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
- ASSERT(vcpu);
- ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
-
- gfn = gva >> PAGE_SHIFT;
-
- return nonpaging_map(vcpu, gva & PAGE_MASK,
- error_code & PFERR_WRITE_MASK, gfn, prefault);
-}
-
-static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
-{
- struct kvm_arch_async_pf arch;
-
- arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
- arch.gfn = gfn;
- arch.direct_map = vcpu->arch.mmu.direct_map;
- arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
-
- return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
-}
-
-static bool can_do_async_pf(struct kvm_vcpu *vcpu)
-{
- if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
- kvm_event_needs_reinjection(vcpu)))
- return false;
-
- return kvm_x86_ops->interrupt_allowed(vcpu);
-}
-
-static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gva_t gva, pfn_t *pfn, bool write, bool *writable)
-{
- bool async;
-
- *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
-
- if (!async)
- return false; /* *pfn has correct page already */
-
- put_page(pfn_to_page(*pfn));
-
- if (!prefault && can_do_async_pf(vcpu)) {
- trace_kvm_try_async_get_page(gva, gfn);
- if (kvm_find_async_pf_gfn(vcpu, gfn)) {
- trace_kvm_async_pf_doublefault(gva, gfn);
- kvm_make_request(KVM_REQ_APF_HALT, vcpu);
- return true;
- } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
- return true;
- }
-
- *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
-
- return false;
-}
-
-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
- bool prefault)
-{
- pfn_t pfn;
- int r;
- int level;
- int force_pt_level;
- gfn_t gfn = gpa >> PAGE_SHIFT;
- unsigned long mmu_seq;
- int write = error_code & PFERR_WRITE_MASK;
- bool map_writable;
-
- ASSERT(vcpu);
- ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
-
- if (unlikely(error_code & PFERR_RSVD_MASK))
- return handle_mmio_page_fault(vcpu, gpa, error_code, true);
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
- force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
- if (likely(!force_pt_level)) {
- level = mapping_level(vcpu, gfn);
- gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
- } else
- level = PT_PAGE_TABLE_LEVEL;
-
- mmu_seq = vcpu->kvm->mmu_notifier_seq;
- smp_rmb();
-
- if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
- return 0;
-
- if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
- return r;
-
- spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu, mmu_seq))
- goto out_unlock;
- kvm_mmu_free_some_pages(vcpu);
- if (likely(!force_pt_level))
- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
- r = __direct_map(vcpu, gpa, write, map_writable,
- level, gfn, pfn, prefault);
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- return r;
-
-out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
- return 0;
-}
-
-static void nonpaging_free(struct kvm_vcpu *vcpu)
-{
- mmu_free_roots(vcpu);
-}
-
-static int nonpaging_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
-{
- context->new_cr3 = nonpaging_new_cr3;
- context->page_fault = nonpaging_page_fault;
- context->gva_to_gpa = nonpaging_gva_to_gpa;
- context->free = nonpaging_free;
- context->sync_page = nonpaging_sync_page;
- context->invlpg = nonpaging_invlpg;
- context->update_pte = nonpaging_update_pte;
- context->root_level = 0;
- context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
- context->direct_map = true;
- context->nx = false;
- return 0;
-}
-
-void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
-{
- ++vcpu->stat.tlb_flush;
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-}
-
-static void paging_new_cr3(struct kvm_vcpu *vcpu)
-{
- pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
- mmu_free_roots(vcpu);
-}
-
-static unsigned long get_cr3(struct kvm_vcpu *vcpu)
-{
- return kvm_read_cr3(vcpu);
-}
-
-static void inject_page_fault(struct kvm_vcpu *vcpu,
- struct x86_exception *fault)
-{
- vcpu->arch.mmu.inject_page_fault(vcpu, fault);
-}
-
-static void paging_free(struct kvm_vcpu *vcpu)
-{
- nonpaging_free(vcpu);
-}
-
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
- int bit7;
-
- bit7 = (gpte >> 7) & 1;
- return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
-static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
- int *nr_present)
-{
- if (unlikely(is_mmio_spte(*sptep))) {
- if (gfn != get_mmio_spte_gfn(*sptep)) {
- mmu_spte_clear_no_track(sptep);
- return true;
- }
-
- (*nr_present)++;
- mark_mmio_spte(sptep, gfn, access);
- return true;
- }
-
- return false;
-}
-
-#define PTTYPE 64
-#include "paging_tmpl.h"
-#undef PTTYPE
-
-#define PTTYPE 32
-#include "paging_tmpl.h"
-#undef PTTYPE
-
-static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
-{
- int maxphyaddr = cpuid_maxphyaddr(vcpu);
- u64 exb_bit_rsvd = 0;
-
- if (!context->nx)
- exb_bit_rsvd = rsvd_bits(63, 63);
- switch (context->root_level) {
- case PT32_ROOT_LEVEL:
- /* no rsvd bits for 2 level 4K page table entries */
- context->rsvd_bits_mask[0][1] = 0;
- context->rsvd_bits_mask[0][0] = 0;
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
-
- if (!is_pse(vcpu)) {
- context->rsvd_bits_mask[1][1] = 0;
- break;
- }
-
- if (is_cpuid_PSE36())
- /* 36bits PSE 4MB page */
- context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
- else
- /* 32 bits PSE 4MB page */
- context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
- break;
- case PT32E_ROOT_LEVEL:
- context->rsvd_bits_mask[0][2] =
- rsvd_bits(maxphyaddr, 63) |
- rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */
- context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 62); /* PDE */
- context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 62); /* PTE */
- context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 62) |
- rsvd_bits(13, 20); /* large page */
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
- break;
- case PT64_ROOT_LEVEL:
- context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
- context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
- context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51);
- context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51);
- context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
- context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) |
- rsvd_bits(13, 29);
- context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) |
- rsvd_bits(13, 20); /* large page */
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
- break;
- }
-}
-
-static int paging64_init_context_common(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context,
- int level)
-{
- context->nx = is_nx(vcpu);
- context->root_level = level;
-
- reset_rsvds_bits_mask(vcpu, context);
-
- ASSERT(is_pae(vcpu));
- context->new_cr3 = paging_new_cr3;
- context->page_fault = paging64_page_fault;
- context->gva_to_gpa = paging64_gva_to_gpa;
- context->sync_page = paging64_sync_page;
- context->invlpg = paging64_invlpg;
- context->update_pte = paging64_update_pte;
- context->free = paging_free;
- context->shadow_root_level = level;
- context->root_hpa = INVALID_PAGE;
- context->direct_map = false;
- return 0;
-}
-
-static int paging64_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
-{
- return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
-}
-
-static int paging32_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
-{
- context->nx = false;
- context->root_level = PT32_ROOT_LEVEL;
-
- reset_rsvds_bits_mask(vcpu, context);
-
- context->new_cr3 = paging_new_cr3;
- context->page_fault = paging32_page_fault;
- context->gva_to_gpa = paging32_gva_to_gpa;
- context->free = paging_free;
- context->sync_page = paging32_sync_page;
- context->invlpg = paging32_invlpg;
- context->update_pte = paging32_update_pte;
- context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
- context->direct_map = false;
- return 0;
-}
-
-static int paging32E_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
-{
- return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
-}
-
-static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu *context = vcpu->arch.walk_mmu;
-
- context->base_role.word = 0;
- context->new_cr3 = nonpaging_new_cr3;
- context->page_fault = tdp_page_fault;
- context->free = nonpaging_free;
- context->sync_page = nonpaging_sync_page;
- context->invlpg = nonpaging_invlpg;
- context->update_pte = nonpaging_update_pte;
- context->shadow_root_level = kvm_x86_ops->get_tdp_level();
- context->root_hpa = INVALID_PAGE;
- context->direct_map = true;
- context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
- context->get_cr3 = get_cr3;
- context->get_pdptr = kvm_pdptr_read;
- context->inject_page_fault = kvm_inject_page_fault;
-
- if (!is_paging(vcpu)) {
- context->nx = false;
- context->gva_to_gpa = nonpaging_gva_to_gpa;
- context->root_level = 0;
- } else if (is_long_mode(vcpu)) {
- context->nx = is_nx(vcpu);
- context->root_level = PT64_ROOT_LEVEL;
- reset_rsvds_bits_mask(vcpu, context);
- context->gva_to_gpa = paging64_gva_to_gpa;
- } else if (is_pae(vcpu)) {
- context->nx = is_nx(vcpu);
- context->root_level = PT32E_ROOT_LEVEL;
- reset_rsvds_bits_mask(vcpu, context);
- context->gva_to_gpa = paging64_gva_to_gpa;
- } else {
- context->nx = false;
- context->root_level = PT32_ROOT_LEVEL;
- reset_rsvds_bits_mask(vcpu, context);
- context->gva_to_gpa = paging32_gva_to_gpa;
- }
-
- return 0;
-}
-
-int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
-{
- int r;
- bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
-
- if (!is_paging(vcpu))
- r = nonpaging_init_context(vcpu, context);
- else if (is_long_mode(vcpu))
- r = paging64_init_context(vcpu, context);
- else if (is_pae(vcpu))
- r = paging32E_init_context(vcpu, context);
- else
- r = paging32_init_context(vcpu, context);
-
- vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
- vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
- vcpu->arch.mmu.base_role.smep_andnot_wp
- = smep && !is_write_protection(vcpu);
-
- return r;
-}
-EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
-
-static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
-{
- int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
-
- vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
- vcpu->arch.walk_mmu->get_cr3 = get_cr3;
- vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read;
- vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
-
- return r;
-}
-
-static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
-
- g_context->get_cr3 = get_cr3;
- g_context->get_pdptr = kvm_pdptr_read;
- g_context->inject_page_fault = kvm_inject_page_fault;
-
- /*
- * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
- * translation of l2_gpa to l1_gpa addresses is done using the
- * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
- * functions between mmu and nested_mmu are swapped.
- */
- if (!is_paging(vcpu)) {
- g_context->nx = false;
- g_context->root_level = 0;
- g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
- } else if (is_long_mode(vcpu)) {
- g_context->nx = is_nx(vcpu);
- g_context->root_level = PT64_ROOT_LEVEL;
- reset_rsvds_bits_mask(vcpu, g_context);
- g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
- } else if (is_pae(vcpu)) {
- g_context->nx = is_nx(vcpu);
- g_context->root_level = PT32E_ROOT_LEVEL;
- reset_rsvds_bits_mask(vcpu, g_context);
- g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
- } else {
- g_context->nx = false;
- g_context->root_level = PT32_ROOT_LEVEL;
- reset_rsvds_bits_mask(vcpu, g_context);
- g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
- }
-
- return 0;
-}
-
-static int init_kvm_mmu(struct kvm_vcpu *vcpu)
-{
- if (mmu_is_nested(vcpu))
- return init_kvm_nested_mmu(vcpu);
- else if (tdp_enabled)
- return init_kvm_tdp_mmu(vcpu);
- else
- return init_kvm_softmmu(vcpu);
-}
-
-static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
- if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
- /* mmu.free() should set root_hpa = INVALID_PAGE */
- vcpu->arch.mmu.free(vcpu);
-}
-
-int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
-{
- destroy_kvm_mmu(vcpu);
- return init_kvm_mmu(vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
-
-int kvm_mmu_load(struct kvm_vcpu *vcpu)
-{
- int r;
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- goto out;
- r = mmu_alloc_roots(vcpu);
- spin_lock(&vcpu->kvm->mmu_lock);
- mmu_sync_roots(vcpu);
- spin_unlock(&vcpu->kvm->mmu_lock);
- if (r)
- goto out;
- /* set_cr3() should ensure TLB has been flushed */
- vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
-out:
- return r;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_load);
-
-void kvm_mmu_unload(struct kvm_vcpu *vcpu)
-{
- mmu_free_roots(vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_unload);
-
-static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- const void *new)
-{
- if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
- ++vcpu->kvm->stat.mmu_pde_zapped;
- return;
- }
-
- ++vcpu->kvm->stat.mmu_pte_updated;
- vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
-}
-
-static bool need_remote_flush(u64 old, u64 new)
-{
- if (!is_shadow_present_pte(old))
- return false;
- if (!is_shadow_present_pte(new))
- return true;
- if ((old ^ new) & PT64_BASE_ADDR_MASK)
- return true;
- old ^= PT64_NX_MASK;
- new ^= PT64_NX_MASK;
- return (old & ~new & PT64_PERM_MASK) != 0;
-}
-
-static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
- bool remote_flush, bool local_flush)
-{
- if (zap_page)
- return;
-
- if (remote_flush)
- kvm_flush_remote_tlbs(vcpu->kvm);
- else if (local_flush)
- kvm_mmu_flush_tlb(vcpu);
-}
-
-static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
- const u8 *new, int *bytes)
-{
- u64 gentry;
- int r;
-
- /*
- * Assume that the pte write on a page table of the same type
- * as the current vcpu paging mode since we update the sptes only
- * when they have the same mode.
- */
- if (is_pae(vcpu) && *bytes == 4) {
- /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
- *gpa &= ~(gpa_t)7;
- *bytes = 8;
- r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8));
- if (r)
- gentry = 0;
- new = (const u8 *)&gentry;
- }
-
- switch (*bytes) {
- case 4:
- gentry = *(const u32 *)new;
- break;
- case 8:
- gentry = *(const u64 *)new;
- break;
- default:
- gentry = 0;
- break;
- }
-
- return gentry;
-}
-
-/*
- * If we're seeing too many writes to a page, it may no longer be a page table,
- * or we may be forking, in which case it is better to unmap the page.
- */
-static bool detect_write_flooding(struct kvm_mmu_page *sp)
-{
- /*
- * Skip write-flooding detected for the sp whose level is 1, because
- * it can become unsync, then the guest page is not write-protected.
- */
- if (sp->role.level == 1)
- return false;
-
- return ++sp->write_flooding_count >= 3;
-}
-
-/*
- * Misaligned accesses are too much trouble to fix up; also, they usually
- * indicate a page is not used as a page table.
- */
-static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
- int bytes)
-{
- unsigned offset, pte_size, misaligned;
-
- pgprintk("misaligned: gpa %llx bytes %d role %x\n",
- gpa, bytes, sp->role.word);
-
- offset = offset_in_page(gpa);
- pte_size = sp->role.cr4_pae ? 8 : 4;
-
- /*
- * Sometimes, the OS only writes the last one bytes to update status
- * bits, for example, in linux, andb instruction is used in clear_bit().
- */
- if (!(offset & (pte_size - 1)) && bytes == 1)
- return false;
-
- misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
- misaligned |= bytes < 4;
-
- return misaligned;
-}
-
-static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
-{
- unsigned page_offset, quadrant;
- u64 *spte;
- int level;
-
- page_offset = offset_in_page(gpa);
- level = sp->role.level;
- *nspte = 1;
- if (!sp->role.cr4_pae) {
- page_offset <<= 1; /* 32->64 */
- /*
- * A 32-bit pde maps 4MB while the shadow pdes map
- * only 2MB. So we need to double the offset again
- * and zap two pdes instead of one.
- */
- if (level == PT32_ROOT_LEVEL) {
- page_offset &= ~7; /* kill rounding error */
- page_offset <<= 1;
- *nspte = 2;
- }
- quadrant = page_offset >> PAGE_SHIFT;
- page_offset &= ~PAGE_MASK;
- if (quadrant != sp->role.quadrant)
- return NULL;
- }
-
- spte = &sp->spt[page_offset / sizeof(*spte)];
- return spte;
-}
-
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes)
-{
- gfn_t gfn = gpa >> PAGE_SHIFT;
- union kvm_mmu_page_role mask = { .word = 0 };
- struct kvm_mmu_page *sp;
- struct hlist_node *node;
- LIST_HEAD(invalid_list);
- u64 entry, gentry, *spte;
- int npte;
- bool remote_flush, local_flush, zap_page;
-
- /*
- * If we don't have indirect shadow pages, it means no page is
- * write-protected, so we can exit simply.
- */
- if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
- return;
-
- zap_page = remote_flush = local_flush = false;
-
- pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
-
- gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes);
-
- /*
- * No need to care whether allocation memory is successful
- * or not since pte prefetch is skiped if it does not have
- * enough objects in the cache.
- */
- mmu_topup_memory_caches(vcpu);
-
- spin_lock(&vcpu->kvm->mmu_lock);
- ++vcpu->kvm->stat.mmu_pte_write;
- kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
-
- mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
- for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
- if (detect_write_misaligned(sp, gpa, bytes) ||
- detect_write_flooding(sp)) {
- zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
- &invalid_list);
- ++vcpu->kvm->stat.mmu_flooded;
- continue;
- }
-
- spte = get_written_sptes(sp, gpa, &npte);
- if (!spte)
- continue;
-
- local_flush = true;
- while (npte--) {
- entry = *spte;
- mmu_page_zap_pte(vcpu->kvm, sp, spte);
- if (gentry &&
- !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
- & mask.word) && rmap_can_add(vcpu))
- mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
- if (!remote_flush && need_remote_flush(entry, *spte))
- remote_flush = true;
- ++spte;
- }
- }
- mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
- spin_unlock(&vcpu->kvm->mmu_lock);
-}
-
-int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
-{
- gpa_t gpa;
- int r;
-
- if (vcpu->arch.mmu.direct_map)
- return 0;
-
- gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
-
- r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-
- return r;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
-
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-{
- LIST_HEAD(invalid_list);
-
- while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
- !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
- struct kvm_mmu_page *sp;
-
- sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
- struct kvm_mmu_page, link);
- kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
- ++vcpu->kvm->stat.mmu_recycled;
- }
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-}
-
-static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
-{
- if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu))
- return vcpu_match_mmio_gpa(vcpu, addr);
-
- return vcpu_match_mmio_gva(vcpu, addr);
-}
-
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
- void *insn, int insn_len)
-{
- int r, emulation_type = EMULTYPE_RETRY;
- enum emulation_result er;
-
- r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
- if (r < 0)
- goto out;
-
- if (!r) {
- r = 1;
- goto out;
- }
-
- if (is_mmio_page_fault(vcpu, cr2))
- emulation_type = 0;
-
- er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
-
- switch (er) {
- case EMULATE_DONE:
- return 1;
- case EMULATE_DO_MMIO:
- ++vcpu->stat.mmio_exits;
- /* fall through */
- case EMULATE_FAIL:
- return 0;
- default:
- BUG();
- }
-out:
- return r;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
-
-void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
-{
- vcpu->arch.mmu.invlpg(vcpu, gva);
- kvm_mmu_flush_tlb(vcpu);
- ++vcpu->stat.invlpg;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
-
-void kvm_enable_tdp(void)
-{
- tdp_enabled = true;
-}
-EXPORT_SYMBOL_GPL(kvm_enable_tdp);
-
-void kvm_disable_tdp(void)
-{
- tdp_enabled = false;
-}
-EXPORT_SYMBOL_GPL(kvm_disable_tdp);
-
-static void free_mmu_pages(struct kvm_vcpu *vcpu)
-{
- free_page((unsigned long)vcpu->arch.mmu.pae_root);
- if (vcpu->arch.mmu.lm_root != NULL)
- free_page((unsigned long)vcpu->arch.mmu.lm_root);
-}
-
-static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
-{
- struct page *page;
- int i;
-
- ASSERT(vcpu);
-
- /*
- * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
- * Therefore we need to allocate shadow page tables in the first
- * 4GB of memory, which happens to fit the DMA32 zone.
- */
- page = alloc_page(GFP_KERNEL | __GFP_DMA32);
- if (!page)
- return -ENOMEM;
-
- vcpu->arch.mmu.pae_root = page_address(page);
- for (i = 0; i < 4; ++i)
- vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
-
- return 0;
-}
-
-int kvm_mmu_create(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
-
- vcpu->arch.walk_mmu = &vcpu->arch.mmu;
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- vcpu->arch.mmu.translate_gpa = translate_gpa;
- vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
-
- return alloc_mmu_pages(vcpu);
-}
-
-int kvm_mmu_setup(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
-
- return init_kvm_mmu(vcpu);
-}
-
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
-{
- struct kvm_mmu_page *sp;
-
- list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
- int i;
- u64 *pt;
-
- if (!test_bit(slot, sp->slot_bitmap))
- continue;
-
- pt = sp->spt;
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (!is_shadow_present_pte(pt[i]) ||
- !is_last_spte(pt[i], sp->role.level))
- continue;
-
- if (is_large_pte(pt[i])) {
- drop_spte(kvm, &pt[i]);
- --kvm->stat.lpages;
- continue;
- }
-
- /* avoid RMW */
- if (is_writable_pte(pt[i]))
- mmu_spte_update(&pt[i],
- pt[i] & ~PT_WRITABLE_MASK);
- }
- }
- kvm_flush_remote_tlbs(kvm);
-}
-
-void kvm_mmu_zap_all(struct kvm *kvm)
-{
- struct kvm_mmu_page *sp, *node;
- LIST_HEAD(invalid_list);
-
- spin_lock(&kvm->mmu_lock);
-restart:
- list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
- if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
- goto restart;
-
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
- spin_unlock(&kvm->mmu_lock);
-}
-
-static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
- struct list_head *invalid_list)
-{
- struct kvm_mmu_page *page;
-
- page = container_of(kvm->arch.active_mmu_pages.prev,
- struct kvm_mmu_page, link);
- kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
-}
-
-static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
-{
- struct kvm *kvm;
- struct kvm *kvm_freed = NULL;
- int nr_to_scan = sc->nr_to_scan;
-
- if (nr_to_scan == 0)
- goto out;
-
- raw_spin_lock(&kvm_lock);
-
- list_for_each_entry(kvm, &vm_list, vm_list) {
- int idx;
- LIST_HEAD(invalid_list);
-
- idx = srcu_read_lock(&kvm->srcu);
- spin_lock(&kvm->mmu_lock);
- if (!kvm_freed && nr_to_scan > 0 &&
- kvm->arch.n_used_mmu_pages > 0) {
- kvm_mmu_remove_some_alloc_mmu_pages(kvm,
- &invalid_list);
- kvm_freed = kvm;
- }
- nr_to_scan--;
-
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
- spin_unlock(&kvm->mmu_lock);
- srcu_read_unlock(&kvm->srcu, idx);
- }
- if (kvm_freed)
- list_move_tail(&kvm_freed->vm_list, &vm_list);
-
- raw_spin_unlock(&kvm_lock);
-
-out:
- return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
-}
-
-static struct shrinker mmu_shrinker = {
- .shrink = mmu_shrink,
- .seeks = DEFAULT_SEEKS * 10,
-};
-
-static void mmu_destroy_caches(void)
-{
- if (pte_list_desc_cache)
- kmem_cache_destroy(pte_list_desc_cache);
- if (mmu_page_header_cache)
- kmem_cache_destroy(mmu_page_header_cache);
-}
-
-int kvm_mmu_module_init(void)
-{
- pte_list_desc_cache = kmem_cache_create("pte_list_desc",
- sizeof(struct pte_list_desc),
- 0, 0, NULL);
- if (!pte_list_desc_cache)
- goto nomem;
-
- mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
- sizeof(struct kvm_mmu_page),
- 0, 0, NULL);
- if (!mmu_page_header_cache)
- goto nomem;
-
- if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
- goto nomem;
-
- register_shrinker(&mmu_shrinker);
-
- return 0;
-
-nomem:
- mmu_destroy_caches();
- return -ENOMEM;
-}
-
-/*
- * Caculate mmu pages needed for kvm.
- */
-unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
-{
- unsigned int nr_mmu_pages;
- unsigned int nr_pages = 0;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
-
- slots = kvm_memslots(kvm);
-
- kvm_for_each_memslot(memslot, slots)
- nr_pages += memslot->npages;
-
- nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
- nr_mmu_pages = max(nr_mmu_pages,
- (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
-
- return nr_mmu_pages;
-}
-
-int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
-{
- struct kvm_shadow_walk_iterator iterator;
- u64 spte;
- int nr_sptes = 0;
-
- walk_shadow_page_lockless_begin(vcpu);
- for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
- sptes[iterator.level-1] = spte;
- nr_sptes++;
- if (!is_shadow_present_pte(spte))
- break;
- }
- walk_shadow_page_lockless_end(vcpu);
-
- return nr_sptes;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
-
-void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
-
- destroy_kvm_mmu(vcpu);
- free_mmu_pages(vcpu);
- mmu_free_memory_caches(vcpu);
-}
-
-void kvm_mmu_module_exit(void)
-{
- mmu_destroy_caches();
- percpu_counter_destroy(&kvm_total_used_mmu_pages);
- unregister_shrinker(&mmu_shrinker);
- mmu_audit_disable();
-}
diff --git a/ANDROID_3.4.5/arch/x86/kvm/mmu.h b/ANDROID_3.4.5/arch/x86/kvm/mmu.h
deleted file mode 100644
index e374db9a..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/mmu.h
+++ /dev/null
@@ -1,104 +0,0 @@
-#ifndef __KVM_X86_MMU_H
-#define __KVM_X86_MMU_H
-
-#include <linux/kvm_host.h>
-#include "kvm_cache_regs.h"
-
-#define PT64_PT_BITS 9
-#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
-#define PT32_PT_BITS 10
-#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
-
-#define PT_WRITABLE_SHIFT 1
-
-#define PT_PRESENT_MASK (1ULL << 0)
-#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
-#define PT_USER_MASK (1ULL << 2)
-#define PT_PWT_MASK (1ULL << 3)
-#define PT_PCD_MASK (1ULL << 4)
-#define PT_ACCESSED_SHIFT 5
-#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
-#define PT_DIRTY_MASK (1ULL << 6)
-#define PT_PAGE_SIZE_MASK (1ULL << 7)
-#define PT_PAT_MASK (1ULL << 7)
-#define PT_GLOBAL_MASK (1ULL << 8)
-#define PT64_NX_SHIFT 63
-#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
-
-#define PT_PAT_SHIFT 7
-#define PT_DIR_PAT_SHIFT 12
-#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
-
-#define PT32_DIR_PSE36_SIZE 4
-#define PT32_DIR_PSE36_SHIFT 13
-#define PT32_DIR_PSE36_MASK \
- (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
-
-#define PT64_ROOT_LEVEL 4
-#define PT32_ROOT_LEVEL 2
-#define PT32E_ROOT_LEVEL 3
-
-#define PT_PDPE_LEVEL 3
-#define PT_DIRECTORY_LEVEL 2
-#define PT_PAGE_TABLE_LEVEL 1
-
-#define PFERR_PRESENT_MASK (1U << 0)
-#define PFERR_WRITE_MASK (1U << 1)
-#define PFERR_USER_MASK (1U << 2)
-#define PFERR_RSVD_MASK (1U << 3)
-#define PFERR_FETCH_MASK (1U << 4)
-
-int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
-int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
-int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
-
-static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
-{
- return kvm->arch.n_max_mmu_pages -
- kvm->arch.n_used_mmu_pages;
-}
-
-static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-{
- if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES))
- __kvm_mmu_free_some_pages(vcpu);
-}
-
-static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
-{
- if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
- return 0;
-
- return kvm_mmu_load(vcpu);
-}
-
-static inline int is_present_gpte(unsigned long pte)
-{
- return pte & PT_PRESENT_MASK;
-}
-
-static inline int is_writable_pte(unsigned long pte)
-{
- return pte & PT_WRITABLE_MASK;
-}
-
-static inline bool is_write_protection(struct kvm_vcpu *vcpu)
-{
- return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
-}
-
-static inline bool check_write_user_access(struct kvm_vcpu *vcpu,
- bool write_fault, bool user_fault,
- unsigned long pte)
-{
- if (unlikely(write_fault && !is_writable_pte(pte)
- && (user_fault || is_write_protection(vcpu))))
- return false;
-
- if (unlikely(user_fault && !(pte & PT_USER_MASK)))
- return false;
-
- return true;
-}
-#endif
diff --git a/ANDROID_3.4.5/arch/x86/kvm/mmu_audit.c b/ANDROID_3.4.5/arch/x86/kvm/mmu_audit.c
deleted file mode 100644
index 715da5a1..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/mmu_audit.c
+++ /dev/null
@@ -1,303 +0,0 @@
-/*
- * mmu_audit.c:
- *
- * Audit code for KVM MMU
- *
- * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- * Marcelo Tosatti <mtosatti@redhat.com>
- * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include <linux/ratelimit.h>
-
-char const *audit_point_name[] = {
- "pre page fault",
- "post page fault",
- "pre pte write",
- "post pte write",
- "pre sync",
- "post sync"
-};
-
-#define audit_printk(kvm, fmt, args...) \
- printk(KERN_ERR "audit: (%s) error: " \
- fmt, audit_point_name[kvm->arch.audit_point], ##args)
-
-typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
-
-static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- inspect_spte_fn fn, int level)
-{
- int i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 *ent = sp->spt;
-
- fn(vcpu, ent + i, level);
-
- if (is_shadow_present_pte(ent[i]) &&
- !is_last_spte(ent[i], level)) {
- struct kvm_mmu_page *child;
-
- child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
- __mmu_spte_walk(vcpu, child, fn, level - 1);
- }
- }
-}
-
-static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
-{
- int i;
- struct kvm_mmu_page *sp;
-
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
-
- if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
-
- sp = page_header(root);
- __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
- return;
- }
-
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- if (root && VALID_PAGE(root)) {
- root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
- __mmu_spte_walk(vcpu, sp, fn, 2);
- }
- }
-
- return;
-}
-
-typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
-
-static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
-{
- struct kvm_mmu_page *sp;
-
- list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
- fn(kvm, sp);
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- struct kvm_mmu_page *sp;
- gfn_t gfn;
- pfn_t pfn;
- hpa_t hpa;
-
- sp = page_header(__pa(sptep));
-
- if (sp->unsync) {
- if (level != PT_PAGE_TABLE_LEVEL) {
- audit_printk(vcpu->kvm, "unsync sp: %p "
- "level = %d\n", sp, level);
- return;
- }
- }
-
- if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
- return;
-
- gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
- pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
-
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return;
- }
-
- hpa = pfn << PAGE_SHIFT;
- if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
- audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
- "ent %llxn", vcpu->arch.mmu.root_level, pfn,
- hpa, *sptep);
-}
-
-static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
-{
- static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
- unsigned long *rmapp;
- struct kvm_mmu_page *rev_sp;
- gfn_t gfn;
-
- rev_sp = page_header(__pa(sptep));
- gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
-
- if (!gfn_to_memslot(kvm, gfn)) {
- if (!__ratelimit(&ratelimit_state))
- return;
- audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
- audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
- (long int)(sptep - rev_sp->spt), rev_sp->gfn);
- dump_stack();
- return;
- }
-
- rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
- if (!*rmapp) {
- if (!__ratelimit(&ratelimit_state))
- return;
- audit_printk(kvm, "no rmap for writable spte %llx\n",
- *sptep);
- dump_stack();
- }
-}
-
-static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
- inspect_spte_has_rmap(vcpu->kvm, sptep);
-}
-
-static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
-
- if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
- audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
- "root.\n", sp);
-}
-
-static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- int i;
-
- if (sp->role.level != PT_PAGE_TABLE_LEVEL)
- return;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (!is_rmap_spte(sp->spt[i]))
- continue;
-
- inspect_spte_has_rmap(kvm, sp->spt + i);
- }
-}
-
-static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- struct kvm_memory_slot *slot;
- unsigned long *rmapp;
- u64 *spte;
-
- if (sp->role.direct || sp->unsync || sp->role.invalid)
- return;
-
- slot = gfn_to_memslot(kvm, sp->gfn);
- rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
-
- spte = rmap_next(rmapp, NULL);
- while (spte) {
- if (is_writable_pte(*spte))
- audit_printk(kvm, "shadow page has writable "
- "mappings: gfn %llx role %x\n",
- sp->gfn, sp->role.word);
- spte = rmap_next(rmapp, spte);
- }
-}
-
-static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- check_mappings_rmap(kvm, sp);
- audit_write_protection(kvm, sp);
-}
-
-static void audit_all_active_sps(struct kvm *kvm)
-{
- walk_all_active_sps(kvm, audit_sp);
-}
-
-static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- audit_sptes_have_rmaps(vcpu, sptep, level);
- audit_mappings(vcpu, sptep, level);
- audit_spte_after_sync(vcpu, sptep, level);
-}
-
-static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
-{
- mmu_spte_walk(vcpu, audit_spte);
-}
-
-static bool mmu_audit;
-static struct static_key mmu_audit_key;
-
-static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
-{
- static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
-
- if (!__ratelimit(&ratelimit_state))
- return;
-
- vcpu->kvm->arch.audit_point = point;
- audit_all_active_sps(vcpu->kvm);
- audit_vcpu_spte(vcpu);
-}
-
-static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
-{
- if (static_key_false((&mmu_audit_key)))
- __kvm_mmu_audit(vcpu, point);
-}
-
-static void mmu_audit_enable(void)
-{
- if (mmu_audit)
- return;
-
- static_key_slow_inc(&mmu_audit_key);
- mmu_audit = true;
-}
-
-static void mmu_audit_disable(void)
-{
- if (!mmu_audit)
- return;
-
- static_key_slow_dec(&mmu_audit_key);
- mmu_audit = false;
-}
-
-static int mmu_audit_set(const char *val, const struct kernel_param *kp)
-{
- int ret;
- unsigned long enable;
-
- ret = strict_strtoul(val, 10, &enable);
- if (ret < 0)
- return -EINVAL;
-
- switch (enable) {
- case 0:
- mmu_audit_disable();
- break;
- case 1:
- mmu_audit_enable();
- break;
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-static struct kernel_param_ops audit_param_ops = {
- .set = mmu_audit_set,
- .get = param_get_bool,
-};
-
-module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
diff --git a/ANDROID_3.4.5/arch/x86/kvm/mmutrace.h b/ANDROID_3.4.5/arch/x86/kvm/mmutrace.h
deleted file mode 100644
index 89fb0e81..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/mmutrace.h
+++ /dev/null
@@ -1,254 +0,0 @@
-#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_KVMMMU_H
-
-#include <linux/tracepoint.h>
-#include <linux/ftrace_event.h>
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM kvmmmu
-
-#define KVM_MMU_PAGE_FIELDS \
- __field(__u64, gfn) \
- __field(__u32, role) \
- __field(__u32, root_count) \
- __field(bool, unsync)
-
-#define KVM_MMU_PAGE_ASSIGN(sp) \
- __entry->gfn = sp->gfn; \
- __entry->role = sp->role.word; \
- __entry->root_count = sp->root_count; \
- __entry->unsync = sp->unsync;
-
-#define KVM_MMU_PAGE_PRINTK() ({ \
- const char *ret = p->buffer + p->len; \
- static const char *access_str[] = { \
- "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \
- }; \
- union kvm_mmu_page_role role; \
- \
- role.word = __entry->role; \
- \
- trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \
- " %snxe root %u %s%c", \
- __entry->gfn, role.level, \
- role.cr4_pae ? " pae" : "", \
- role.quadrant, \
- role.direct ? " direct" : "", \
- access_str[role.access], \
- role.invalid ? " invalid" : "", \
- role.nxe ? "" : "!", \
- __entry->root_count, \
- __entry->unsync ? "unsync" : "sync", 0); \
- ret; \
- })
-
-#define kvm_mmu_trace_pferr_flags \
- { PFERR_PRESENT_MASK, "P" }, \
- { PFERR_WRITE_MASK, "W" }, \
- { PFERR_USER_MASK, "U" }, \
- { PFERR_RSVD_MASK, "RSVD" }, \
- { PFERR_FETCH_MASK, "F" }
-
-/*
- * A pagetable walk has started
- */
-TRACE_EVENT(
- kvm_mmu_pagetable_walk,
- TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault),
- TP_ARGS(addr, write_fault, user_fault, fetch_fault),
-
- TP_STRUCT__entry(
- __field(__u64, addr)
- __field(__u32, pferr)
- ),
-
- TP_fast_assign(
- __entry->addr = addr;
- __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2)
- | (!!fetch_fault << 4);
- ),
-
- TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
- __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
-);
-
-
-/* We just walked a paging element */
-TRACE_EVENT(
- kvm_mmu_paging_element,
- TP_PROTO(u64 pte, int level),
- TP_ARGS(pte, level),
-
- TP_STRUCT__entry(
- __field(__u64, pte)
- __field(__u32, level)
- ),
-
- TP_fast_assign(
- __entry->pte = pte;
- __entry->level = level;
- ),
-
- TP_printk("pte %llx level %u", __entry->pte, __entry->level)
-);
-
-DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class,
-
- TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
-
- TP_ARGS(table_gfn, index, size),
-
- TP_STRUCT__entry(
- __field(__u64, gpa)
- ),
-
- TP_fast_assign(
- __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
- + index * size;
- ),
-
- TP_printk("gpa %llx", __entry->gpa)
-);
-
-/* We set a pte accessed bit */
-DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit,
-
- TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
-
- TP_ARGS(table_gfn, index, size)
-);
-
-/* We set a pte dirty bit */
-DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit,
-
- TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
-
- TP_ARGS(table_gfn, index, size)
-);
-
-TRACE_EVENT(
- kvm_mmu_walker_error,
- TP_PROTO(u32 pferr),
- TP_ARGS(pferr),
-
- TP_STRUCT__entry(
- __field(__u32, pferr)
- ),
-
- TP_fast_assign(
- __entry->pferr = pferr;
- ),
-
- TP_printk("pferr %x %s", __entry->pferr,
- __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
-);
-
-TRACE_EVENT(
- kvm_mmu_get_page,
- TP_PROTO(struct kvm_mmu_page *sp, bool created),
- TP_ARGS(sp, created),
-
- TP_STRUCT__entry(
- KVM_MMU_PAGE_FIELDS
- __field(bool, created)
- ),
-
- TP_fast_assign(
- KVM_MMU_PAGE_ASSIGN(sp)
- __entry->created = created;
- ),
-
- TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(),
- __entry->created ? "new" : "existing")
-);
-
-DECLARE_EVENT_CLASS(kvm_mmu_page_class,
-
- TP_PROTO(struct kvm_mmu_page *sp),
- TP_ARGS(sp),
-
- TP_STRUCT__entry(
- KVM_MMU_PAGE_FIELDS
- ),
-
- TP_fast_assign(
- KVM_MMU_PAGE_ASSIGN(sp)
- ),
-
- TP_printk("%s", KVM_MMU_PAGE_PRINTK())
-);
-
-DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page,
- TP_PROTO(struct kvm_mmu_page *sp),
-
- TP_ARGS(sp)
-);
-
-DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
- TP_PROTO(struct kvm_mmu_page *sp),
-
- TP_ARGS(sp)
-);
-
-DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
- TP_PROTO(struct kvm_mmu_page *sp),
-
- TP_ARGS(sp)
-);
-
-DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages,
- TP_PROTO(struct kvm_mmu_page *sp),
-
- TP_ARGS(sp)
-);
-
-TRACE_EVENT(
- mark_mmio_spte,
- TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
- TP_ARGS(sptep, gfn, access),
-
- TP_STRUCT__entry(
- __field(void *, sptep)
- __field(gfn_t, gfn)
- __field(unsigned, access)
- ),
-
- TP_fast_assign(
- __entry->sptep = sptep;
- __entry->gfn = gfn;
- __entry->access = access;
- ),
-
- TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn,
- __entry->access)
-);
-
-TRACE_EVENT(
- handle_mmio_page_fault,
- TP_PROTO(u64 addr, gfn_t gfn, unsigned access),
- TP_ARGS(addr, gfn, access),
-
- TP_STRUCT__entry(
- __field(u64, addr)
- __field(gfn_t, gfn)
- __field(unsigned, access)
- ),
-
- TP_fast_assign(
- __entry->addr = addr;
- __entry->gfn = gfn;
- __entry->access = access;
- ),
-
- TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
- __entry->access)
-);
-#endif /* _TRACE_KVMMMU_H */
-
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH .
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE mmutrace
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
diff --git a/ANDROID_3.4.5/arch/x86/kvm/paging_tmpl.h b/ANDROID_3.4.5/arch/x86/kvm/paging_tmpl.h
deleted file mode 100644
index df5a7031..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/paging_tmpl.h
+++ /dev/null
@@ -1,837 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * MMU support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-/*
- * We need the mmu code to access both 32-bit and 64-bit guest ptes,
- * so the code in this file is compiled twice, once per pte size.
- */
-
-#if PTTYPE == 64
- #define pt_element_t u64
- #define guest_walker guest_walker64
- #define FNAME(name) paging##64_##name
- #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
- #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
- #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
- #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
- #define PT_LEVEL_BITS PT64_LEVEL_BITS
- #ifdef CONFIG_X86_64
- #define PT_MAX_FULL_LEVELS 4
- #define CMPXCHG cmpxchg
- #else
- #define CMPXCHG cmpxchg64
- #define PT_MAX_FULL_LEVELS 2
- #endif
-#elif PTTYPE == 32
- #define pt_element_t u32
- #define guest_walker guest_walker32
- #define FNAME(name) paging##32_##name
- #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
- #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
- #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
- #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
- #define PT_LEVEL_BITS PT32_LEVEL_BITS
- #define PT_MAX_FULL_LEVELS 2
- #define CMPXCHG cmpxchg
-#else
- #error Invalid PTTYPE value
-#endif
-
-#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
-#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)
-
-/*
- * The guest_walker structure emulates the behavior of the hardware page
- * table walker.
- */
-struct guest_walker {
- int level;
- gfn_t table_gfn[PT_MAX_FULL_LEVELS];
- pt_element_t ptes[PT_MAX_FULL_LEVELS];
- pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
- gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
- unsigned pt_access;
- unsigned pte_access;
- gfn_t gfn;
- struct x86_exception fault;
-};
-
-static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
-{
- return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
-}
-
-static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- pt_element_t __user *ptep_user, unsigned index,
- pt_element_t orig_pte, pt_element_t new_pte)
-{
- int npages;
- pt_element_t ret;
- pt_element_t *table;
- struct page *page;
-
- npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
- /* Check if the user is doing something meaningless. */
- if (unlikely(npages != 1))
- return -EFAULT;
-
- table = kmap_atomic(page);
- ret = CMPXCHG(&table[index], orig_pte, new_pte);
- kunmap_atomic(table);
-
- kvm_release_page_dirty(page);
-
- return (ret != orig_pte);
-}
-
-static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
- bool last)
-{
- unsigned access;
-
- access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
- if (last && !is_dirty_gpte(gpte))
- access &= ~ACC_WRITE_MASK;
-
-#if PTTYPE == 64
- if (vcpu->arch.mmu.nx)
- access &= ~(gpte >> PT64_NX_SHIFT);
-#endif
- return access;
-}
-
-static bool FNAME(is_last_gpte)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- pt_element_t gpte)
-{
- if (walker->level == PT_PAGE_TABLE_LEVEL)
- return true;
-
- if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) &&
- (PTTYPE == 64 || is_pse(vcpu)))
- return true;
-
- if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) &&
- (mmu->root_level == PT64_ROOT_LEVEL))
- return true;
-
- return false;
-}
-
-/*
- * Fetch a guest pte for a guest virtual address
- */
-static int FNAME(walk_addr_generic)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gva_t addr, u32 access)
-{
- pt_element_t pte;
- pt_element_t __user *uninitialized_var(ptep_user);
- gfn_t table_gfn;
- unsigned index, pt_access, uninitialized_var(pte_access);
- gpa_t pte_gpa;
- bool eperm, last_gpte;
- int offset;
- const int write_fault = access & PFERR_WRITE_MASK;
- const int user_fault = access & PFERR_USER_MASK;
- const int fetch_fault = access & PFERR_FETCH_MASK;
- u16 errcode = 0;
-
- trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
- fetch_fault);
-retry_walk:
- eperm = false;
- walker->level = mmu->root_level;
- pte = mmu->get_cr3(vcpu);
-
-#if PTTYPE == 64
- if (walker->level == PT32E_ROOT_LEVEL) {
- pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
- trace_kvm_mmu_paging_element(pte, walker->level);
- if (!is_present_gpte(pte))
- goto error;
- --walker->level;
- }
-#endif
- ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
- (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
-
- pt_access = ACC_ALL;
-
- for (;;) {
- gfn_t real_gfn;
- unsigned long host_addr;
-
- index = PT_INDEX(addr, walker->level);
-
- table_gfn = gpte_to_gfn(pte);
- offset = index * sizeof(pt_element_t);
- pte_gpa = gfn_to_gpa(table_gfn) + offset;
- walker->table_gfn[walker->level - 1] = table_gfn;
- walker->pte_gpa[walker->level - 1] = pte_gpa;
-
- real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
- PFERR_USER_MASK|PFERR_WRITE_MASK);
- if (unlikely(real_gfn == UNMAPPED_GVA))
- goto error;
- real_gfn = gpa_to_gfn(real_gfn);
-
- host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
- if (unlikely(kvm_is_error_hva(host_addr)))
- goto error;
-
- ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
- if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
- goto error;
-
- trace_kvm_mmu_paging_element(pte, walker->level);
-
- if (unlikely(!is_present_gpte(pte)))
- goto error;
-
- if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
- walker->level))) {
- errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
- goto error;
- }
-
- if (!check_write_user_access(vcpu, write_fault, user_fault,
- pte))
- eperm = true;
-
-#if PTTYPE == 64
- if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
- eperm = true;
-#endif
-
- last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
- if (last_gpte) {
- pte_access = pt_access &
- FNAME(gpte_access)(vcpu, pte, true);
- /* check if the kernel is fetching from user page */
- if (unlikely(pte_access & PT_USER_MASK) &&
- kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
- if (fetch_fault && !user_fault)
- eperm = true;
- }
-
- if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
- int ret;
- trace_kvm_mmu_set_accessed_bit(table_gfn, index,
- sizeof(pte));
- ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
- pte, pte|PT_ACCESSED_MASK);
- if (unlikely(ret < 0))
- goto error;
- else if (ret)
- goto retry_walk;
-
- mark_page_dirty(vcpu->kvm, table_gfn);
- pte |= PT_ACCESSED_MASK;
- }
-
- walker->ptes[walker->level - 1] = pte;
-
- if (last_gpte) {
- int lvl = walker->level;
- gpa_t real_gpa;
- gfn_t gfn;
- u32 ac;
-
- gfn = gpte_to_gfn_lvl(pte, lvl);
- gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
-
- if (PTTYPE == 32 &&
- walker->level == PT_DIRECTORY_LEVEL &&
- is_cpuid_PSE36())
- gfn += pse36_gfn_delta(pte);
-
- ac = write_fault | fetch_fault | user_fault;
-
- real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
- ac);
- if (real_gpa == UNMAPPED_GVA)
- return 0;
-
- walker->gfn = real_gpa >> PAGE_SHIFT;
-
- break;
- }
-
- pt_access &= FNAME(gpte_access)(vcpu, pte, false);
- --walker->level;
- }
-
- if (unlikely(eperm)) {
- errcode |= PFERR_PRESENT_MASK;
- goto error;
- }
-
- if (write_fault && unlikely(!is_dirty_gpte(pte))) {
- int ret;
-
- trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
- ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
- pte, pte|PT_DIRTY_MASK);
- if (unlikely(ret < 0))
- goto error;
- else if (ret)
- goto retry_walk;
-
- mark_page_dirty(vcpu->kvm, table_gfn);
- pte |= PT_DIRTY_MASK;
- walker->ptes[walker->level - 1] = pte;
- }
-
- walker->pt_access = pt_access;
- walker->pte_access = pte_access;
- pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __func__, (u64)pte, pte_access, pt_access);
- return 1;
-
-error:
- errcode |= write_fault | user_fault;
- if (fetch_fault && (mmu->nx ||
- kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
- errcode |= PFERR_FETCH_MASK;
-
- walker->fault.vector = PF_VECTOR;
- walker->fault.error_code_valid = true;
- walker->fault.error_code = errcode;
- walker->fault.address = addr;
- walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
-
- trace_kvm_mmu_walker_error(walker->fault.error_code);
- return 0;
-}
-
-static int FNAME(walk_addr)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gva_t addr, u32 access)
-{
- return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
- access);
-}
-
-static int FNAME(walk_addr_nested)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gva_t addr,
- u32 access)
-{
- return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
- addr, access);
-}
-
-static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- pt_element_t gpte)
-{
- if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
- goto no_present;
-
- if (!is_present_gpte(gpte))
- goto no_present;
-
- if (!(gpte & PT_ACCESSED_MASK))
- goto no_present;
-
- return false;
-
-no_present:
- drop_spte(vcpu->kvm, spte);
- return true;
-}
-
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, const void *pte)
-{
- pt_element_t gpte;
- unsigned pte_access;
- pfn_t pfn;
-
- gpte = *(const pt_element_t *)pte;
- if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
- return;
-
- pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
- pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
- if (mmu_invalid_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return;
- }
-
- /*
- * we call mmu_set_spte() with host_writable = true because that
- * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
- */
- mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- NULL, PT_PAGE_TABLE_LEVEL,
- gpte_to_gfn(gpte), pfn, true, true);
-}
-
-static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
- struct guest_walker *gw, int level)
-{
- pt_element_t curr_pte;
- gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
- u64 mask;
- int r, index;
-
- if (level == PT_PAGE_TABLE_LEVEL) {
- mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
- base_gpa = pte_gpa & ~mask;
- index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
-
- r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
- gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
- curr_pte = gw->prefetch_ptes[index];
- } else
- r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
- &curr_pte, sizeof(curr_pte));
-
- return r || curr_pte != gw->ptes[level - 1];
-}
-
-static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
- u64 *sptep)
-{
- struct kvm_mmu_page *sp;
- pt_element_t *gptep = gw->prefetch_ptes;
- u64 *spte;
- int i;
-
- sp = page_header(__pa(sptep));
-
- if (sp->role.level > PT_PAGE_TABLE_LEVEL)
- return;
-
- if (sp->role.direct)
- return __direct_pte_prefetch(vcpu, sp, sptep);
-
- i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
- spte = sp->spt + i;
-
- for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
- pt_element_t gpte;
- unsigned pte_access;
- gfn_t gfn;
- pfn_t pfn;
-
- if (spte == sptep)
- continue;
-
- if (is_shadow_present_pte(*spte))
- continue;
-
- gpte = gptep[i];
-
- if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
- continue;
-
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
- true);
- gfn = gpte_to_gfn(gpte);
- pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
- pte_access & ACC_WRITE_MASK);
- if (mmu_invalid_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- break;
- }
-
- mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- NULL, PT_PAGE_TABLE_LEVEL, gfn,
- pfn, true, true);
- }
-}
-
-/*
- * Fetch a shadow pte for a specific level in the paging hierarchy.
- */
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
- struct guest_walker *gw,
- int user_fault, int write_fault, int hlevel,
- int *emulate, pfn_t pfn, bool map_writable,
- bool prefault)
-{
- unsigned access = gw->pt_access;
- struct kvm_mmu_page *sp = NULL;
- int top_level;
- unsigned direct_access;
- struct kvm_shadow_walk_iterator it;
-
- if (!is_present_gpte(gw->ptes[gw->level - 1]))
- return NULL;
-
- direct_access = gw->pte_access;
-
- top_level = vcpu->arch.mmu.root_level;
- if (top_level == PT32E_ROOT_LEVEL)
- top_level = PT32_ROOT_LEVEL;
- /*
- * Verify that the top-level gpte is still there. Since the page
- * is a root page, it is either write protected (and cannot be
- * changed from now on) or it is invalid (in which case, we don't
- * really care if it changes underneath us after this point).
- */
- if (FNAME(gpte_changed)(vcpu, gw, top_level))
- goto out_gpte_changed;
-
- for (shadow_walk_init(&it, vcpu, addr);
- shadow_walk_okay(&it) && it.level > gw->level;
- shadow_walk_next(&it)) {
- gfn_t table_gfn;
-
- clear_sp_write_flooding_count(it.sptep);
- drop_large_spte(vcpu, it.sptep);
-
- sp = NULL;
- if (!is_shadow_present_pte(*it.sptep)) {
- table_gfn = gw->table_gfn[it.level - 2];
- sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
- false, access, it.sptep);
- }
-
- /*
- * Verify that the gpte in the page we've just write
- * protected is still there.
- */
- if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
- goto out_gpte_changed;
-
- if (sp)
- link_shadow_page(it.sptep, sp);
- }
-
- for (;
- shadow_walk_okay(&it) && it.level > hlevel;
- shadow_walk_next(&it)) {
- gfn_t direct_gfn;
-
- clear_sp_write_flooding_count(it.sptep);
- validate_direct_spte(vcpu, it.sptep, direct_access);
-
- drop_large_spte(vcpu, it.sptep);
-
- if (is_shadow_present_pte(*it.sptep))
- continue;
-
- direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
-
- sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
- true, direct_access, it.sptep);
- link_shadow_page(it.sptep, sp);
- }
-
- clear_sp_write_flooding_count(it.sptep);
- mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
- user_fault, write_fault, emulate, it.level,
- gw->gfn, pfn, prefault, map_writable);
- FNAME(pte_prefetch)(vcpu, gw, it.sptep);
-
- return it.sptep;
-
-out_gpte_changed:
- if (sp)
- kvm_mmu_put_page(sp, it.sptep);
- kvm_release_pfn_clean(pfn);
- return NULL;
-}
-
-/*
- * Page fault handler. There are several causes for a page fault:
- * - there is no shadow pte for the guest pte
- * - write access through a shadow pte marked read only so that we can set
- * the dirty bit
- * - write access to a shadow pte marked read only so we can update the page
- * dirty bitmap, when userspace requests it
- * - mmio access; in this case we will never install a present shadow pte
- * - normal guest page fault due to the guest pte marked not present, not
- * writable, or not executable
- *
- * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
- * a negative value on error.
- */
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
- bool prefault)
-{
- int write_fault = error_code & PFERR_WRITE_MASK;
- int user_fault = error_code & PFERR_USER_MASK;
- struct guest_walker walker;
- u64 *sptep;
- int emulate = 0;
- int r;
- pfn_t pfn;
- int level = PT_PAGE_TABLE_LEVEL;
- int force_pt_level;
- unsigned long mmu_seq;
- bool map_writable;
-
- pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
-
- if (unlikely(error_code & PFERR_RSVD_MASK))
- return handle_mmio_page_fault(vcpu, addr, error_code,
- mmu_is_nested(vcpu));
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
- /*
- * Look up the guest pte for the faulting address.
- */
- r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
-
- /*
- * The page is not mapped by the guest. Let the guest handle it.
- */
- if (!r) {
- pgprintk("%s: guest page fault\n", __func__);
- if (!prefault)
- inject_page_fault(vcpu, &walker.fault);
-
- return 0;
- }
-
- if (walker.level >= PT_DIRECTORY_LEVEL)
- force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
- else
- force_pt_level = 1;
- if (!force_pt_level) {
- level = min(walker.level, mapping_level(vcpu, walker.gfn));
- walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
- }
-
- mmu_seq = vcpu->kvm->mmu_notifier_seq;
- smp_rmb();
-
- if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
- &map_writable))
- return 0;
-
- if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
- walker.gfn, pfn, walker.pte_access, &r))
- return r;
-
- spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu, mmu_seq))
- goto out_unlock;
-
- kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
- kvm_mmu_free_some_pages(vcpu);
- if (!force_pt_level)
- transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
- sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- level, &emulate, pfn, map_writable, prefault);
- (void)sptep;
- pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
- sptep, *sptep, emulate);
-
- ++vcpu->stat.pf_fixed;
- kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- return emulate;
-
-out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
- return 0;
-}
-
-static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
-{
- int offset = 0;
-
- WARN_ON(sp->role.level != 1);
-
- if (PTTYPE == 32)
- offset = sp->role.quadrant << PT64_LEVEL_BITS;
-
- return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
-}
-
-static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
-{
- struct kvm_shadow_walk_iterator iterator;
- struct kvm_mmu_page *sp;
- int level;
- u64 *sptep;
-
- vcpu_clear_mmio_info(vcpu, gva);
-
- /*
- * No need to check return value here, rmap_can_add() can
- * help us to skip pte prefetch later.
- */
- mmu_topup_memory_caches(vcpu);
-
- spin_lock(&vcpu->kvm->mmu_lock);
- for_each_shadow_entry(vcpu, gva, iterator) {
- level = iterator.level;
- sptep = iterator.sptep;
-
- sp = page_header(__pa(sptep));
- if (is_last_spte(*sptep, level)) {
- pt_element_t gpte;
- gpa_t pte_gpa;
-
- if (!sp->unsync)
- break;
-
- pte_gpa = FNAME(get_level1_sp_gpa)(sp);
- pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
-
- if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
- kvm_flush_remote_tlbs(vcpu->kvm);
-
- if (!rmap_can_add(vcpu))
- break;
-
- if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
- sizeof(pt_element_t)))
- break;
-
- FNAME(update_pte)(vcpu, sp, sptep, &gpte);
- }
-
- if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
- break;
- }
- spin_unlock(&vcpu->kvm->mmu_lock);
-}
-
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
- struct x86_exception *exception)
-{
- struct guest_walker walker;
- gpa_t gpa = UNMAPPED_GVA;
- int r;
-
- r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);
-
- if (r) {
- gpa = gfn_to_gpa(walker.gfn);
- gpa |= vaddr & ~PAGE_MASK;
- } else if (exception)
- *exception = walker.fault;
-
- return gpa;
-}
-
-static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
- u32 access,
- struct x86_exception *exception)
-{
- struct guest_walker walker;
- gpa_t gpa = UNMAPPED_GVA;
- int r;
-
- r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
-
- if (r) {
- gpa = gfn_to_gpa(walker.gfn);
- gpa |= vaddr & ~PAGE_MASK;
- } else if (exception)
- *exception = walker.fault;
-
- return gpa;
-}
-
-/*
- * Using the cached information from sp->gfns is safe because:
- * - The spte has a reference to the struct page, so the pfn for a given gfn
- * can't change unless all sptes pointing to it are nuked first.
- *
- * Note:
- * We should flush all tlbs if spte is dropped even though guest is
- * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page
- * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
- * used by guest then tlbs are not flushed, so guest is allowed to access the
- * freed pages.
- * And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
- */
-static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
-{
- int i, nr_present = 0;
- bool host_writable;
- gpa_t first_pte_gpa;
-
- /* direct kvm_mmu_page can not be unsync. */
- BUG_ON(sp->role.direct);
-
- first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
-
- for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
- unsigned pte_access;
- pt_element_t gpte;
- gpa_t pte_gpa;
- gfn_t gfn;
-
- if (!sp->spt[i])
- continue;
-
- pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
-
- if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
- sizeof(pt_element_t)))
- return -EINVAL;
-
- if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
- vcpu->kvm->tlbs_dirty++;
- continue;
- }
-
- gfn = gpte_to_gfn(gpte);
- pte_access = sp->role.access;
- pte_access &= FNAME(gpte_access)(vcpu, gpte, true);
-
- if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
- continue;
-
- if (gfn != sp->gfns[i]) {
- drop_spte(vcpu->kvm, &sp->spt[i]);
- vcpu->kvm->tlbs_dirty++;
- continue;
- }
-
- nr_present++;
-
- host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
-
- set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
- PT_PAGE_TABLE_LEVEL, gfn,
- spte_to_pfn(sp->spt[i]), true, false,
- host_writable);
- }
-
- return !nr_present;
-}
-
-#undef pt_element_t
-#undef guest_walker
-#undef FNAME
-#undef PT_BASE_ADDR_MASK
-#undef PT_INDEX
-#undef PT_LVL_ADDR_MASK
-#undef PT_LVL_OFFSET_MASK
-#undef PT_LEVEL_BITS
-#undef PT_MAX_FULL_LEVELS
-#undef gpte_to_gfn
-#undef gpte_to_gfn_lvl
-#undef CMPXCHG
diff --git a/ANDROID_3.4.5/arch/x86/kvm/pmu.c b/ANDROID_3.4.5/arch/x86/kvm/pmu.c
deleted file mode 100644
index 2e88438f..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/pmu.c
+++ /dev/null
@@ -1,537 +0,0 @@
-/*
- * Kernel-based Virtual Machine -- Performane Monitoring Unit support
- *
- * Copyright 2011 Red Hat, Inc. and/or its affiliates.
- *
- * Authors:
- * Avi Kivity <avi@redhat.com>
- * Gleb Natapov <gleb@redhat.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include <linux/types.h>
-#include <linux/kvm_host.h>
-#include <linux/perf_event.h>
-#include "x86.h"
-#include "cpuid.h"
-#include "lapic.h"
-
-static struct kvm_arch_event_perf_mapping {
- u8 eventsel;
- u8 unit_mask;
- unsigned event_type;
- bool inexact;
-} arch_events[] = {
- /* Index must match CPUID 0x0A.EBX bit vector */
- [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
- [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
- [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
- [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
- [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
- [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
- [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
- [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
-};
-
-/* mapping between fixed pmc index and arch_events array */
-int fixed_pmc_events[] = {1, 0, 7};
-
-static bool pmc_is_gp(struct kvm_pmc *pmc)
-{
- return pmc->type == KVM_PMC_GP;
-}
-
-static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
-{
- struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
-
- return pmu->counter_bitmask[pmc->type];
-}
-
-static inline bool pmc_enabled(struct kvm_pmc *pmc)
-{
- struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
- return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
-}
-
-static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
- u32 base)
-{
- if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
- return &pmu->gp_counters[msr - base];
- return NULL;
-}
-
-static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
-{
- int base = MSR_CORE_PERF_FIXED_CTR0;
- if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
- return &pmu->fixed_counters[msr - base];
- return NULL;
-}
-
-static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx)
-{
- return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + idx);
-}
-
-static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx)
-{
- if (idx < X86_PMC_IDX_FIXED)
- return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0);
- else
- return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED);
-}
-
-void kvm_deliver_pmi(struct kvm_vcpu *vcpu)
-{
- if (vcpu->arch.apic)
- kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
-}
-
-static void trigger_pmi(struct irq_work *irq_work)
-{
- struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu,
- irq_work);
- struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu,
- arch.pmu);
-
- kvm_deliver_pmi(vcpu);
-}
-
-static void kvm_perf_overflow(struct perf_event *perf_event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
-{
- struct kvm_pmc *pmc = perf_event->overflow_handler_context;
- struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
- __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
-}
-
-static void kvm_perf_overflow_intr(struct perf_event *perf_event,
- struct perf_sample_data *data, struct pt_regs *regs)
-{
- struct kvm_pmc *pmc = perf_event->overflow_handler_context;
- struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
- if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
- kvm_perf_overflow(perf_event, data, regs);
- kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
- /*
- * Inject PMI. If vcpu was in a guest mode during NMI PMI
- * can be ejected on a guest mode re-entry. Otherwise we can't
- * be sure that vcpu wasn't executing hlt instruction at the
- * time of vmexit and is not going to re-enter guest mode until,
- * woken up. So we should wake it, but this is impossible from
- * NMI context. Do it from irq work instead.
- */
- if (!kvm_is_in_guest())
- irq_work_queue(&pmc->vcpu->arch.pmu.irq_work);
- else
- kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
- }
-}
-
-static u64 read_pmc(struct kvm_pmc *pmc)
-{
- u64 counter, enabled, running;
-
- counter = pmc->counter;
-
- if (pmc->perf_event)
- counter += perf_event_read_value(pmc->perf_event,
- &enabled, &running);
-
- /* FIXME: Scaling needed? */
-
- return counter & pmc_bitmask(pmc);
-}
-
-static void stop_counter(struct kvm_pmc *pmc)
-{
- if (pmc->perf_event) {
- pmc->counter = read_pmc(pmc);
- perf_event_release_kernel(pmc->perf_event);
- pmc->perf_event = NULL;
- }
-}
-
-static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
- unsigned config, bool exclude_user, bool exclude_kernel,
- bool intr)
-{
- struct perf_event *event;
- struct perf_event_attr attr = {
- .type = type,
- .size = sizeof(attr),
- .pinned = true,
- .exclude_idle = true,
- .exclude_host = 1,
- .exclude_user = exclude_user,
- .exclude_kernel = exclude_kernel,
- .config = config,
- };
-
- attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
-
- event = perf_event_create_kernel_counter(&attr, -1, current,
- intr ? kvm_perf_overflow_intr :
- kvm_perf_overflow, pmc);
- if (IS_ERR(event)) {
- printk_once("kvm: pmu event creation failed %ld\n",
- PTR_ERR(event));
- return;
- }
-
- pmc->perf_event = event;
- clear_bit(pmc->idx, (unsigned long*)&pmc->vcpu->arch.pmu.reprogram_pmi);
-}
-
-static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select,
- u8 unit_mask)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(arch_events); i++)
- if (arch_events[i].eventsel == event_select
- && arch_events[i].unit_mask == unit_mask
- && (pmu->available_event_types & (1 << i)))
- break;
-
- if (i == ARRAY_SIZE(arch_events))
- return PERF_COUNT_HW_MAX;
-
- return arch_events[i].event_type;
-}
-
-static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
-{
- unsigned config, type = PERF_TYPE_RAW;
- u8 event_select, unit_mask;
-
- if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
- printk_once("kvm pmu: pin control bit is ignored\n");
-
- pmc->eventsel = eventsel;
-
- stop_counter(pmc);
-
- if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_enabled(pmc))
- return;
-
- event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
- unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
-
- if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
- ARCH_PERFMON_EVENTSEL_INV |
- ARCH_PERFMON_EVENTSEL_CMASK))) {
- config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
- unit_mask);
- if (config != PERF_COUNT_HW_MAX)
- type = PERF_TYPE_HARDWARE;
- }
-
- if (type == PERF_TYPE_RAW)
- config = eventsel & X86_RAW_EVENT_MASK;
-
- reprogram_counter(pmc, type, config,
- !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
- !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
- eventsel & ARCH_PERFMON_EVENTSEL_INT);
-}
-
-static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
-{
- unsigned en = en_pmi & 0x3;
- bool pmi = en_pmi & 0x8;
-
- stop_counter(pmc);
-
- if (!en || !pmc_enabled(pmc))
- return;
-
- reprogram_counter(pmc, PERF_TYPE_HARDWARE,
- arch_events[fixed_pmc_events[idx]].event_type,
- !(en & 0x2), /* exclude user */
- !(en & 0x1), /* exclude kernel */
- pmi);
-}
-
-static inline u8 fixed_en_pmi(u64 ctrl, int idx)
-{
- return (ctrl >> (idx * 4)) & 0xf;
-}
-
-static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
-{
- int i;
-
- for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
- u8 en_pmi = fixed_en_pmi(data, i);
- struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i);
-
- if (fixed_en_pmi(pmu->fixed_ctr_ctrl, i) == en_pmi)
- continue;
-
- reprogram_fixed_counter(pmc, en_pmi, i);
- }
-
- pmu->fixed_ctr_ctrl = data;
-}
-
-static void reprogram_idx(struct kvm_pmu *pmu, int idx)
-{
- struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx);
-
- if (!pmc)
- return;
-
- if (pmc_is_gp(pmc))
- reprogram_gp_counter(pmc, pmc->eventsel);
- else {
- int fidx = idx - X86_PMC_IDX_FIXED;
- reprogram_fixed_counter(pmc,
- fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx);
- }
-}
-
-static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
-{
- int bit;
- u64 diff = pmu->global_ctrl ^ data;
-
- pmu->global_ctrl = data;
-
- for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
- reprogram_idx(pmu, bit);
-}
-
-bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr)
-{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- int ret;
-
- switch (msr) {
- case MSR_CORE_PERF_FIXED_CTR_CTRL:
- case MSR_CORE_PERF_GLOBAL_STATUS:
- case MSR_CORE_PERF_GLOBAL_CTRL:
- case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- ret = pmu->version > 1;
- break;
- default:
- ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)
- || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0)
- || get_fixed_pmc(pmu, msr);
- break;
- }
- return ret;
-}
-
-int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
-{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- struct kvm_pmc *pmc;
-
- switch (index) {
- case MSR_CORE_PERF_FIXED_CTR_CTRL:
- *data = pmu->fixed_ctr_ctrl;
- return 0;
- case MSR_CORE_PERF_GLOBAL_STATUS:
- *data = pmu->global_status;
- return 0;
- case MSR_CORE_PERF_GLOBAL_CTRL:
- *data = pmu->global_ctrl;
- return 0;
- case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- *data = pmu->global_ovf_ctrl;
- return 0;
- default:
- if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
- (pmc = get_fixed_pmc(pmu, index))) {
- *data = read_pmc(pmc);
- return 0;
- } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
- *data = pmc->eventsel;
- return 0;
- }
- }
- return 1;
-}
-
-int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
-{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- struct kvm_pmc *pmc;
-
- switch (index) {
- case MSR_CORE_PERF_FIXED_CTR_CTRL:
- if (pmu->fixed_ctr_ctrl == data)
- return 0;
- if (!(data & 0xfffffffffffff444ull)) {
- reprogram_fixed_counters(pmu, data);
- return 0;
- }
- break;
- case MSR_CORE_PERF_GLOBAL_STATUS:
- break; /* RO MSR */
- case MSR_CORE_PERF_GLOBAL_CTRL:
- if (pmu->global_ctrl == data)
- return 0;
- if (!(data & pmu->global_ctrl_mask)) {
- global_ctrl_changed(pmu, data);
- return 0;
- }
- break;
- case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
- pmu->global_status &= ~data;
- pmu->global_ovf_ctrl = data;
- return 0;
- }
- break;
- default:
- if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
- (pmc = get_fixed_pmc(pmu, index))) {
- data = (s64)(s32)data;
- pmc->counter += data - read_pmc(pmc);
- return 0;
- } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
- if (data == pmc->eventsel)
- return 0;
- if (!(data & 0xffffffff00200000ull)) {
- reprogram_gp_counter(pmc, data);
- return 0;
- }
- }
- }
- return 1;
-}
-
-int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
-{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- bool fast_mode = pmc & (1u << 31);
- bool fixed = pmc & (1u << 30);
- struct kvm_pmc *counters;
- u64 ctr;
-
- pmc &= ~(3u << 30);
- if (!fixed && pmc >= pmu->nr_arch_gp_counters)
- return 1;
- if (fixed && pmc >= pmu->nr_arch_fixed_counters)
- return 1;
- counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
- ctr = read_pmc(&counters[pmc]);
- if (fast_mode)
- ctr = (u32)ctr;
- *data = ctr;
-
- return 0;
-}
-
-void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
-{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- struct kvm_cpuid_entry2 *entry;
- unsigned bitmap_len;
-
- pmu->nr_arch_gp_counters = 0;
- pmu->nr_arch_fixed_counters = 0;
- pmu->counter_bitmask[KVM_PMC_GP] = 0;
- pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
- pmu->version = 0;
-
- entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
- if (!entry)
- return;
-
- pmu->version = entry->eax & 0xff;
- if (!pmu->version)
- return;
-
- pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff,
- X86_PMC_MAX_GENERIC);
- pmu->counter_bitmask[KVM_PMC_GP] =
- ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1;
- bitmap_len = (entry->eax >> 24) & 0xff;
- pmu->available_event_types = ~entry->ebx & ((1ull << bitmap_len) - 1);
-
- if (pmu->version == 1) {
- pmu->nr_arch_fixed_counters = 0;
- } else {
- pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f),
- X86_PMC_MAX_FIXED);
- pmu->counter_bitmask[KVM_PMC_FIXED] =
- ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1;
- }
-
- pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
- (((1ull << pmu->nr_arch_fixed_counters) - 1) << X86_PMC_IDX_FIXED);
- pmu->global_ctrl_mask = ~pmu->global_ctrl;
-}
-
-void kvm_pmu_init(struct kvm_vcpu *vcpu)
-{
- int i;
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
-
- memset(pmu, 0, sizeof(*pmu));
- for (i = 0; i < X86_PMC_MAX_GENERIC; i++) {
- pmu->gp_counters[i].type = KVM_PMC_GP;
- pmu->gp_counters[i].vcpu = vcpu;
- pmu->gp_counters[i].idx = i;
- }
- for (i = 0; i < X86_PMC_MAX_FIXED; i++) {
- pmu->fixed_counters[i].type = KVM_PMC_FIXED;
- pmu->fixed_counters[i].vcpu = vcpu;
- pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED;
- }
- init_irq_work(&pmu->irq_work, trigger_pmi);
- kvm_pmu_cpuid_update(vcpu);
-}
-
-void kvm_pmu_reset(struct kvm_vcpu *vcpu)
-{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- int i;
-
- irq_work_sync(&pmu->irq_work);
- for (i = 0; i < X86_PMC_MAX_GENERIC; i++) {
- struct kvm_pmc *pmc = &pmu->gp_counters[i];
- stop_counter(pmc);
- pmc->counter = pmc->eventsel = 0;
- }
-
- for (i = 0; i < X86_PMC_MAX_FIXED; i++)
- stop_counter(&pmu->fixed_counters[i]);
-
- pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
- pmu->global_ovf_ctrl = 0;
-}
-
-void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
-{
- kvm_pmu_reset(vcpu);
-}
-
-void kvm_handle_pmu_event(struct kvm_vcpu *vcpu)
-{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- u64 bitmask;
- int bit;
-
- bitmask = pmu->reprogram_pmi;
-
- for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
- struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit);
-
- if (unlikely(!pmc || !pmc->perf_event)) {
- clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
- continue;
- }
-
- reprogram_idx(pmu, bit);
- }
-}
diff --git a/ANDROID_3.4.5/arch/x86/kvm/svm.c b/ANDROID_3.4.5/arch/x86/kvm/svm.c
deleted file mode 100644
index e334389e..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/svm.c
+++ /dev/null
@@ -1,4336 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * AMD SVM support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-#include <linux/kvm_host.h>
-
-#include "irq.h"
-#include "mmu.h"
-#include "kvm_cache_regs.h"
-#include "x86.h"
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
-#include <linux/sched.h>
-#include <linux/ftrace_event.h>
-#include <linux/slab.h>
-
-#include <asm/perf_event.h>
-#include <asm/tlbflush.h>
-#include <asm/desc.h>
-#include <asm/kvm_para.h>
-
-#include <asm/virtext.h>
-#include "trace.h"
-
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
-
-MODULE_AUTHOR("Qumranet");
-MODULE_LICENSE("GPL");
-
-#define IOPM_ALLOC_ORDER 2
-#define MSRPM_ALLOC_ORDER 1
-
-#define SEG_TYPE_LDT 2
-#define SEG_TYPE_BUSY_TSS16 3
-
-#define SVM_FEATURE_NPT (1 << 0)
-#define SVM_FEATURE_LBRV (1 << 1)
-#define SVM_FEATURE_SVML (1 << 2)
-#define SVM_FEATURE_NRIP (1 << 3)
-#define SVM_FEATURE_TSC_RATE (1 << 4)
-#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
-#define SVM_FEATURE_FLUSH_ASID (1 << 6)
-#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
-#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
-
-#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
-#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
-#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */
-
-#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
-
-#define TSC_RATIO_RSVD 0xffffff0000000000ULL
-#define TSC_RATIO_MIN 0x0000000000000001ULL
-#define TSC_RATIO_MAX 0x000000ffffffffffULL
-
-static bool erratum_383_found __read_mostly;
-
-static const u32 host_save_user_msrs[] = {
-#ifdef CONFIG_X86_64
- MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
- MSR_FS_BASE,
-#endif
- MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
-};
-
-#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-
-struct kvm_vcpu;
-
-struct nested_state {
- struct vmcb *hsave;
- u64 hsave_msr;
- u64 vm_cr_msr;
- u64 vmcb;
-
- /* These are the merged vectors */
- u32 *msrpm;
-
- /* gpa pointers to the real vectors */
- u64 vmcb_msrpm;
- u64 vmcb_iopm;
-
- /* A VMEXIT is required but not yet emulated */
- bool exit_required;
-
- /* cache for intercepts of the guest */
- u32 intercept_cr;
- u32 intercept_dr;
- u32 intercept_exceptions;
- u64 intercept;
-
- /* Nested Paging related state */
- u64 nested_cr3;
-};
-
-#define MSRPM_OFFSETS 16
-static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
-
-/*
- * Set osvw_len to higher value when updated Revision Guides
- * are published and we know what the new status bits are
- */
-static uint64_t osvw_len = 4, osvw_status;
-
-struct vcpu_svm {
- struct kvm_vcpu vcpu;
- struct vmcb *vmcb;
- unsigned long vmcb_pa;
- struct svm_cpu_data *svm_data;
- uint64_t asid_generation;
- uint64_t sysenter_esp;
- uint64_t sysenter_eip;
-
- u64 next_rip;
-
- u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
- struct {
- u16 fs;
- u16 gs;
- u16 ldt;
- u64 gs_base;
- } host;
-
- u32 *msrpm;
-
- ulong nmi_iret_rip;
-
- struct nested_state nested;
-
- bool nmi_singlestep;
-
- unsigned int3_injected;
- unsigned long int3_rip;
- u32 apf_reason;
-
- u64 tsc_ratio;
-};
-
-static DEFINE_PER_CPU(u64, current_tsc_ratio);
-#define TSC_RATIO_DEFAULT 0x0100000000ULL
-
-#define MSR_INVALID 0xffffffffU
-
-static struct svm_direct_access_msrs {
- u32 index; /* Index of the MSR */
- bool always; /* True if intercept is always on */
-} direct_access_msrs[] = {
- { .index = MSR_STAR, .always = true },
- { .index = MSR_IA32_SYSENTER_CS, .always = true },
-#ifdef CONFIG_X86_64
- { .index = MSR_GS_BASE, .always = true },
- { .index = MSR_FS_BASE, .always = true },
- { .index = MSR_KERNEL_GS_BASE, .always = true },
- { .index = MSR_LSTAR, .always = true },
- { .index = MSR_CSTAR, .always = true },
- { .index = MSR_SYSCALL_MASK, .always = true },
-#endif
- { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
- { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
- { .index = MSR_IA32_LASTINTFROMIP, .always = false },
- { .index = MSR_IA32_LASTINTTOIP, .always = false },
- { .index = MSR_INVALID, .always = false },
-};
-
-/* enable NPT for AMD64 and X86 with PAE */
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-static bool npt_enabled = true;
-#else
-static bool npt_enabled;
-#endif
-
-/* allow nested paging (virtualized MMU) for all guests */
-static int npt = true;
-module_param(npt, int, S_IRUGO);
-
-/* allow nested virtualization in KVM/SVM */
-static int nested = true;
-module_param(nested, int, S_IRUGO);
-
-static void svm_flush_tlb(struct kvm_vcpu *vcpu);
-static void svm_complete_interrupts(struct vcpu_svm *svm);
-
-static int nested_svm_exit_handled(struct vcpu_svm *svm);
-static int nested_svm_intercept(struct vcpu_svm *svm);
-static int nested_svm_vmexit(struct vcpu_svm *svm);
-static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
- bool has_error_code, u32 error_code);
-static u64 __scale_tsc(u64 ratio, u64 tsc);
-
-enum {
- VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
- pause filter count */
- VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
- VMCB_ASID, /* ASID */
- VMCB_INTR, /* int_ctl, int_vector */
- VMCB_NPT, /* npt_en, nCR3, gPAT */
- VMCB_CR, /* CR0, CR3, CR4, EFER */
- VMCB_DR, /* DR6, DR7 */
- VMCB_DT, /* GDT, IDT */
- VMCB_SEG, /* CS, DS, SS, ES, CPL */
- VMCB_CR2, /* CR2 only */
- VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
- VMCB_DIRTY_MAX,
-};
-
-/* TPR and CR2 are always written before VMRUN */
-#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
-
-static inline void mark_all_dirty(struct vmcb *vmcb)
-{
- vmcb->control.clean = 0;
-}
-
-static inline void mark_all_clean(struct vmcb *vmcb)
-{
- vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
- & ~VMCB_ALWAYS_DIRTY_MASK;
-}
-
-static inline void mark_dirty(struct vmcb *vmcb, int bit)
-{
- vmcb->control.clean &= ~(1 << bit);
-}
-
-static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
-{
- return container_of(vcpu, struct vcpu_svm, vcpu);
-}
-
-static void recalc_intercepts(struct vcpu_svm *svm)
-{
- struct vmcb_control_area *c, *h;
- struct nested_state *g;
-
- mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
-
- if (!is_guest_mode(&svm->vcpu))
- return;
-
- c = &svm->vmcb->control;
- h = &svm->nested.hsave->control;
- g = &svm->nested;
-
- c->intercept_cr = h->intercept_cr | g->intercept_cr;
- c->intercept_dr = h->intercept_dr | g->intercept_dr;
- c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
- c->intercept = h->intercept | g->intercept;
-}
-
-static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
-{
- if (is_guest_mode(&svm->vcpu))
- return svm->nested.hsave;
- else
- return svm->vmcb;
-}
-
-static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_cr |= (1U << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_cr &= ~(1U << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- return vmcb->control.intercept_cr & (1U << bit);
-}
-
-static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_dr |= (1U << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_dr &= ~(1U << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_exceptions |= (1U << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_exceptions &= ~(1U << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline void set_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept |= (1ULL << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline void clr_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept &= ~(1ULL << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline void enable_gif(struct vcpu_svm *svm)
-{
- svm->vcpu.arch.hflags |= HF_GIF_MASK;
-}
-
-static inline void disable_gif(struct vcpu_svm *svm)
-{
- svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
-}
-
-static inline bool gif_set(struct vcpu_svm *svm)
-{
- return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
-}
-
-static unsigned long iopm_base;
-
-struct kvm_ldttss_desc {
- u16 limit0;
- u16 base0;
- unsigned base1:8, type:5, dpl:2, p:1;
- unsigned limit1:4, zero0:3, g:1, base2:8;
- u32 base3;
- u32 zero1;
-} __attribute__((packed));
-
-struct svm_cpu_data {
- int cpu;
-
- u64 asid_generation;
- u32 max_asid;
- u32 next_asid;
- struct kvm_ldttss_desc *tss_desc;
-
- struct page *save_area;
-};
-
-static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
-
-struct svm_init_data {
- int cpu;
- int r;
-};
-
-static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
-
-#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
-#define MSRS_RANGE_SIZE 2048
-#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
-
-static u32 svm_msrpm_offset(u32 msr)
-{
- u32 offset;
- int i;
-
- for (i = 0; i < NUM_MSR_MAPS; i++) {
- if (msr < msrpm_ranges[i] ||
- msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
- continue;
-
- offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
- offset += (i * MSRS_RANGE_SIZE); /* add range offset */
-
- /* Now we have the u8 offset - but need the u32 offset */
- return offset / 4;
- }
-
- /* MSR not in any range */
- return MSR_INVALID;
-}
-
-#define MAX_INST_SIZE 15
-
-static inline void clgi(void)
-{
- asm volatile (__ex(SVM_CLGI));
-}
-
-static inline void stgi(void)
-{
- asm volatile (__ex(SVM_STGI));
-}
-
-static inline void invlpga(unsigned long addr, u32 asid)
-{
- asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
-}
-
-static int get_npt_level(void)
-{
-#ifdef CONFIG_X86_64
- return PT64_ROOT_LEVEL;
-#else
- return PT32E_ROOT_LEVEL;
-#endif
-}
-
-static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
- vcpu->arch.efer = efer;
- if (!npt_enabled && !(efer & EFER_LMA))
- efer &= ~EFER_LME;
-
- to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
- mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
-}
-
-static int is_external_interrupt(u32 info)
-{
- info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
- return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
-}
-
-static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u32 ret = 0;
-
- if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
- ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
- return ret & mask;
-}
-
-static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (mask == 0)
- svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
- else
- svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
-
-}
-
-static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (svm->vmcb->control.next_rip != 0)
- svm->next_rip = svm->vmcb->control.next_rip;
-
- if (!svm->next_rip) {
- if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
- EMULATE_DONE)
- printk(KERN_DEBUG "%s: NOP\n", __func__);
- return;
- }
- if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
- printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
- __func__, kvm_rip_read(vcpu), svm->next_rip);
-
- kvm_rip_write(vcpu, svm->next_rip);
- svm_set_interrupt_shadow(vcpu, 0);
-}
-
-static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
- bool has_error_code, u32 error_code,
- bool reinject)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- /*
- * If we are within a nested VM we'd better #VMEXIT and let the guest
- * handle the exception
- */
- if (!reinject &&
- nested_svm_check_exception(svm, nr, has_error_code, error_code))
- return;
-
- if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
- unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
-
- /*
- * For guest debugging where we have to reinject #BP if some
- * INT3 is guest-owned:
- * Emulate nRIP by moving RIP forward. Will fail if injection
- * raises a fault that is not intercepted. Still better than
- * failing in all cases.
- */
- skip_emulated_instruction(&svm->vcpu);
- rip = kvm_rip_read(&svm->vcpu);
- svm->int3_rip = rip + svm->vmcb->save.cs.base;
- svm->int3_injected = rip - old_rip;
- }
-
- svm->vmcb->control.event_inj = nr
- | SVM_EVTINJ_VALID
- | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
- | SVM_EVTINJ_TYPE_EXEPT;
- svm->vmcb->control.event_inj_err = error_code;
-}
-
-static void svm_init_erratum_383(void)
-{
- u32 low, high;
- int err;
- u64 val;
-
- if (!cpu_has_amd_erratum(amd_erratum_383))
- return;
-
- /* Use _safe variants to not break nested virtualization */
- val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
- if (err)
- return;
-
- val |= (1ULL << 47);
-
- low = lower_32_bits(val);
- high = upper_32_bits(val);
-
- native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
-
- erratum_383_found = true;
-}
-
-static void svm_init_osvw(struct kvm_vcpu *vcpu)
-{
- /*
- * Guests should see errata 400 and 415 as fixed (assuming that
- * HLT and IO instructions are intercepted).
- */
- vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
- vcpu->arch.osvw.status = osvw_status & ~(6ULL);
-
- /*
- * By increasing VCPU's osvw.length to 3 we are telling the guest that
- * all osvw.status bits inside that length, including bit 0 (which is
- * reserved for erratum 298), are valid. However, if host processor's
- * osvw_len is 0 then osvw_status[0] carries no information. We need to
- * be conservative here and therefore we tell the guest that erratum 298
- * is present (because we really don't know).
- */
- if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
- vcpu->arch.osvw.status |= 1;
-}
-
-static int has_svm(void)
-{
- const char *msg;
-
- if (!cpu_has_svm(&msg)) {
- printk(KERN_INFO "has_svm: %s\n", msg);
- return 0;
- }
-
- return 1;
-}
-
-static void svm_hardware_disable(void *garbage)
-{
- /* Make sure we clean up behind us */
- if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
- wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
-
- cpu_svm_disable();
-
- amd_pmu_disable_virt();
-}
-
-static int svm_hardware_enable(void *garbage)
-{
-
- struct svm_cpu_data *sd;
- uint64_t efer;
- struct desc_ptr gdt_descr;
- struct desc_struct *gdt;
- int me = raw_smp_processor_id();
-
- rdmsrl(MSR_EFER, efer);
- if (efer & EFER_SVME)
- return -EBUSY;
-
- if (!has_svm()) {
- printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n",
- me);
- return -EINVAL;
- }
- sd = per_cpu(svm_data, me);
-
- if (!sd) {
- printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
- me);
- return -EINVAL;
- }
-
- sd->asid_generation = 1;
- sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
- sd->next_asid = sd->max_asid + 1;
-
- native_store_gdt(&gdt_descr);
- gdt = (struct desc_struct *)gdt_descr.address;
- sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
-
- wrmsrl(MSR_EFER, efer | EFER_SVME);
-
- wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
-
- if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
- wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
- __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
- }
-
-
- /*
- * Get OSVW bits.
- *
- * Note that it is possible to have a system with mixed processor
- * revisions and therefore different OSVW bits. If bits are not the same
- * on different processors then choose the worst case (i.e. if erratum
- * is present on one processor and not on another then assume that the
- * erratum is present everywhere).
- */
- if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
- uint64_t len, status = 0;
- int err;
-
- len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
- if (!err)
- status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
- &err);
-
- if (err)
- osvw_status = osvw_len = 0;
- else {
- if (len < osvw_len)
- osvw_len = len;
- osvw_status |= status;
- osvw_status &= (1ULL << osvw_len) - 1;
- }
- } else
- osvw_status = osvw_len = 0;
-
- svm_init_erratum_383();
-
- amd_pmu_enable_virt();
-
- return 0;
-}
-
-static void svm_cpu_uninit(int cpu)
-{
- struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
-
- if (!sd)
- return;
-
- per_cpu(svm_data, raw_smp_processor_id()) = NULL;
- __free_page(sd->save_area);
- kfree(sd);
-}
-
-static int svm_cpu_init(int cpu)
-{
- struct svm_cpu_data *sd;
- int r;
-
- sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
- if (!sd)
- return -ENOMEM;
- sd->cpu = cpu;
- sd->save_area = alloc_page(GFP_KERNEL);
- r = -ENOMEM;
- if (!sd->save_area)
- goto err_1;
-
- per_cpu(svm_data, cpu) = sd;
-
- return 0;
-
-err_1:
- kfree(sd);
- return r;
-
-}
-
-static bool valid_msr_intercept(u32 index)
-{
- int i;
-
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
- if (direct_access_msrs[i].index == index)
- return true;
-
- return false;
-}
-
-static void set_msr_interception(u32 *msrpm, unsigned msr,
- int read, int write)
-{
- u8 bit_read, bit_write;
- unsigned long tmp;
- u32 offset;
-
- /*
- * If this warning triggers extend the direct_access_msrs list at the
- * beginning of the file
- */
- WARN_ON(!valid_msr_intercept(msr));
-
- offset = svm_msrpm_offset(msr);
- bit_read = 2 * (msr & 0x0f);
- bit_write = 2 * (msr & 0x0f) + 1;
- tmp = msrpm[offset];
-
- BUG_ON(offset == MSR_INVALID);
-
- read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
- write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
-
- msrpm[offset] = tmp;
-}
-
-static void svm_vcpu_init_msrpm(u32 *msrpm)
-{
- int i;
-
- memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
-
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
- if (!direct_access_msrs[i].always)
- continue;
-
- set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
- }
-}
-
-static void add_msr_offset(u32 offset)
-{
- int i;
-
- for (i = 0; i < MSRPM_OFFSETS; ++i) {
-
- /* Offset already in list? */
- if (msrpm_offsets[i] == offset)
- return;
-
- /* Slot used by another offset? */
- if (msrpm_offsets[i] != MSR_INVALID)
- continue;
-
- /* Add offset to list */
- msrpm_offsets[i] = offset;
-
- return;
- }
-
- /*
- * If this BUG triggers the msrpm_offsets table has an overflow. Just
- * increase MSRPM_OFFSETS in this case.
- */
- BUG();
-}
-
-static void init_msrpm_offsets(void)
-{
- int i;
-
- memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
-
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
- u32 offset;
-
- offset = svm_msrpm_offset(direct_access_msrs[i].index);
- BUG_ON(offset == MSR_INVALID);
-
- add_msr_offset(offset);
- }
-}
-
-static void svm_enable_lbrv(struct vcpu_svm *svm)
-{
- u32 *msrpm = svm->msrpm;
-
- svm->vmcb->control.lbr_ctl = 1;
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
-}
-
-static void svm_disable_lbrv(struct vcpu_svm *svm)
-{
- u32 *msrpm = svm->msrpm;
-
- svm->vmcb->control.lbr_ctl = 0;
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
- set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
- set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
-}
-
-static __init int svm_hardware_setup(void)
-{
- int cpu;
- struct page *iopm_pages;
- void *iopm_va;
- int r;
-
- iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
-
- if (!iopm_pages)
- return -ENOMEM;
-
- iopm_va = page_address(iopm_pages);
- memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
- iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
-
- init_msrpm_offsets();
-
- if (boot_cpu_has(X86_FEATURE_NX))
- kvm_enable_efer_bits(EFER_NX);
-
- if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
- kvm_enable_efer_bits(EFER_FFXSR);
-
- if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
- u64 max;
-
- kvm_has_tsc_control = true;
-
- /*
- * Make sure the user can only configure tsc_khz values that
- * fit into a signed integer.
- * A min value is not calculated needed because it will always
- * be 1 on all machines and a value of 0 is used to disable
- * tsc-scaling for the vcpu.
- */
- max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX));
-
- kvm_max_guest_tsc_khz = max;
- }
-
- if (nested) {
- printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
- kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
- }
-
- for_each_possible_cpu(cpu) {
- r = svm_cpu_init(cpu);
- if (r)
- goto err;
- }
-
- if (!boot_cpu_has(X86_FEATURE_NPT))
- npt_enabled = false;
-
- if (npt_enabled && !npt) {
- printk(KERN_INFO "kvm: Nested Paging disabled\n");
- npt_enabled = false;
- }
-
- if (npt_enabled) {
- printk(KERN_INFO "kvm: Nested Paging enabled\n");
- kvm_enable_tdp();
- } else
- kvm_disable_tdp();
-
- return 0;
-
-err:
- __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
- iopm_base = 0;
- return r;
-}
-
-static __exit void svm_hardware_unsetup(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- svm_cpu_uninit(cpu);
-
- __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
- iopm_base = 0;
-}
-
-static void init_seg(struct vmcb_seg *seg)
-{
- seg->selector = 0;
- seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
- SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
- seg->limit = 0xffff;
- seg->base = 0;
-}
-
-static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
-{
- seg->selector = 0;
- seg->attrib = SVM_SELECTOR_P_MASK | type;
- seg->limit = 0xffff;
- seg->base = 0;
-}
-
-static u64 __scale_tsc(u64 ratio, u64 tsc)
-{
- u64 mult, frac, _tsc;
-
- mult = ratio >> 32;
- frac = ratio & ((1ULL << 32) - 1);
-
- _tsc = tsc;
- _tsc *= mult;
- _tsc += (tsc >> 32) * frac;
- _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32;
-
- return _tsc;
-}
-
-static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u64 _tsc = tsc;
-
- if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
- _tsc = __scale_tsc(svm->tsc_ratio, tsc);
-
- return _tsc;
-}
-
-static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u64 ratio;
- u64 khz;
-
- /* Guest TSC same frequency as host TSC? */
- if (!scale) {
- svm->tsc_ratio = TSC_RATIO_DEFAULT;
- return;
- }
-
- /* TSC scaling supported? */
- if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
- if (user_tsc_khz > tsc_khz) {
- vcpu->arch.tsc_catchup = 1;
- vcpu->arch.tsc_always_catchup = 1;
- } else
- WARN(1, "user requested TSC rate below hardware speed\n");
- return;
- }
-
- khz = user_tsc_khz;
-
- /* TSC scaling required - calculate ratio */
- ratio = khz << 32;
- do_div(ratio, tsc_khz);
-
- if (ratio == 0 || ratio & TSC_RATIO_RSVD) {
- WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n",
- user_tsc_khz);
- return;
- }
- svm->tsc_ratio = ratio;
-}
-
-static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u64 g_tsc_offset = 0;
-
- if (is_guest_mode(vcpu)) {
- g_tsc_offset = svm->vmcb->control.tsc_offset -
- svm->nested.hsave->control.tsc_offset;
- svm->nested.hsave->control.tsc_offset = offset;
- }
-
- svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
-
- mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
-}
-
-static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- WARN_ON(adjustment < 0);
- if (host)
- adjustment = svm_scale_tsc(vcpu, adjustment);
-
- svm->vmcb->control.tsc_offset += adjustment;
- if (is_guest_mode(vcpu))
- svm->nested.hsave->control.tsc_offset += adjustment;
- mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
-}
-
-static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
-{
- u64 tsc;
-
- tsc = svm_scale_tsc(vcpu, native_read_tsc());
-
- return target_tsc - tsc;
-}
-
-static void init_vmcb(struct vcpu_svm *svm)
-{
- struct vmcb_control_area *control = &svm->vmcb->control;
- struct vmcb_save_area *save = &svm->vmcb->save;
-
- svm->vcpu.fpu_active = 1;
- svm->vcpu.arch.hflags = 0;
-
- set_cr_intercept(svm, INTERCEPT_CR0_READ);
- set_cr_intercept(svm, INTERCEPT_CR3_READ);
- set_cr_intercept(svm, INTERCEPT_CR4_READ);
- set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
- set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
- set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
- set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
-
- set_dr_intercept(svm, INTERCEPT_DR0_READ);
- set_dr_intercept(svm, INTERCEPT_DR1_READ);
- set_dr_intercept(svm, INTERCEPT_DR2_READ);
- set_dr_intercept(svm, INTERCEPT_DR3_READ);
- set_dr_intercept(svm, INTERCEPT_DR4_READ);
- set_dr_intercept(svm, INTERCEPT_DR5_READ);
- set_dr_intercept(svm, INTERCEPT_DR6_READ);
- set_dr_intercept(svm, INTERCEPT_DR7_READ);
-
- set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
-
- set_exception_intercept(svm, PF_VECTOR);
- set_exception_intercept(svm, UD_VECTOR);
- set_exception_intercept(svm, MC_VECTOR);
-
- set_intercept(svm, INTERCEPT_INTR);
- set_intercept(svm, INTERCEPT_NMI);
- set_intercept(svm, INTERCEPT_SMI);
- set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
- set_intercept(svm, INTERCEPT_RDPMC);
- set_intercept(svm, INTERCEPT_CPUID);
- set_intercept(svm, INTERCEPT_INVD);
- set_intercept(svm, INTERCEPT_HLT);
- set_intercept(svm, INTERCEPT_INVLPG);
- set_intercept(svm, INTERCEPT_INVLPGA);
- set_intercept(svm, INTERCEPT_IOIO_PROT);
- set_intercept(svm, INTERCEPT_MSR_PROT);
- set_intercept(svm, INTERCEPT_TASK_SWITCH);
- set_intercept(svm, INTERCEPT_SHUTDOWN);
- set_intercept(svm, INTERCEPT_VMRUN);
- set_intercept(svm, INTERCEPT_VMMCALL);
- set_intercept(svm, INTERCEPT_VMLOAD);
- set_intercept(svm, INTERCEPT_VMSAVE);
- set_intercept(svm, INTERCEPT_STGI);
- set_intercept(svm, INTERCEPT_CLGI);
- set_intercept(svm, INTERCEPT_SKINIT);
- set_intercept(svm, INTERCEPT_WBINVD);
- set_intercept(svm, INTERCEPT_MONITOR);
- set_intercept(svm, INTERCEPT_MWAIT);
- set_intercept(svm, INTERCEPT_XSETBV);
-
- control->iopm_base_pa = iopm_base;
- control->msrpm_base_pa = __pa(svm->msrpm);
- control->int_ctl = V_INTR_MASKING_MASK;
-
- init_seg(&save->es);
- init_seg(&save->ss);
- init_seg(&save->ds);
- init_seg(&save->fs);
- init_seg(&save->gs);
-
- save->cs.selector = 0xf000;
- /* Executable/Readable Code Segment */
- save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
- SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
- save->cs.limit = 0xffff;
- /*
- * cs.base should really be 0xffff0000, but vmx can't handle that, so
- * be consistent with it.
- *
- * Replace when we have real mode working for vmx.
- */
- save->cs.base = 0xf0000;
-
- save->gdtr.limit = 0xffff;
- save->idtr.limit = 0xffff;
-
- init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
- init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
-
- svm_set_efer(&svm->vcpu, 0);
- save->dr6 = 0xffff0ff0;
- save->dr7 = 0x400;
- kvm_set_rflags(&svm->vcpu, 2);
- save->rip = 0x0000fff0;
- svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
-
- /*
- * This is the guest-visible cr0 value.
- * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
- */
- svm->vcpu.arch.cr0 = 0;
- (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
-
- save->cr4 = X86_CR4_PAE;
- /* rdx = ?? */
-
- if (npt_enabled) {
- /* Setup VMCB for Nested Paging */
- control->nested_ctl = 1;
- clr_intercept(svm, INTERCEPT_INVLPG);
- clr_exception_intercept(svm, PF_VECTOR);
- clr_cr_intercept(svm, INTERCEPT_CR3_READ);
- clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
- save->g_pat = 0x0007040600070406ULL;
- save->cr3 = 0;
- save->cr4 = 0;
- }
- svm->asid_generation = 0;
-
- svm->nested.vmcb = 0;
- svm->vcpu.arch.hflags = 0;
-
- if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
- control->pause_filter_count = 3000;
- set_intercept(svm, INTERCEPT_PAUSE);
- }
-
- mark_all_dirty(svm->vmcb);
-
- enable_gif(svm);
-}
-
-static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- init_vmcb(svm);
-
- if (!kvm_vcpu_is_bsp(vcpu)) {
- kvm_rip_write(vcpu, 0);
- svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
- svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
- }
- vcpu->arch.regs_avail = ~0;
- vcpu->arch.regs_dirty = ~0;
-
- return 0;
-}
-
-static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
-{
- struct vcpu_svm *svm;
- struct page *page;
- struct page *msrpm_pages;
- struct page *hsave_page;
- struct page *nested_msrpm_pages;
- int err;
-
- svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
- if (!svm) {
- err = -ENOMEM;
- goto out;
- }
-
- svm->tsc_ratio = TSC_RATIO_DEFAULT;
-
- err = kvm_vcpu_init(&svm->vcpu, kvm, id);
- if (err)
- goto free_svm;
-
- err = -ENOMEM;
- page = alloc_page(GFP_KERNEL);
- if (!page)
- goto uninit;
-
- msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
- if (!msrpm_pages)
- goto free_page1;
-
- nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
- if (!nested_msrpm_pages)
- goto free_page2;
-
- hsave_page = alloc_page(GFP_KERNEL);
- if (!hsave_page)
- goto free_page3;
-
- svm->nested.hsave = page_address(hsave_page);
-
- svm->msrpm = page_address(msrpm_pages);
- svm_vcpu_init_msrpm(svm->msrpm);
-
- svm->nested.msrpm = page_address(nested_msrpm_pages);
- svm_vcpu_init_msrpm(svm->nested.msrpm);
-
- svm->vmcb = page_address(page);
- clear_page(svm->vmcb);
- svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
- svm->asid_generation = 0;
- init_vmcb(svm);
- kvm_write_tsc(&svm->vcpu, 0);
-
- err = fx_init(&svm->vcpu);
- if (err)
- goto free_page4;
-
- svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_bsp(&svm->vcpu))
- svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
-
- svm_init_osvw(&svm->vcpu);
-
- return &svm->vcpu;
-
-free_page4:
- __free_page(hsave_page);
-free_page3:
- __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
-free_page2:
- __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
-free_page1:
- __free_page(page);
-uninit:
- kvm_vcpu_uninit(&svm->vcpu);
-free_svm:
- kmem_cache_free(kvm_vcpu_cache, svm);
-out:
- return ERR_PTR(err);
-}
-
-static void svm_free_vcpu(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
- __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
- __free_page(virt_to_page(svm->nested.hsave));
- __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
- kvm_vcpu_uninit(vcpu);
- kmem_cache_free(kvm_vcpu_cache, svm);
-}
-
-static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- int i;
-
- if (unlikely(cpu != vcpu->cpu)) {
- svm->asid_generation = 0;
- mark_all_dirty(svm->vmcb);
- }
-
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
-#endif
- savesegment(fs, svm->host.fs);
- savesegment(gs, svm->host.gs);
- svm->host.ldt = kvm_read_ldt();
-
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-
- if (static_cpu_has(X86_FEATURE_TSCRATEMSR) &&
- svm->tsc_ratio != __get_cpu_var(current_tsc_ratio)) {
- __get_cpu_var(current_tsc_ratio) = svm->tsc_ratio;
- wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio);
- }
-}
-
-static void svm_vcpu_put(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- int i;
-
- ++vcpu->stat.host_state_reload;
- kvm_load_ldt(svm->host.ldt);
-#ifdef CONFIG_X86_64
- loadsegment(fs, svm->host.fs);
- wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
- load_gs_index(svm->host.gs);
-#else
-#ifdef CONFIG_X86_32_LAZY_GS
- loadsegment(gs, svm->host.gs);
-#endif
-#endif
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-}
-
-static void svm_update_cpl(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- int cpl;
-
- if (!is_protmode(vcpu))
- cpl = 0;
- else if (svm->vmcb->save.rflags & X86_EFLAGS_VM)
- cpl = 3;
- else
- cpl = svm->vmcb->save.cs.selector & 0x3;
-
- svm->vmcb->save.cpl = cpl;
-}
-
-static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
-{
- return to_svm(vcpu)->vmcb->save.rflags;
-}
-
-static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
- unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags;
-
- to_svm(vcpu)->vmcb->save.rflags = rflags;
- if ((old_rflags ^ rflags) & X86_EFLAGS_VM)
- svm_update_cpl(vcpu);
-}
-
-static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
-{
- switch (reg) {
- case VCPU_EXREG_PDPTR:
- BUG_ON(!npt_enabled);
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
- break;
- default:
- BUG();
- }
-}
-
-static void svm_set_vintr(struct vcpu_svm *svm)
-{
- set_intercept(svm, INTERCEPT_VINTR);
-}
-
-static void svm_clear_vintr(struct vcpu_svm *svm)
-{
- clr_intercept(svm, INTERCEPT_VINTR);
-}
-
-static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
-{
- struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
-
- switch (seg) {
- case VCPU_SREG_CS: return &save->cs;
- case VCPU_SREG_DS: return &save->ds;
- case VCPU_SREG_ES: return &save->es;
- case VCPU_SREG_FS: return &save->fs;
- case VCPU_SREG_GS: return &save->gs;
- case VCPU_SREG_SS: return &save->ss;
- case VCPU_SREG_TR: return &save->tr;
- case VCPU_SREG_LDTR: return &save->ldtr;
- }
- BUG();
- return NULL;
-}
-
-static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
- struct vmcb_seg *s = svm_seg(vcpu, seg);
-
- return s->base;
-}
-
-static void svm_get_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- struct vmcb_seg *s = svm_seg(vcpu, seg);
-
- var->base = s->base;
- var->limit = s->limit;
- var->selector = s->selector;
- var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
- var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
- var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
- var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
- var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
- var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
- var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
- var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
-
- /*
- * AMD's VMCB does not have an explicit unusable field, so emulate it
- * for cross vendor migration purposes by "not present"
- */
- var->unusable = !var->present || (var->type == 0);
-
- switch (seg) {
- case VCPU_SREG_CS:
- /*
- * SVM always stores 0 for the 'G' bit in the CS selector in
- * the VMCB on a VMEXIT. This hurts cross-vendor migration:
- * Intel's VMENTRY has a check on the 'G' bit.
- */
- var->g = s->limit > 0xfffff;
- break;
- case VCPU_SREG_TR:
- /*
- * Work around a bug where the busy flag in the tr selector
- * isn't exposed
- */
- var->type |= 0x2;
- break;
- case VCPU_SREG_DS:
- case VCPU_SREG_ES:
- case VCPU_SREG_FS:
- case VCPU_SREG_GS:
- /*
- * The accessed bit must always be set in the segment
- * descriptor cache, although it can be cleared in the
- * descriptor, the cached bit always remains at 1. Since
- * Intel has a check on this, set it here to support
- * cross-vendor migration.
- */
- if (!var->unusable)
- var->type |= 0x1;
- break;
- case VCPU_SREG_SS:
- /*
- * On AMD CPUs sometimes the DB bit in the segment
- * descriptor is left as 1, although the whole segment has
- * been made unusable. Clear it here to pass an Intel VMX
- * entry check when cross vendor migrating.
- */
- if (var->unusable)
- var->db = 0;
- break;
- }
-}
-
-static int svm_get_cpl(struct kvm_vcpu *vcpu)
-{
- struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
-
- return save->cpl;
-}
-
-static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- dt->size = svm->vmcb->save.idtr.limit;
- dt->address = svm->vmcb->save.idtr.base;
-}
-
-static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->save.idtr.limit = dt->size;
- svm->vmcb->save.idtr.base = dt->address ;
- mark_dirty(svm->vmcb, VMCB_DT);
-}
-
-static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- dt->size = svm->vmcb->save.gdtr.limit;
- dt->address = svm->vmcb->save.gdtr.base;
-}
-
-static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->save.gdtr.limit = dt->size;
- svm->vmcb->save.gdtr.base = dt->address ;
- mark_dirty(svm->vmcb, VMCB_DT);
-}
-
-static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
-{
-}
-
-static void svm_decache_cr3(struct kvm_vcpu *vcpu)
-{
-}
-
-static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
-{
-}
-
-static void update_cr0_intercept(struct vcpu_svm *svm)
-{
- ulong gcr0 = svm->vcpu.arch.cr0;
- u64 *hcr0 = &svm->vmcb->save.cr0;
-
- if (!svm->vcpu.fpu_active)
- *hcr0 |= SVM_CR0_SELECTIVE_MASK;
- else
- *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
- | (gcr0 & SVM_CR0_SELECTIVE_MASK);
-
- mark_dirty(svm->vmcb, VMCB_CR);
-
- if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
- clr_cr_intercept(svm, INTERCEPT_CR0_READ);
- clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
- } else {
- set_cr_intercept(svm, INTERCEPT_CR0_READ);
- set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
- }
-}
-
-static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
-#ifdef CONFIG_X86_64
- if (vcpu->arch.efer & EFER_LME) {
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
- vcpu->arch.efer |= EFER_LMA;
- svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
- }
-
- if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
- vcpu->arch.efer &= ~EFER_LMA;
- svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
- }
- }
-#endif
- vcpu->arch.cr0 = cr0;
-
- if (!npt_enabled)
- cr0 |= X86_CR0_PG | X86_CR0_WP;
-
- if (!vcpu->fpu_active)
- cr0 |= X86_CR0_TS;
- /*
- * re-enable caching here because the QEMU bios
- * does not do it - this results in some delay at
- * reboot
- */
- cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
- svm->vmcb->save.cr0 = cr0;
- mark_dirty(svm->vmcb, VMCB_CR);
- update_cr0_intercept(svm);
-}
-
-static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
- unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
- unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
-
- if (cr4 & X86_CR4_VMXE)
- return 1;
-
- if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
- svm_flush_tlb(vcpu);
-
- vcpu->arch.cr4 = cr4;
- if (!npt_enabled)
- cr4 |= X86_CR4_PAE;
- cr4 |= host_cr4_mce;
- to_svm(vcpu)->vmcb->save.cr4 = cr4;
- mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
- return 0;
-}
-
-static void svm_set_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb_seg *s = svm_seg(vcpu, seg);
-
- s->base = var->base;
- s->limit = var->limit;
- s->selector = var->selector;
- if (var->unusable)
- s->attrib = 0;
- else {
- s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
- s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
- s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
- s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
- s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
- s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
- s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
- s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
- }
- if (seg == VCPU_SREG_CS)
- svm_update_cpl(vcpu);
-
- mark_dirty(svm->vmcb, VMCB_SEG);
-}
-
-static void update_db_intercept(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- clr_exception_intercept(svm, DB_VECTOR);
- clr_exception_intercept(svm, BP_VECTOR);
-
- if (svm->nmi_singlestep)
- set_exception_intercept(svm, DB_VECTOR);
-
- if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
- if (vcpu->guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- set_exception_intercept(svm, DB_VECTOR);
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
- set_exception_intercept(svm, BP_VECTOR);
- } else
- vcpu->guest_debug = 0;
-}
-
-static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
- else
- svm->vmcb->save.dr7 = vcpu->arch.dr7;
-
- mark_dirty(svm->vmcb, VMCB_DR);
-
- update_db_intercept(vcpu);
-}
-
-static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
-{
- if (sd->next_asid > sd->max_asid) {
- ++sd->asid_generation;
- sd->next_asid = 1;
- svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
- }
-
- svm->asid_generation = sd->asid_generation;
- svm->vmcb->control.asid = sd->next_asid++;
-
- mark_dirty(svm->vmcb, VMCB_ASID);
-}
-
-static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->save.dr7 = value;
- mark_dirty(svm->vmcb, VMCB_DR);
-}
-
-static int pf_interception(struct vcpu_svm *svm)
-{
- u64 fault_address = svm->vmcb->control.exit_info_2;
- u32 error_code;
- int r = 1;
-
- switch (svm->apf_reason) {
- default:
- error_code = svm->vmcb->control.exit_info_1;
-
- trace_kvm_page_fault(fault_address, error_code);
- if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
- kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
- r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
- svm->vmcb->control.insn_bytes,
- svm->vmcb->control.insn_len);
- break;
- case KVM_PV_REASON_PAGE_NOT_PRESENT:
- svm->apf_reason = 0;
- local_irq_disable();
- kvm_async_pf_task_wait(fault_address);
- local_irq_enable();
- break;
- case KVM_PV_REASON_PAGE_READY:
- svm->apf_reason = 0;
- local_irq_disable();
- kvm_async_pf_task_wake(fault_address);
- local_irq_enable();
- break;
- }
- return r;
-}
-
-static int db_interception(struct vcpu_svm *svm)
-{
- struct kvm_run *kvm_run = svm->vcpu.run;
-
- if (!(svm->vcpu.guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
- !svm->nmi_singlestep) {
- kvm_queue_exception(&svm->vcpu, DB_VECTOR);
- return 1;
- }
-
- if (svm->nmi_singlestep) {
- svm->nmi_singlestep = false;
- if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
- svm->vmcb->save.rflags &=
- ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
- update_db_intercept(&svm->vcpu);
- }
-
- if (svm->vcpu.guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- kvm_run->debug.arch.pc =
- svm->vmcb->save.cs.base + svm->vmcb->save.rip;
- kvm_run->debug.arch.exception = DB_VECTOR;
- return 0;
- }
-
- return 1;
-}
-
-static int bp_interception(struct vcpu_svm *svm)
-{
- struct kvm_run *kvm_run = svm->vcpu.run;
-
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
- kvm_run->debug.arch.exception = BP_VECTOR;
- return 0;
-}
-
-static int ud_interception(struct vcpu_svm *svm)
-{
- int er;
-
- er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
- if (er != EMULATE_DONE)
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
-}
-
-static void svm_fpu_activate(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- clr_exception_intercept(svm, NM_VECTOR);
-
- svm->vcpu.fpu_active = 1;
- update_cr0_intercept(svm);
-}
-
-static int nm_interception(struct vcpu_svm *svm)
-{
- svm_fpu_activate(&svm->vcpu);
- return 1;
-}
-
-static bool is_erratum_383(void)
-{
- int err, i;
- u64 value;
-
- if (!erratum_383_found)
- return false;
-
- value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
- if (err)
- return false;
-
- /* Bit 62 may or may not be set for this mce */
- value &= ~(1ULL << 62);
-
- if (value != 0xb600000000010015ULL)
- return false;
-
- /* Clear MCi_STATUS registers */
- for (i = 0; i < 6; ++i)
- native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
-
- value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
- if (!err) {
- u32 low, high;
-
- value &= ~(1ULL << 2);
- low = lower_32_bits(value);
- high = upper_32_bits(value);
-
- native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
- }
-
- /* Flush tlb to evict multi-match entries */
- __flush_tlb_all();
-
- return true;
-}
-
-static void svm_handle_mce(struct vcpu_svm *svm)
-{
- if (is_erratum_383()) {
- /*
- * Erratum 383 triggered. Guest state is corrupt so kill the
- * guest.
- */
- pr_err("KVM: Guest triggered AMD Erratum 383\n");
-
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
-
- return;
- }
-
- /*
- * On an #MC intercept the MCE handler is not called automatically in
- * the host. So do it by hand here.
- */
- asm volatile (
- "int $0x12\n");
- /* not sure if we ever come back to this point */
-
- return;
-}
-
-static int mc_interception(struct vcpu_svm *svm)
-{
- return 1;
-}
-
-static int shutdown_interception(struct vcpu_svm *svm)
-{
- struct kvm_run *kvm_run = svm->vcpu.run;
-
- /*
- * VMCB is undefined after a SHUTDOWN intercept
- * so reinitialize it.
- */
- clear_page(svm->vmcb);
- init_vmcb(svm);
-
- kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
- return 0;
-}
-
-static int io_interception(struct vcpu_svm *svm)
-{
- struct kvm_vcpu *vcpu = &svm->vcpu;
- u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
- int size, in, string;
- unsigned port;
-
- ++svm->vcpu.stat.io_exits;
- string = (io_info & SVM_IOIO_STR_MASK) != 0;
- in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
- if (string || in)
- return emulate_instruction(vcpu, 0) == EMULATE_DONE;
-
- port = io_info >> 16;
- size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
- svm->next_rip = svm->vmcb->control.exit_info_2;
- skip_emulated_instruction(&svm->vcpu);
-
- return kvm_fast_pio_out(vcpu, size, port);
-}
-
-static int nmi_interception(struct vcpu_svm *svm)
-{
- return 1;
-}
-
-static int intr_interception(struct vcpu_svm *svm)
-{
- ++svm->vcpu.stat.irq_exits;
- return 1;
-}
-
-static int nop_on_interception(struct vcpu_svm *svm)
-{
- return 1;
-}
-
-static int halt_interception(struct vcpu_svm *svm)
-{
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
- skip_emulated_instruction(&svm->vcpu);
- return kvm_emulate_halt(&svm->vcpu);
-}
-
-static int vmmcall_interception(struct vcpu_svm *svm)
-{
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
- kvm_emulate_hypercall(&svm->vcpu);
- return 1;
-}
-
-static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- return svm->nested.nested_cr3;
-}
-
-static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u64 cr3 = svm->nested.nested_cr3;
- u64 pdpte;
- int ret;
-
- ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte,
- offset_in_page(cr3) + index * 8, 8);
- if (ret)
- return 0;
- return pdpte;
-}
-
-static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
- unsigned long root)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.nested_cr3 = root;
- mark_dirty(svm->vmcb, VMCB_NPT);
- svm_flush_tlb(vcpu);
-}
-
-static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
- struct x86_exception *fault)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.exit_code = SVM_EXIT_NPF;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = fault->error_code;
- svm->vmcb->control.exit_info_2 = fault->address;
-
- nested_svm_vmexit(svm);
-}
-
-static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
-{
- int r;
-
- r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
-
- vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
- vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
- vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
- vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
- vcpu->arch.mmu.shadow_root_level = get_npt_level();
- vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
-
- return r;
-}
-
-static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.walk_mmu = &vcpu->arch.mmu;
-}
-
-static int nested_svm_check_permissions(struct vcpu_svm *svm)
-{
- if (!(svm->vcpu.arch.efer & EFER_SVME)
- || !is_paging(&svm->vcpu)) {
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
- }
-
- if (svm->vmcb->save.cpl) {
- kvm_inject_gp(&svm->vcpu, 0);
- return 1;
- }
-
- return 0;
-}
-
-static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
- bool has_error_code, u32 error_code)
-{
- int vmexit;
-
- if (!is_guest_mode(&svm->vcpu))
- return 0;
-
- svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = error_code;
- svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
-
- vmexit = nested_svm_intercept(svm);
- if (vmexit == NESTED_EXIT_DONE)
- svm->nested.exit_required = true;
-
- return vmexit;
-}
-
-/* This function returns true if it is save to enable the irq window */
-static inline bool nested_svm_intr(struct vcpu_svm *svm)
-{
- if (!is_guest_mode(&svm->vcpu))
- return true;
-
- if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
- return true;
-
- if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
- return false;
-
- /*
- * if vmexit was already requested (by intercepted exception
- * for instance) do not overwrite it with "external interrupt"
- * vmexit.
- */
- if (svm->nested.exit_required)
- return false;
-
- svm->vmcb->control.exit_code = SVM_EXIT_INTR;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- if (svm->nested.intercept & 1ULL) {
- /*
- * The #vmexit can't be emulated here directly because this
- * code path runs with irqs and preemtion disabled. A
- * #vmexit emulation might sleep. Only signal request for
- * the #vmexit here.
- */
- svm->nested.exit_required = true;
- trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
- return false;
- }
-
- return true;
-}
-
-/* This function returns true if it is save to enable the nmi window */
-static inline bool nested_svm_nmi(struct vcpu_svm *svm)
-{
- if (!is_guest_mode(&svm->vcpu))
- return true;
-
- if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
- return true;
-
- svm->vmcb->control.exit_code = SVM_EXIT_NMI;
- svm->nested.exit_required = true;
-
- return false;
-}
-
-static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
-{
- struct page *page;
-
- might_sleep();
-
- page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
- if (is_error_page(page))
- goto error;
-
- *_page = page;
-
- return kmap(page);
-
-error:
- kvm_release_page_clean(page);
- kvm_inject_gp(&svm->vcpu, 0);
-
- return NULL;
-}
-
-static void nested_svm_unmap(struct page *page)
-{
- kunmap(page);
- kvm_release_page_dirty(page);
-}
-
-static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
-{
- unsigned port;
- u8 val, bit;
- u64 gpa;
-
- if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
- return NESTED_EXIT_HOST;
-
- port = svm->vmcb->control.exit_info_1 >> 16;
- gpa = svm->nested.vmcb_iopm + (port / 8);
- bit = port % 8;
- val = 0;
-
- if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
- val &= (1 << bit);
-
- return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
-}
-
-static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
-{
- u32 offset, msr, value;
- int write, mask;
-
- if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
- return NESTED_EXIT_HOST;
-
- msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- offset = svm_msrpm_offset(msr);
- write = svm->vmcb->control.exit_info_1 & 1;
- mask = 1 << ((2 * (msr & 0xf)) + write);
-
- if (offset == MSR_INVALID)
- return NESTED_EXIT_DONE;
-
- /* Offset is in 32 bit units but need in 8 bit units */
- offset *= 4;
-
- if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
- return NESTED_EXIT_DONE;
-
- return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
-}
-
-static int nested_svm_exit_special(struct vcpu_svm *svm)
-{
- u32 exit_code = svm->vmcb->control.exit_code;
-
- switch (exit_code) {
- case SVM_EXIT_INTR:
- case SVM_EXIT_NMI:
- case SVM_EXIT_EXCP_BASE + MC_VECTOR:
- return NESTED_EXIT_HOST;
- case SVM_EXIT_NPF:
- /* For now we are always handling NPFs when using them */
- if (npt_enabled)
- return NESTED_EXIT_HOST;
- break;
- case SVM_EXIT_EXCP_BASE + PF_VECTOR:
- /* When we're shadowing, trap PFs, but not async PF */
- if (!npt_enabled && svm->apf_reason == 0)
- return NESTED_EXIT_HOST;
- break;
- case SVM_EXIT_EXCP_BASE + NM_VECTOR:
- nm_interception(svm);
- break;
- default:
- break;
- }
-
- return NESTED_EXIT_CONTINUE;
-}
-
-/*
- * If this function returns true, this #vmexit was already handled
- */
-static int nested_svm_intercept(struct vcpu_svm *svm)
-{
- u32 exit_code = svm->vmcb->control.exit_code;
- int vmexit = NESTED_EXIT_HOST;
-
- switch (exit_code) {
- case SVM_EXIT_MSR:
- vmexit = nested_svm_exit_handled_msr(svm);
- break;
- case SVM_EXIT_IOIO:
- vmexit = nested_svm_intercept_ioio(svm);
- break;
- case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
- u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
- if (svm->nested.intercept_cr & bit)
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
- u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
- if (svm->nested.intercept_dr & bit)
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
- u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
- if (svm->nested.intercept_exceptions & excp_bits)
- vmexit = NESTED_EXIT_DONE;
- /* async page fault always cause vmexit */
- else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
- svm->apf_reason != 0)
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_ERR: {
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- default: {
- u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
- if (svm->nested.intercept & exit_bits)
- vmexit = NESTED_EXIT_DONE;
- }
- }
-
- return vmexit;
-}
-
-static int nested_svm_exit_handled(struct vcpu_svm *svm)
-{
- int vmexit;
-
- vmexit = nested_svm_intercept(svm);
-
- if (vmexit == NESTED_EXIT_DONE)
- nested_svm_vmexit(svm);
-
- return vmexit;
-}
-
-static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
-{
- struct vmcb_control_area *dst = &dst_vmcb->control;
- struct vmcb_control_area *from = &from_vmcb->control;
-
- dst->intercept_cr = from->intercept_cr;
- dst->intercept_dr = from->intercept_dr;
- dst->intercept_exceptions = from->intercept_exceptions;
- dst->intercept = from->intercept;
- dst->iopm_base_pa = from->iopm_base_pa;
- dst->msrpm_base_pa = from->msrpm_base_pa;
- dst->tsc_offset = from->tsc_offset;
- dst->asid = from->asid;
- dst->tlb_ctl = from->tlb_ctl;
- dst->int_ctl = from->int_ctl;
- dst->int_vector = from->int_vector;
- dst->int_state = from->int_state;
- dst->exit_code = from->exit_code;
- dst->exit_code_hi = from->exit_code_hi;
- dst->exit_info_1 = from->exit_info_1;
- dst->exit_info_2 = from->exit_info_2;
- dst->exit_int_info = from->exit_int_info;
- dst->exit_int_info_err = from->exit_int_info_err;
- dst->nested_ctl = from->nested_ctl;
- dst->event_inj = from->event_inj;
- dst->event_inj_err = from->event_inj_err;
- dst->nested_cr3 = from->nested_cr3;
- dst->lbr_ctl = from->lbr_ctl;
-}
-
-static int nested_svm_vmexit(struct vcpu_svm *svm)
-{
- struct vmcb *nested_vmcb;
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
- struct page *page;
-
- trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
- vmcb->control.exit_info_1,
- vmcb->control.exit_info_2,
- vmcb->control.exit_int_info,
- vmcb->control.exit_int_info_err,
- KVM_ISA_SVM);
-
- nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
- if (!nested_vmcb)
- return 1;
-
- /* Exit Guest-Mode */
- leave_guest_mode(&svm->vcpu);
- svm->nested.vmcb = 0;
-
- /* Give the current vmcb to the guest */
- disable_gif(svm);
-
- nested_vmcb->save.es = vmcb->save.es;
- nested_vmcb->save.cs = vmcb->save.cs;
- nested_vmcb->save.ss = vmcb->save.ss;
- nested_vmcb->save.ds = vmcb->save.ds;
- nested_vmcb->save.gdtr = vmcb->save.gdtr;
- nested_vmcb->save.idtr = vmcb->save.idtr;
- nested_vmcb->save.efer = svm->vcpu.arch.efer;
- nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
- nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
- nested_vmcb->save.cr2 = vmcb->save.cr2;
- nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
- nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
- nested_vmcb->save.rip = vmcb->save.rip;
- nested_vmcb->save.rsp = vmcb->save.rsp;
- nested_vmcb->save.rax = vmcb->save.rax;
- nested_vmcb->save.dr7 = vmcb->save.dr7;
- nested_vmcb->save.dr6 = vmcb->save.dr6;
- nested_vmcb->save.cpl = vmcb->save.cpl;
-
- nested_vmcb->control.int_ctl = vmcb->control.int_ctl;
- nested_vmcb->control.int_vector = vmcb->control.int_vector;
- nested_vmcb->control.int_state = vmcb->control.int_state;
- nested_vmcb->control.exit_code = vmcb->control.exit_code;
- nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi;
- nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1;
- nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
- nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
- nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
- nested_vmcb->control.next_rip = vmcb->control.next_rip;
-
- /*
- * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
- * to make sure that we do not lose injected events. So check event_inj
- * here and copy it to exit_int_info if it is valid.
- * Exit_int_info and event_inj can't be both valid because the case
- * below only happens on a VMRUN instruction intercept which has
- * no valid exit_int_info set.
- */
- if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
- struct vmcb_control_area *nc = &nested_vmcb->control;
-
- nc->exit_int_info = vmcb->control.event_inj;
- nc->exit_int_info_err = vmcb->control.event_inj_err;
- }
-
- nested_vmcb->control.tlb_ctl = 0;
- nested_vmcb->control.event_inj = 0;
- nested_vmcb->control.event_inj_err = 0;
-
- /* We always set V_INTR_MASKING and remember the old value in hflags */
- if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
- nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
-
- /* Restore the original control entries */
- copy_vmcb_control_area(vmcb, hsave);
-
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
-
- svm->nested.nested_cr3 = 0;
-
- /* Restore selected save entries */
- svm->vmcb->save.es = hsave->save.es;
- svm->vmcb->save.cs = hsave->save.cs;
- svm->vmcb->save.ss = hsave->save.ss;
- svm->vmcb->save.ds = hsave->save.ds;
- svm->vmcb->save.gdtr = hsave->save.gdtr;
- svm->vmcb->save.idtr = hsave->save.idtr;
- kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
- svm_set_efer(&svm->vcpu, hsave->save.efer);
- svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
- svm_set_cr4(&svm->vcpu, hsave->save.cr4);
- if (npt_enabled) {
- svm->vmcb->save.cr3 = hsave->save.cr3;
- svm->vcpu.arch.cr3 = hsave->save.cr3;
- } else {
- (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
- }
- kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
- svm->vmcb->save.dr7 = 0;
- svm->vmcb->save.cpl = 0;
- svm->vmcb->control.exit_int_info = 0;
-
- mark_all_dirty(svm->vmcb);
-
- nested_svm_unmap(page);
-
- nested_svm_uninit_mmu_context(&svm->vcpu);
- kvm_mmu_reset_context(&svm->vcpu);
- kvm_mmu_load(&svm->vcpu);
-
- return 0;
-}
-
-static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
-{
- /*
- * This function merges the msr permission bitmaps of kvm and the
- * nested vmcb. It is omptimized in that it only merges the parts where
- * the kvm msr permission bitmap may contain zero bits
- */
- int i;
-
- if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
- return true;
-
- for (i = 0; i < MSRPM_OFFSETS; i++) {
- u32 value, p;
- u64 offset;
-
- if (msrpm_offsets[i] == 0xffffffff)
- break;
-
- p = msrpm_offsets[i];
- offset = svm->nested.vmcb_msrpm + (p * 4);
-
- if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
- return false;
-
- svm->nested.msrpm[p] = svm->msrpm[p] | value;
- }
-
- svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
-
- return true;
-}
-
-static bool nested_vmcb_checks(struct vmcb *vmcb)
-{
- if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
- return false;
-
- if (vmcb->control.asid == 0)
- return false;
-
- if (vmcb->control.nested_ctl && !npt_enabled)
- return false;
-
- return true;
-}
-
-static bool nested_svm_vmrun(struct vcpu_svm *svm)
-{
- struct vmcb *nested_vmcb;
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
- struct page *page;
- u64 vmcb_gpa;
-
- vmcb_gpa = svm->vmcb->save.rax;
-
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
- if (!nested_vmcb)
- return false;
-
- if (!nested_vmcb_checks(nested_vmcb)) {
- nested_vmcb->control.exit_code = SVM_EXIT_ERR;
- nested_vmcb->control.exit_code_hi = 0;
- nested_vmcb->control.exit_info_1 = 0;
- nested_vmcb->control.exit_info_2 = 0;
-
- nested_svm_unmap(page);
-
- return false;
- }
-
- trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
- nested_vmcb->save.rip,
- nested_vmcb->control.int_ctl,
- nested_vmcb->control.event_inj,
- nested_vmcb->control.nested_ctl);
-
- trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
- nested_vmcb->control.intercept_cr >> 16,
- nested_vmcb->control.intercept_exceptions,
- nested_vmcb->control.intercept);
-
- /* Clear internal status */
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
-
- /*
- * Save the old vmcb, so we don't need to pick what we save, but can
- * restore everything when a VMEXIT occurs
- */
- hsave->save.es = vmcb->save.es;
- hsave->save.cs = vmcb->save.cs;
- hsave->save.ss = vmcb->save.ss;
- hsave->save.ds = vmcb->save.ds;
- hsave->save.gdtr = vmcb->save.gdtr;
- hsave->save.idtr = vmcb->save.idtr;
- hsave->save.efer = svm->vcpu.arch.efer;
- hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
- hsave->save.cr4 = svm->vcpu.arch.cr4;
- hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
- hsave->save.rip = kvm_rip_read(&svm->vcpu);
- hsave->save.rsp = vmcb->save.rsp;
- hsave->save.rax = vmcb->save.rax;
- if (npt_enabled)
- hsave->save.cr3 = vmcb->save.cr3;
- else
- hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
-
- copy_vmcb_control_area(hsave, vmcb);
-
- if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
- svm->vcpu.arch.hflags |= HF_HIF_MASK;
- else
- svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
-
- if (nested_vmcb->control.nested_ctl) {
- kvm_mmu_unload(&svm->vcpu);
- svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
- nested_svm_init_mmu_context(&svm->vcpu);
- }
-
- /* Load the nested guest state */
- svm->vmcb->save.es = nested_vmcb->save.es;
- svm->vmcb->save.cs = nested_vmcb->save.cs;
- svm->vmcb->save.ss = nested_vmcb->save.ss;
- svm->vmcb->save.ds = nested_vmcb->save.ds;
- svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
- svm->vmcb->save.idtr = nested_vmcb->save.idtr;
- kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
- svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
- svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
- svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
- if (npt_enabled) {
- svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
- svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
- } else
- (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
-
- /* Guest paging mode is active - reset mmu */
- kvm_mmu_reset_context(&svm->vcpu);
-
- svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
- kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
-
- /* In case we don't even reach vcpu_run, the fields are not updated */
- svm->vmcb->save.rax = nested_vmcb->save.rax;
- svm->vmcb->save.rsp = nested_vmcb->save.rsp;
- svm->vmcb->save.rip = nested_vmcb->save.rip;
- svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
- svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
- svm->vmcb->save.cpl = nested_vmcb->save.cpl;
-
- svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
- svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
-
- /* cache intercepts */
- svm->nested.intercept_cr = nested_vmcb->control.intercept_cr;
- svm->nested.intercept_dr = nested_vmcb->control.intercept_dr;
- svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
- svm->nested.intercept = nested_vmcb->control.intercept;
-
- svm_flush_tlb(&svm->vcpu);
- svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
- if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
- svm->vcpu.arch.hflags |= HF_VINTR_MASK;
- else
- svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
-
- if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
- /* We only want the cr8 intercept bits of the guest */
- clr_cr_intercept(svm, INTERCEPT_CR8_READ);
- clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
- }
-
- /* We don't want to see VMMCALLs from a nested guest */
- clr_intercept(svm, INTERCEPT_VMMCALL);
-
- svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
- svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
- svm->vmcb->control.int_state = nested_vmcb->control.int_state;
- svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
- svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
- svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
-
- nested_svm_unmap(page);
-
- /* Enter Guest-Mode */
- enter_guest_mode(&svm->vcpu);
-
- /*
- * Merge guest and host intercepts - must be called with vcpu in
- * guest-mode to take affect here
- */
- recalc_intercepts(svm);
-
- svm->nested.vmcb = vmcb_gpa;
-
- enable_gif(svm);
-
- mark_all_dirty(svm->vmcb);
-
- return true;
-}
-
-static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
-{
- to_vmcb->save.fs = from_vmcb->save.fs;
- to_vmcb->save.gs = from_vmcb->save.gs;
- to_vmcb->save.tr = from_vmcb->save.tr;
- to_vmcb->save.ldtr = from_vmcb->save.ldtr;
- to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
- to_vmcb->save.star = from_vmcb->save.star;
- to_vmcb->save.lstar = from_vmcb->save.lstar;
- to_vmcb->save.cstar = from_vmcb->save.cstar;
- to_vmcb->save.sfmask = from_vmcb->save.sfmask;
- to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
- to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
- to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
-}
-
-static int vmload_interception(struct vcpu_svm *svm)
-{
- struct vmcb *nested_vmcb;
- struct page *page;
-
- if (nested_svm_check_permissions(svm))
- return 1;
-
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
- if (!nested_vmcb)
- return 1;
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
-
- nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
- nested_svm_unmap(page);
-
- return 1;
-}
-
-static int vmsave_interception(struct vcpu_svm *svm)
-{
- struct vmcb *nested_vmcb;
- struct page *page;
-
- if (nested_svm_check_permissions(svm))
- return 1;
-
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
- if (!nested_vmcb)
- return 1;
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
-
- nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
- nested_svm_unmap(page);
-
- return 1;
-}
-
-static int vmrun_interception(struct vcpu_svm *svm)
-{
- if (nested_svm_check_permissions(svm))
- return 1;
-
- /* Save rip after vmrun instruction */
- kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
-
- if (!nested_svm_vmrun(svm))
- return 1;
-
- if (!nested_svm_vmrun_msrpm(svm))
- goto failed;
-
- return 1;
-
-failed:
-
- svm->vmcb->control.exit_code = SVM_EXIT_ERR;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
-
- return 1;
-}
-
-static int stgi_interception(struct vcpu_svm *svm)
-{
- if (nested_svm_check_permissions(svm))
- return 1;
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
-
- enable_gif(svm);
-
- return 1;
-}
-
-static int clgi_interception(struct vcpu_svm *svm)
-{
- if (nested_svm_check_permissions(svm))
- return 1;
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
-
- disable_gif(svm);
-
- /* After a CLGI no interrupts should come */
- svm_clear_vintr(svm);
- svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
-
- mark_dirty(svm->vmcb, VMCB_INTR);
-
- return 1;
-}
-
-static int invlpga_interception(struct vcpu_svm *svm)
-{
- struct kvm_vcpu *vcpu = &svm->vcpu;
-
- trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
- vcpu->arch.regs[VCPU_REGS_RAX]);
-
- /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
- kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
- return 1;
-}
-
-static int skinit_interception(struct vcpu_svm *svm)
-{
- trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
-
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
-}
-
-static int xsetbv_interception(struct vcpu_svm *svm)
-{
- u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
- u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
-
- if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
- }
-
- return 1;
-}
-
-static int invalid_op_interception(struct vcpu_svm *svm)
-{
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
-}
-
-static int task_switch_interception(struct vcpu_svm *svm)
-{
- u16 tss_selector;
- int reason;
- int int_type = svm->vmcb->control.exit_int_info &
- SVM_EXITINTINFO_TYPE_MASK;
- int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
- uint32_t type =
- svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
- uint32_t idt_v =
- svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
- bool has_error_code = false;
- u32 error_code = 0;
-
- tss_selector = (u16)svm->vmcb->control.exit_info_1;
-
- if (svm->vmcb->control.exit_info_2 &
- (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
- reason = TASK_SWITCH_IRET;
- else if (svm->vmcb->control.exit_info_2 &
- (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
- reason = TASK_SWITCH_JMP;
- else if (idt_v)
- reason = TASK_SWITCH_GATE;
- else
- reason = TASK_SWITCH_CALL;
-
- if (reason == TASK_SWITCH_GATE) {
- switch (type) {
- case SVM_EXITINTINFO_TYPE_NMI:
- svm->vcpu.arch.nmi_injected = false;
- break;
- case SVM_EXITINTINFO_TYPE_EXEPT:
- if (svm->vmcb->control.exit_info_2 &
- (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
- has_error_code = true;
- error_code =
- (u32)svm->vmcb->control.exit_info_2;
- }
- kvm_clear_exception_queue(&svm->vcpu);
- break;
- case SVM_EXITINTINFO_TYPE_INTR:
- kvm_clear_interrupt_queue(&svm->vcpu);
- break;
- default:
- break;
- }
- }
-
- if (reason != TASK_SWITCH_GATE ||
- int_type == SVM_EXITINTINFO_TYPE_SOFT ||
- (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
- (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
- skip_emulated_instruction(&svm->vcpu);
-
- if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
- int_vec = -1;
-
- if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
- has_error_code, error_code) == EMULATE_FAIL) {
- svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- svm->vcpu.run->internal.ndata = 0;
- return 0;
- }
- return 1;
-}
-
-static int cpuid_interception(struct vcpu_svm *svm)
-{
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- kvm_emulate_cpuid(&svm->vcpu);
- return 1;
-}
-
-static int iret_interception(struct vcpu_svm *svm)
-{
- ++svm->vcpu.stat.nmi_window_exits;
- clr_intercept(svm, INTERCEPT_IRET);
- svm->vcpu.arch.hflags |= HF_IRET_MASK;
- svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
- return 1;
-}
-
-static int invlpg_interception(struct vcpu_svm *svm)
-{
- if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
- return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
-
- kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
- skip_emulated_instruction(&svm->vcpu);
- return 1;
-}
-
-static int emulate_on_interception(struct vcpu_svm *svm)
-{
- return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
-}
-
-static int rdpmc_interception(struct vcpu_svm *svm)
-{
- int err;
-
- if (!static_cpu_has(X86_FEATURE_NRIPS))
- return emulate_on_interception(svm);
-
- err = kvm_rdpmc(&svm->vcpu);
- kvm_complete_insn_gp(&svm->vcpu, err);
-
- return 1;
-}
-
-bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val)
-{
- unsigned long cr0 = svm->vcpu.arch.cr0;
- bool ret = false;
- u64 intercept;
-
- intercept = svm->nested.intercept;
-
- if (!is_guest_mode(&svm->vcpu) ||
- (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
- return false;
-
- cr0 &= ~SVM_CR0_SELECTIVE_MASK;
- val &= ~SVM_CR0_SELECTIVE_MASK;
-
- if (cr0 ^ val) {
- svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
- ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
- }
-
- return ret;
-}
-
-#define CR_VALID (1ULL << 63)
-
-static int cr_interception(struct vcpu_svm *svm)
-{
- int reg, cr;
- unsigned long val;
- int err;
-
- if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
- return emulate_on_interception(svm);
-
- if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
- return emulate_on_interception(svm);
-
- reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
- cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
-
- err = 0;
- if (cr >= 16) { /* mov to cr */
- cr -= 16;
- val = kvm_register_read(&svm->vcpu, reg);
- switch (cr) {
- case 0:
- if (!check_selective_cr0_intercepted(svm, val))
- err = kvm_set_cr0(&svm->vcpu, val);
- else
- return 1;
-
- break;
- case 3:
- err = kvm_set_cr3(&svm->vcpu, val);
- break;
- case 4:
- err = kvm_set_cr4(&svm->vcpu, val);
- break;
- case 8:
- err = kvm_set_cr8(&svm->vcpu, val);
- break;
- default:
- WARN(1, "unhandled write to CR%d", cr);
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
- }
- } else { /* mov from cr */
- switch (cr) {
- case 0:
- val = kvm_read_cr0(&svm->vcpu);
- break;
- case 2:
- val = svm->vcpu.arch.cr2;
- break;
- case 3:
- val = kvm_read_cr3(&svm->vcpu);
- break;
- case 4:
- val = kvm_read_cr4(&svm->vcpu);
- break;
- case 8:
- val = kvm_get_cr8(&svm->vcpu);
- break;
- default:
- WARN(1, "unhandled read from CR%d", cr);
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
- }
- kvm_register_write(&svm->vcpu, reg, val);
- }
- kvm_complete_insn_gp(&svm->vcpu, err);
-
- return 1;
-}
-
-static int dr_interception(struct vcpu_svm *svm)
-{
- int reg, dr;
- unsigned long val;
- int err;
-
- if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
- return emulate_on_interception(svm);
-
- reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
- dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
-
- if (dr >= 16) { /* mov to DRn */
- val = kvm_register_read(&svm->vcpu, reg);
- kvm_set_dr(&svm->vcpu, dr - 16, val);
- } else {
- err = kvm_get_dr(&svm->vcpu, dr, &val);
- if (!err)
- kvm_register_write(&svm->vcpu, reg, val);
- }
-
- skip_emulated_instruction(&svm->vcpu);
-
- return 1;
-}
-
-static int cr8_write_interception(struct vcpu_svm *svm)
-{
- struct kvm_run *kvm_run = svm->vcpu.run;
- int r;
-
- u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
- /* instruction emulation calls kvm_set_cr8() */
- r = cr_interception(svm);
- if (irqchip_in_kernel(svm->vcpu.kvm)) {
- clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
- return r;
- }
- if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
- return r;
- kvm_run->exit_reason = KVM_EXIT_SET_TPR;
- return 0;
-}
-
-u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu)
-{
- struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
- return vmcb->control.tsc_offset +
- svm_scale_tsc(vcpu, native_read_tsc());
-}
-
-static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- switch (ecx) {
- case MSR_IA32_TSC: {
- *data = svm->vmcb->control.tsc_offset +
- svm_scale_tsc(vcpu, native_read_tsc());
-
- break;
- }
- case MSR_STAR:
- *data = svm->vmcb->save.star;
- break;
-#ifdef CONFIG_X86_64
- case MSR_LSTAR:
- *data = svm->vmcb->save.lstar;
- break;
- case MSR_CSTAR:
- *data = svm->vmcb->save.cstar;
- break;
- case MSR_KERNEL_GS_BASE:
- *data = svm->vmcb->save.kernel_gs_base;
- break;
- case MSR_SYSCALL_MASK:
- *data = svm->vmcb->save.sfmask;
- break;
-#endif
- case MSR_IA32_SYSENTER_CS:
- *data = svm->vmcb->save.sysenter_cs;
- break;
- case MSR_IA32_SYSENTER_EIP:
- *data = svm->sysenter_eip;
- break;
- case MSR_IA32_SYSENTER_ESP:
- *data = svm->sysenter_esp;
- break;
- /*
- * Nobody will change the following 5 values in the VMCB so we can
- * safely return them on rdmsr. They will always be 0 until LBRV is
- * implemented.
- */
- case MSR_IA32_DEBUGCTLMSR:
- *data = svm->vmcb->save.dbgctl;
- break;
- case MSR_IA32_LASTBRANCHFROMIP:
- *data = svm->vmcb->save.br_from;
- break;
- case MSR_IA32_LASTBRANCHTOIP:
- *data = svm->vmcb->save.br_to;
- break;
- case MSR_IA32_LASTINTFROMIP:
- *data = svm->vmcb->save.last_excp_from;
- break;
- case MSR_IA32_LASTINTTOIP:
- *data = svm->vmcb->save.last_excp_to;
- break;
- case MSR_VM_HSAVE_PA:
- *data = svm->nested.hsave_msr;
- break;
- case MSR_VM_CR:
- *data = svm->nested.vm_cr_msr;
- break;
- case MSR_IA32_UCODE_REV:
- *data = 0x01000065;
- break;
- default:
- return kvm_get_msr_common(vcpu, ecx, data);
- }
- return 0;
-}
-
-static int rdmsr_interception(struct vcpu_svm *svm)
-{
- u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- u64 data;
-
- if (svm_get_msr(&svm->vcpu, ecx, &data)) {
- trace_kvm_msr_read_ex(ecx);
- kvm_inject_gp(&svm->vcpu, 0);
- } else {
- trace_kvm_msr_read(ecx, data);
-
- svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
- svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- skip_emulated_instruction(&svm->vcpu);
- }
- return 1;
-}
-
-static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- int svm_dis, chg_mask;
-
- if (data & ~SVM_VM_CR_VALID_MASK)
- return 1;
-
- chg_mask = SVM_VM_CR_VALID_MASK;
-
- if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
- chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
-
- svm->nested.vm_cr_msr &= ~chg_mask;
- svm->nested.vm_cr_msr |= (data & chg_mask);
-
- svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
-
- /* check for svm_disable while efer.svme is set */
- if (svm_dis && (vcpu->arch.efer & EFER_SVME))
- return 1;
-
- return 0;
-}
-
-static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- switch (ecx) {
- case MSR_IA32_TSC:
- kvm_write_tsc(vcpu, data);
- break;
- case MSR_STAR:
- svm->vmcb->save.star = data;
- break;
-#ifdef CONFIG_X86_64
- case MSR_LSTAR:
- svm->vmcb->save.lstar = data;
- break;
- case MSR_CSTAR:
- svm->vmcb->save.cstar = data;
- break;
- case MSR_KERNEL_GS_BASE:
- svm->vmcb->save.kernel_gs_base = data;
- break;
- case MSR_SYSCALL_MASK:
- svm->vmcb->save.sfmask = data;
- break;
-#endif
- case MSR_IA32_SYSENTER_CS:
- svm->vmcb->save.sysenter_cs = data;
- break;
- case MSR_IA32_SYSENTER_EIP:
- svm->sysenter_eip = data;
- svm->vmcb->save.sysenter_eip = data;
- break;
- case MSR_IA32_SYSENTER_ESP:
- svm->sysenter_esp = data;
- svm->vmcb->save.sysenter_esp = data;
- break;
- case MSR_IA32_DEBUGCTLMSR:
- if (!boot_cpu_has(X86_FEATURE_LBRV)) {
- pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
- __func__, data);
- break;
- }
- if (data & DEBUGCTL_RESERVED_BITS)
- return 1;
-
- svm->vmcb->save.dbgctl = data;
- mark_dirty(svm->vmcb, VMCB_LBR);
- if (data & (1ULL<<0))
- svm_enable_lbrv(svm);
- else
- svm_disable_lbrv(svm);
- break;
- case MSR_VM_HSAVE_PA:
- svm->nested.hsave_msr = data;
- break;
- case MSR_VM_CR:
- return svm_set_vm_cr(vcpu, data);
- case MSR_VM_IGNNE:
- pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
- break;
- default:
- return kvm_set_msr_common(vcpu, ecx, data);
- }
- return 0;
-}
-
-static int wrmsr_interception(struct vcpu_svm *svm)
-{
- u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
- | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
-
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- if (svm_set_msr(&svm->vcpu, ecx, data)) {
- trace_kvm_msr_write_ex(ecx, data);
- kvm_inject_gp(&svm->vcpu, 0);
- } else {
- trace_kvm_msr_write(ecx, data);
- skip_emulated_instruction(&svm->vcpu);
- }
- return 1;
-}
-
-static int msr_interception(struct vcpu_svm *svm)
-{
- if (svm->vmcb->control.exit_info_1)
- return wrmsr_interception(svm);
- else
- return rdmsr_interception(svm);
-}
-
-static int interrupt_window_interception(struct vcpu_svm *svm)
-{
- struct kvm_run *kvm_run = svm->vcpu.run;
-
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
- svm_clear_vintr(svm);
- svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
- mark_dirty(svm->vmcb, VMCB_INTR);
- /*
- * If the user space waits to inject interrupts, exit as soon as
- * possible
- */
- if (!irqchip_in_kernel(svm->vcpu.kvm) &&
- kvm_run->request_interrupt_window &&
- !kvm_cpu_has_interrupt(&svm->vcpu)) {
- ++svm->vcpu.stat.irq_window_exits;
- kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
- return 0;
- }
-
- return 1;
-}
-
-static int pause_interception(struct vcpu_svm *svm)
-{
- kvm_vcpu_on_spin(&(svm->vcpu));
- return 1;
-}
-
-static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
- [SVM_EXIT_READ_CR0] = cr_interception,
- [SVM_EXIT_READ_CR3] = cr_interception,
- [SVM_EXIT_READ_CR4] = cr_interception,
- [SVM_EXIT_READ_CR8] = cr_interception,
- [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR0] = cr_interception,
- [SVM_EXIT_WRITE_CR3] = cr_interception,
- [SVM_EXIT_WRITE_CR4] = cr_interception,
- [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
- [SVM_EXIT_READ_DR0] = dr_interception,
- [SVM_EXIT_READ_DR1] = dr_interception,
- [SVM_EXIT_READ_DR2] = dr_interception,
- [SVM_EXIT_READ_DR3] = dr_interception,
- [SVM_EXIT_READ_DR4] = dr_interception,
- [SVM_EXIT_READ_DR5] = dr_interception,
- [SVM_EXIT_READ_DR6] = dr_interception,
- [SVM_EXIT_READ_DR7] = dr_interception,
- [SVM_EXIT_WRITE_DR0] = dr_interception,
- [SVM_EXIT_WRITE_DR1] = dr_interception,
- [SVM_EXIT_WRITE_DR2] = dr_interception,
- [SVM_EXIT_WRITE_DR3] = dr_interception,
- [SVM_EXIT_WRITE_DR4] = dr_interception,
- [SVM_EXIT_WRITE_DR5] = dr_interception,
- [SVM_EXIT_WRITE_DR6] = dr_interception,
- [SVM_EXIT_WRITE_DR7] = dr_interception,
- [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
- [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
- [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
- [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
- [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
- [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
- [SVM_EXIT_INTR] = intr_interception,
- [SVM_EXIT_NMI] = nmi_interception,
- [SVM_EXIT_SMI] = nop_on_interception,
- [SVM_EXIT_INIT] = nop_on_interception,
- [SVM_EXIT_VINTR] = interrupt_window_interception,
- [SVM_EXIT_RDPMC] = rdpmc_interception,
- [SVM_EXIT_CPUID] = cpuid_interception,
- [SVM_EXIT_IRET] = iret_interception,
- [SVM_EXIT_INVD] = emulate_on_interception,
- [SVM_EXIT_PAUSE] = pause_interception,
- [SVM_EXIT_HLT] = halt_interception,
- [SVM_EXIT_INVLPG] = invlpg_interception,
- [SVM_EXIT_INVLPGA] = invlpga_interception,
- [SVM_EXIT_IOIO] = io_interception,
- [SVM_EXIT_MSR] = msr_interception,
- [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
- [SVM_EXIT_SHUTDOWN] = shutdown_interception,
- [SVM_EXIT_VMRUN] = vmrun_interception,
- [SVM_EXIT_VMMCALL] = vmmcall_interception,
- [SVM_EXIT_VMLOAD] = vmload_interception,
- [SVM_EXIT_VMSAVE] = vmsave_interception,
- [SVM_EXIT_STGI] = stgi_interception,
- [SVM_EXIT_CLGI] = clgi_interception,
- [SVM_EXIT_SKINIT] = skinit_interception,
- [SVM_EXIT_WBINVD] = emulate_on_interception,
- [SVM_EXIT_MONITOR] = invalid_op_interception,
- [SVM_EXIT_MWAIT] = invalid_op_interception,
- [SVM_EXIT_XSETBV] = xsetbv_interception,
- [SVM_EXIT_NPF] = pf_interception,
-};
-
-static void dump_vmcb(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb_control_area *control = &svm->vmcb->control;
- struct vmcb_save_area *save = &svm->vmcb->save;
-
- pr_err("VMCB Control Area:\n");
- pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
- pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
- pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
- pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
- pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
- pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
- pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
- pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
- pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
- pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
- pr_err("%-20s%d\n", "asid:", control->asid);
- pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
- pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
- pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
- pr_err("%-20s%08x\n", "int_state:", control->int_state);
- pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
- pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
- pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
- pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
- pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
- pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
- pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
- pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
- pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
- pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
- pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
- pr_err("VMCB State Save Area:\n");
- pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
- "es:",
- save->es.selector, save->es.attrib,
- save->es.limit, save->es.base);
- pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
- "cs:",
- save->cs.selector, save->cs.attrib,
- save->cs.limit, save->cs.base);
- pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
- "ss:",
- save->ss.selector, save->ss.attrib,
- save->ss.limit, save->ss.base);
- pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
- "ds:",
- save->ds.selector, save->ds.attrib,
- save->ds.limit, save->ds.base);
- pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
- "fs:",
- save->fs.selector, save->fs.attrib,
- save->fs.limit, save->fs.base);
- pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
- "gs:",
- save->gs.selector, save->gs.attrib,
- save->gs.limit, save->gs.base);
- pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
- "gdtr:",
- save->gdtr.selector, save->gdtr.attrib,
- save->gdtr.limit, save->gdtr.base);
- pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
- "ldtr:",
- save->ldtr.selector, save->ldtr.attrib,
- save->ldtr.limit, save->ldtr.base);
- pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
- "idtr:",
- save->idtr.selector, save->idtr.attrib,
- save->idtr.limit, save->idtr.base);
- pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
- "tr:",
- save->tr.selector, save->tr.attrib,
- save->tr.limit, save->tr.base);
- pr_err("cpl: %d efer: %016llx\n",
- save->cpl, save->efer);
- pr_err("%-15s %016llx %-13s %016llx\n",
- "cr0:", save->cr0, "cr2:", save->cr2);
- pr_err("%-15s %016llx %-13s %016llx\n",
- "cr3:", save->cr3, "cr4:", save->cr4);
- pr_err("%-15s %016llx %-13s %016llx\n",
- "dr6:", save->dr6, "dr7:", save->dr7);
- pr_err("%-15s %016llx %-13s %016llx\n",
- "rip:", save->rip, "rflags:", save->rflags);
- pr_err("%-15s %016llx %-13s %016llx\n",
- "rsp:", save->rsp, "rax:", save->rax);
- pr_err("%-15s %016llx %-13s %016llx\n",
- "star:", save->star, "lstar:", save->lstar);
- pr_err("%-15s %016llx %-13s %016llx\n",
- "cstar:", save->cstar, "sfmask:", save->sfmask);
- pr_err("%-15s %016llx %-13s %016llx\n",
- "kernel_gs_base:", save->kernel_gs_base,
- "sysenter_cs:", save->sysenter_cs);
- pr_err("%-15s %016llx %-13s %016llx\n",
- "sysenter_esp:", save->sysenter_esp,
- "sysenter_eip:", save->sysenter_eip);
- pr_err("%-15s %016llx %-13s %016llx\n",
- "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
- pr_err("%-15s %016llx %-13s %016llx\n",
- "br_from:", save->br_from, "br_to:", save->br_to);
- pr_err("%-15s %016llx %-13s %016llx\n",
- "excp_from:", save->last_excp_from,
- "excp_to:", save->last_excp_to);
-}
-
-static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
-{
- struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
-
- *info1 = control->exit_info_1;
- *info2 = control->exit_info_2;
-}
-
-static int handle_exit(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct kvm_run *kvm_run = vcpu->run;
- u32 exit_code = svm->vmcb->control.exit_code;
-
- if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
- vcpu->arch.cr0 = svm->vmcb->save.cr0;
- if (npt_enabled)
- vcpu->arch.cr3 = svm->vmcb->save.cr3;
-
- if (unlikely(svm->nested.exit_required)) {
- nested_svm_vmexit(svm);
- svm->nested.exit_required = false;
-
- return 1;
- }
-
- if (is_guest_mode(vcpu)) {
- int vmexit;
-
- trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
- svm->vmcb->control.exit_info_1,
- svm->vmcb->control.exit_info_2,
- svm->vmcb->control.exit_int_info,
- svm->vmcb->control.exit_int_info_err,
- KVM_ISA_SVM);
-
- vmexit = nested_svm_exit_special(svm);
-
- if (vmexit == NESTED_EXIT_CONTINUE)
- vmexit = nested_svm_exit_handled(svm);
-
- if (vmexit == NESTED_EXIT_DONE)
- return 1;
- }
-
- svm_complete_interrupts(svm);
-
- if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
- kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
- kvm_run->fail_entry.hardware_entry_failure_reason
- = svm->vmcb->control.exit_code;
- pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
- dump_vmcb(vcpu);
- return 0;
- }
-
- if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
- exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
- exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
- exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
- printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
- "exit_code 0x%x\n",
- __func__, svm->vmcb->control.exit_int_info,
- exit_code);
-
- if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
- || !svm_exit_handlers[exit_code]) {
- kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- kvm_run->hw.hardware_exit_reason = exit_code;
- return 0;
- }
-
- return svm_exit_handlers[exit_code](svm);
-}
-
-static void reload_tss(struct kvm_vcpu *vcpu)
-{
- int cpu = raw_smp_processor_id();
-
- struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
- sd->tss_desc->type = 9; /* available 32/64-bit TSS */
- load_TR_desc();
-}
-
-static void pre_svm_run(struct vcpu_svm *svm)
-{
- int cpu = raw_smp_processor_id();
-
- struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
-
- /* FIXME: handle wraparound of asid_generation */
- if (svm->asid_generation != sd->asid_generation)
- new_asid(svm, sd);
-}
-
-static void svm_inject_nmi(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
- vcpu->arch.hflags |= HF_NMI_MASK;
- set_intercept(svm, INTERCEPT_IRET);
- ++vcpu->stat.nmi_injections;
-}
-
-static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
-{
- struct vmcb_control_area *control;
-
- control = &svm->vmcb->control;
- control->int_vector = irq;
- control->int_ctl &= ~V_INTR_PRIO_MASK;
- control->int_ctl |= V_IRQ_MASK |
- ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
- mark_dirty(svm->vmcb, VMCB_INTR);
-}
-
-static void svm_set_irq(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- BUG_ON(!(gif_set(svm)));
-
- trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
- ++vcpu->stat.irq_injections;
-
- svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
- SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
-}
-
-static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
- return;
-
- if (irr == -1)
- return;
-
- if (tpr >= irr)
- set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
-}
-
-static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *vmcb = svm->vmcb;
- int ret;
- ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
- !(svm->vcpu.arch.hflags & HF_NMI_MASK);
- ret = ret && gif_set(svm) && nested_svm_nmi(svm);
-
- return ret;
-}
-
-static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
-}
-
-static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (masked) {
- svm->vcpu.arch.hflags |= HF_NMI_MASK;
- set_intercept(svm, INTERCEPT_IRET);
- } else {
- svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
- clr_intercept(svm, INTERCEPT_IRET);
- }
-}
-
-static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *vmcb = svm->vmcb;
- int ret;
-
- if (!gif_set(svm) ||
- (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
- return 0;
-
- ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
-
- if (is_guest_mode(vcpu))
- return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
-
- return ret;
-}
-
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- /*
- * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
- * 1, because that's a separate STGI/VMRUN intercept. The next time we
- * get that intercept, this function will be called again though and
- * we'll get the vintr intercept.
- */
- if (gif_set(svm) && nested_svm_intr(svm)) {
- svm_set_vintr(svm);
- svm_inject_irq(svm, 0x0);
- }
-}
-
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
- == HF_NMI_MASK)
- return; /* IRET will cause a vm exit */
-
- /*
- * Something prevents NMI from been injected. Single step over possible
- * problem (IRET or exception injection or interrupt shadow)
- */
- svm->nmi_singlestep = true;
- svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
- update_db_intercept(vcpu);
-}
-
-static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
-{
- return 0;
-}
-
-static void svm_flush_tlb(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
- svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
- else
- svm->asid_generation--;
-}
-
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
-{
-}
-
-static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
- return;
-
- if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
- int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
- kvm_set_cr8(vcpu, cr8);
- }
-}
-
-static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u64 cr8;
-
- if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
- return;
-
- cr8 = kvm_get_cr8(vcpu);
- svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
- svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
-}
-
-static void svm_complete_interrupts(struct vcpu_svm *svm)
-{
- u8 vector;
- int type;
- u32 exitintinfo = svm->vmcb->control.exit_int_info;
- unsigned int3_injected = svm->int3_injected;
-
- svm->int3_injected = 0;
-
- /*
- * If we've made progress since setting HF_IRET_MASK, we've
- * executed an IRET and can allow NMI injection.
- */
- if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
- && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
- svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
- }
-
- svm->vcpu.arch.nmi_injected = false;
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
-
- if (!(exitintinfo & SVM_EXITINTINFO_VALID))
- return;
-
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
-
- vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
- type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
-
- switch (type) {
- case SVM_EXITINTINFO_TYPE_NMI:
- svm->vcpu.arch.nmi_injected = true;
- break;
- case SVM_EXITINTINFO_TYPE_EXEPT:
- /*
- * In case of software exceptions, do not reinject the vector,
- * but re-execute the instruction instead. Rewind RIP first
- * if we emulated INT3 before.
- */
- if (kvm_exception_is_soft(vector)) {
- if (vector == BP_VECTOR && int3_injected &&
- kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
- kvm_rip_write(&svm->vcpu,
- kvm_rip_read(&svm->vcpu) -
- int3_injected);
- break;
- }
- if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
- u32 err = svm->vmcb->control.exit_int_info_err;
- kvm_requeue_exception_e(&svm->vcpu, vector, err);
-
- } else
- kvm_requeue_exception(&svm->vcpu, vector);
- break;
- case SVM_EXITINTINFO_TYPE_INTR:
- kvm_queue_interrupt(&svm->vcpu, vector, false);
- break;
- default:
- break;
- }
-}
-
-static void svm_cancel_injection(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb_control_area *control = &svm->vmcb->control;
-
- control->exit_int_info = control->event_inj;
- control->exit_int_info_err = control->event_inj_err;
- control->event_inj = 0;
- svm_complete_interrupts(svm);
-}
-
-#ifdef CONFIG_X86_64
-#define R "r"
-#else
-#define R "e"
-#endif
-
-static void svm_vcpu_run(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
- svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
- svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
-
- /*
- * A vmexit emulation is required before the vcpu can be executed
- * again.
- */
- if (unlikely(svm->nested.exit_required))
- return;
-
- pre_svm_run(svm);
-
- sync_lapic_to_cr8(vcpu);
-
- svm->vmcb->save.cr2 = vcpu->arch.cr2;
-
- clgi();
-
- local_irq_enable();
-
- asm volatile (
- "push %%"R"bp; \n\t"
- "mov %c[rbx](%[svm]), %%"R"bx \n\t"
- "mov %c[rcx](%[svm]), %%"R"cx \n\t"
- "mov %c[rdx](%[svm]), %%"R"dx \n\t"
- "mov %c[rsi](%[svm]), %%"R"si \n\t"
- "mov %c[rdi](%[svm]), %%"R"di \n\t"
- "mov %c[rbp](%[svm]), %%"R"bp \n\t"
-#ifdef CONFIG_X86_64
- "mov %c[r8](%[svm]), %%r8 \n\t"
- "mov %c[r9](%[svm]), %%r9 \n\t"
- "mov %c[r10](%[svm]), %%r10 \n\t"
- "mov %c[r11](%[svm]), %%r11 \n\t"
- "mov %c[r12](%[svm]), %%r12 \n\t"
- "mov %c[r13](%[svm]), %%r13 \n\t"
- "mov %c[r14](%[svm]), %%r14 \n\t"
- "mov %c[r15](%[svm]), %%r15 \n\t"
-#endif
-
- /* Enter guest mode */
- "push %%"R"ax \n\t"
- "mov %c[vmcb](%[svm]), %%"R"ax \n\t"
- __ex(SVM_VMLOAD) "\n\t"
- __ex(SVM_VMRUN) "\n\t"
- __ex(SVM_VMSAVE) "\n\t"
- "pop %%"R"ax \n\t"
-
- /* Save guest registers, load host registers */
- "mov %%"R"bx, %c[rbx](%[svm]) \n\t"
- "mov %%"R"cx, %c[rcx](%[svm]) \n\t"
- "mov %%"R"dx, %c[rdx](%[svm]) \n\t"
- "mov %%"R"si, %c[rsi](%[svm]) \n\t"
- "mov %%"R"di, %c[rdi](%[svm]) \n\t"
- "mov %%"R"bp, %c[rbp](%[svm]) \n\t"
-#ifdef CONFIG_X86_64
- "mov %%r8, %c[r8](%[svm]) \n\t"
- "mov %%r9, %c[r9](%[svm]) \n\t"
- "mov %%r10, %c[r10](%[svm]) \n\t"
- "mov %%r11, %c[r11](%[svm]) \n\t"
- "mov %%r12, %c[r12](%[svm]) \n\t"
- "mov %%r13, %c[r13](%[svm]) \n\t"
- "mov %%r14, %c[r14](%[svm]) \n\t"
- "mov %%r15, %c[r15](%[svm]) \n\t"
-#endif
- "pop %%"R"bp"
- :
- : [svm]"a"(svm),
- [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
- [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
- [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
- [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
- [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
- [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
- [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
-#ifdef CONFIG_X86_64
- , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
- [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
- [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
- [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
- [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
- [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
- [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
- [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
-#endif
- : "cc", "memory"
- , R"bx", R"cx", R"dx", R"si", R"di"
-#ifdef CONFIG_X86_64
- , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
-#endif
- );
-
-#ifdef CONFIG_X86_64
- wrmsrl(MSR_GS_BASE, svm->host.gs_base);
-#else
- loadsegment(fs, svm->host.fs);
-#ifndef CONFIG_X86_32_LAZY_GS
- loadsegment(gs, svm->host.gs);
-#endif
-#endif
-
- reload_tss(vcpu);
-
- local_irq_disable();
-
- vcpu->arch.cr2 = svm->vmcb->save.cr2;
- vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
- vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
- vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
-
- trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM);
-
- if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
- kvm_before_handle_nmi(&svm->vcpu);
-
- stgi();
-
- /* Any pending NMI will happen here */
-
- if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
- kvm_after_handle_nmi(&svm->vcpu);
-
- sync_cr8_to_lapic(vcpu);
-
- svm->next_rip = 0;
-
- svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
-
- /* if exit due to PF check for async PF */
- if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
- svm->apf_reason = kvm_read_and_reset_pf_reason();
-
- if (npt_enabled) {
- vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
- vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
- }
-
- /*
- * We need to handle MC intercepts here before the vcpu has a chance to
- * change the physical cpu
- */
- if (unlikely(svm->vmcb->control.exit_code ==
- SVM_EXIT_EXCP_BASE + MC_VECTOR))
- svm_handle_mce(svm);
-
- mark_all_clean(svm->vmcb);
-}
-
-#undef R
-
-static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->save.cr3 = root;
- mark_dirty(svm->vmcb, VMCB_CR);
- svm_flush_tlb(vcpu);
-}
-
-static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.nested_cr3 = root;
- mark_dirty(svm->vmcb, VMCB_NPT);
-
- /* Also sync guest cr3 here in case we live migrate */
- svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
- mark_dirty(svm->vmcb, VMCB_CR);
-
- svm_flush_tlb(vcpu);
-}
-
-static int is_disabled(void)
-{
- u64 vm_cr;
-
- rdmsrl(MSR_VM_CR, vm_cr);
- if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
- return 1;
-
- return 0;
-}
-
-static void
-svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
-{
- /*
- * Patch in the VMMCALL instruction:
- */
- hypercall[0] = 0x0f;
- hypercall[1] = 0x01;
- hypercall[2] = 0xd9;
-}
-
-static void svm_check_processor_compat(void *rtn)
-{
- *(int *)rtn = 0;
-}
-
-static bool svm_cpu_has_accelerated_tpr(void)
-{
- return false;
-}
-
-static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
-{
- return 0;
-}
-
-static void svm_cpuid_update(struct kvm_vcpu *vcpu)
-{
-}
-
-static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
-{
- switch (func) {
- case 0x80000001:
- if (nested)
- entry->ecx |= (1 << 2); /* Set SVM bit */
- break;
- case 0x8000000A:
- entry->eax = 1; /* SVM revision 1 */
- entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
- ASID emulation to nested SVM */
- entry->ecx = 0; /* Reserved */
- entry->edx = 0; /* Per default do not support any
- additional features */
-
- /* Support next_rip if host supports it */
- if (boot_cpu_has(X86_FEATURE_NRIPS))
- entry->edx |= SVM_FEATURE_NRIP;
-
- /* Support NPT for the guest if enabled */
- if (npt_enabled)
- entry->edx |= SVM_FEATURE_NPT;
-
- break;
- }
-}
-
-static int svm_get_lpage_level(void)
-{
- return PT_PDPE_LEVEL;
-}
-
-static bool svm_rdtscp_supported(void)
-{
- return false;
-}
-
-static bool svm_has_wbinvd_exit(void)
-{
- return true;
-}
-
-static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- set_exception_intercept(svm, NM_VECTOR);
- update_cr0_intercept(svm);
-}
-
-#define PRE_EX(exit) { .exit_code = (exit), \
- .stage = X86_ICPT_PRE_EXCEPT, }
-#define POST_EX(exit) { .exit_code = (exit), \
- .stage = X86_ICPT_POST_EXCEPT, }
-#define POST_MEM(exit) { .exit_code = (exit), \
- .stage = X86_ICPT_POST_MEMACCESS, }
-
-static struct __x86_intercept {
- u32 exit_code;
- enum x86_intercept_stage stage;
-} x86_intercept_map[] = {
- [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
- [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
- [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
- [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
- [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
- [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
- [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
- [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
- [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
- [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
- [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
- [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
- [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
- [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
- [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
- [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
- [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
- [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
- [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
- [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
- [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
- [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
- [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
- [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
- [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
- [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
- [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
- [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
- [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
- [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
- [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
- [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
- [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
- [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
- [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
- [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
- [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
- [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
- [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
- [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
- [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
- [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
- [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
- [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
- [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
- [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
-};
-
-#undef PRE_EX
-#undef POST_EX
-#undef POST_MEM
-
-static int svm_check_intercept(struct kvm_vcpu *vcpu,
- struct x86_instruction_info *info,
- enum x86_intercept_stage stage)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- int vmexit, ret = X86EMUL_CONTINUE;
- struct __x86_intercept icpt_info;
- struct vmcb *vmcb = svm->vmcb;
-
- if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
- goto out;
-
- icpt_info = x86_intercept_map[info->intercept];
-
- if (stage != icpt_info.stage)
- goto out;
-
- switch (icpt_info.exit_code) {
- case SVM_EXIT_READ_CR0:
- if (info->intercept == x86_intercept_cr_read)
- icpt_info.exit_code += info->modrm_reg;
- break;
- case SVM_EXIT_WRITE_CR0: {
- unsigned long cr0, val;
- u64 intercept;
-
- if (info->intercept == x86_intercept_cr_write)
- icpt_info.exit_code += info->modrm_reg;
-
- if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0)
- break;
-
- intercept = svm->nested.intercept;
-
- if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
- break;
-
- cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
- val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
-
- if (info->intercept == x86_intercept_lmsw) {
- cr0 &= 0xfUL;
- val &= 0xfUL;
- /* lmsw can't clear PE - catch this here */
- if (cr0 & X86_CR0_PE)
- val |= X86_CR0_PE;
- }
-
- if (cr0 ^ val)
- icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
-
- break;
- }
- case SVM_EXIT_READ_DR0:
- case SVM_EXIT_WRITE_DR0:
- icpt_info.exit_code += info->modrm_reg;
- break;
- case SVM_EXIT_MSR:
- if (info->intercept == x86_intercept_wrmsr)
- vmcb->control.exit_info_1 = 1;
- else
- vmcb->control.exit_info_1 = 0;
- break;
- case SVM_EXIT_PAUSE:
- /*
- * We get this for NOP only, but pause
- * is rep not, check this here
- */
- if (info->rep_prefix != REPE_PREFIX)
- goto out;
- case SVM_EXIT_IOIO: {
- u64 exit_info;
- u32 bytes;
-
- exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16;
-
- if (info->intercept == x86_intercept_in ||
- info->intercept == x86_intercept_ins) {
- exit_info |= SVM_IOIO_TYPE_MASK;
- bytes = info->src_bytes;
- } else {
- bytes = info->dst_bytes;
- }
-
- if (info->intercept == x86_intercept_outs ||
- info->intercept == x86_intercept_ins)
- exit_info |= SVM_IOIO_STR_MASK;
-
- if (info->rep_prefix)
- exit_info |= SVM_IOIO_REP_MASK;
-
- bytes = min(bytes, 4u);
-
- exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
-
- exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
-
- vmcb->control.exit_info_1 = exit_info;
- vmcb->control.exit_info_2 = info->next_rip;
-
- break;
- }
- default:
- break;
- }
-
- vmcb->control.next_rip = info->next_rip;
- vmcb->control.exit_code = icpt_info.exit_code;
- vmexit = nested_svm_exit_handled(svm);
-
- ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
- : X86EMUL_CONTINUE;
-
-out:
- return ret;
-}
-
-static struct kvm_x86_ops svm_x86_ops = {
- .cpu_has_kvm_support = has_svm,
- .disabled_by_bios = is_disabled,
- .hardware_setup = svm_hardware_setup,
- .hardware_unsetup = svm_hardware_unsetup,
- .check_processor_compatibility = svm_check_processor_compat,
- .hardware_enable = svm_hardware_enable,
- .hardware_disable = svm_hardware_disable,
- .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
-
- .vcpu_create = svm_create_vcpu,
- .vcpu_free = svm_free_vcpu,
- .vcpu_reset = svm_vcpu_reset,
-
- .prepare_guest_switch = svm_prepare_guest_switch,
- .vcpu_load = svm_vcpu_load,
- .vcpu_put = svm_vcpu_put,
-
- .set_guest_debug = svm_guest_debug,
- .get_msr = svm_get_msr,
- .set_msr = svm_set_msr,
- .get_segment_base = svm_get_segment_base,
- .get_segment = svm_get_segment,
- .set_segment = svm_set_segment,
- .get_cpl = svm_get_cpl,
- .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
- .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
- .decache_cr3 = svm_decache_cr3,
- .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
- .set_cr0 = svm_set_cr0,
- .set_cr3 = svm_set_cr3,
- .set_cr4 = svm_set_cr4,
- .set_efer = svm_set_efer,
- .get_idt = svm_get_idt,
- .set_idt = svm_set_idt,
- .get_gdt = svm_get_gdt,
- .set_gdt = svm_set_gdt,
- .set_dr7 = svm_set_dr7,
- .cache_reg = svm_cache_reg,
- .get_rflags = svm_get_rflags,
- .set_rflags = svm_set_rflags,
- .fpu_activate = svm_fpu_activate,
- .fpu_deactivate = svm_fpu_deactivate,
-
- .tlb_flush = svm_flush_tlb,
-
- .run = svm_vcpu_run,
- .handle_exit = handle_exit,
- .skip_emulated_instruction = skip_emulated_instruction,
- .set_interrupt_shadow = svm_set_interrupt_shadow,
- .get_interrupt_shadow = svm_get_interrupt_shadow,
- .patch_hypercall = svm_patch_hypercall,
- .set_irq = svm_set_irq,
- .set_nmi = svm_inject_nmi,
- .queue_exception = svm_queue_exception,
- .cancel_injection = svm_cancel_injection,
- .interrupt_allowed = svm_interrupt_allowed,
- .nmi_allowed = svm_nmi_allowed,
- .get_nmi_mask = svm_get_nmi_mask,
- .set_nmi_mask = svm_set_nmi_mask,
- .enable_nmi_window = enable_nmi_window,
- .enable_irq_window = enable_irq_window,
- .update_cr8_intercept = update_cr8_intercept,
-
- .set_tss_addr = svm_set_tss_addr,
- .get_tdp_level = get_npt_level,
- .get_mt_mask = svm_get_mt_mask,
-
- .get_exit_info = svm_get_exit_info,
-
- .get_lpage_level = svm_get_lpage_level,
-
- .cpuid_update = svm_cpuid_update,
-
- .rdtscp_supported = svm_rdtscp_supported,
-
- .set_supported_cpuid = svm_set_supported_cpuid,
-
- .has_wbinvd_exit = svm_has_wbinvd_exit,
-
- .set_tsc_khz = svm_set_tsc_khz,
- .write_tsc_offset = svm_write_tsc_offset,
- .adjust_tsc_offset = svm_adjust_tsc_offset,
- .compute_tsc_offset = svm_compute_tsc_offset,
- .read_l1_tsc = svm_read_l1_tsc,
-
- .set_tdp_cr3 = set_tdp_cr3,
-
- .check_intercept = svm_check_intercept,
-};
-
-static int __init svm_init(void)
-{
- return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
- __alignof__(struct vcpu_svm), THIS_MODULE);
-}
-
-static void __exit svm_exit(void)
-{
- kvm_exit();
-}
-
-module_init(svm_init)
-module_exit(svm_exit)
diff --git a/ANDROID_3.4.5/arch/x86/kvm/timer.c b/ANDROID_3.4.5/arch/x86/kvm/timer.c
deleted file mode 100644
index 6b85cc64..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/timer.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * timer support
- *
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- */
-
-#include <linux/kvm_host.h>
-#include <linux/kvm.h>
-#include <linux/hrtimer.h>
-#include <linux/atomic.h>
-#include "kvm_timer.h"
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
-{
- struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
- struct kvm_vcpu *vcpu = ktimer->vcpu;
- wait_queue_head_t *q = &vcpu->wq;
-
- /*
- * There is a race window between reading and incrementing, but we do
- * not care about potentially losing timer events in the !reinject
- * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
- * in vcpu_enter_guest.
- */
- if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
- atomic_inc(&ktimer->pending);
- /* FIXME: this code should not know anything about vcpus */
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
- }
-
- if (waitqueue_active(q))
- wake_up_interruptible(q);
-
- if (ktimer->t_ops->is_periodic(ktimer)) {
- hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
- return HRTIMER_RESTART;
- } else
- return HRTIMER_NORESTART;
-}
diff --git a/ANDROID_3.4.5/arch/x86/kvm/trace.h b/ANDROID_3.4.5/arch/x86/kvm/trace.h
deleted file mode 100644
index 911d2641..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/trace.h
+++ /dev/null
@@ -1,830 +0,0 @@
-#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_KVM_H
-
-#include <linux/tracepoint.h>
-#include <asm/vmx.h>
-#include <asm/svm.h>
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM kvm
-
-/*
- * Tracepoint for guest mode entry.
- */
-TRACE_EVENT(kvm_entry,
- TP_PROTO(unsigned int vcpu_id),
- TP_ARGS(vcpu_id),
-
- TP_STRUCT__entry(
- __field( unsigned int, vcpu_id )
- ),
-
- TP_fast_assign(
- __entry->vcpu_id = vcpu_id;
- ),
-
- TP_printk("vcpu %u", __entry->vcpu_id)
-);
-
-/*
- * Tracepoint for hypercall.
- */
-TRACE_EVENT(kvm_hypercall,
- TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1,
- unsigned long a2, unsigned long a3),
- TP_ARGS(nr, a0, a1, a2, a3),
-
- TP_STRUCT__entry(
- __field( unsigned long, nr )
- __field( unsigned long, a0 )
- __field( unsigned long, a1 )
- __field( unsigned long, a2 )
- __field( unsigned long, a3 )
- ),
-
- TP_fast_assign(
- __entry->nr = nr;
- __entry->a0 = a0;
- __entry->a1 = a1;
- __entry->a2 = a2;
- __entry->a3 = a3;
- ),
-
- TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx",
- __entry->nr, __entry->a0, __entry->a1, __entry->a2,
- __entry->a3)
-);
-
-/*
- * Tracepoint for hypercall.
- */
-TRACE_EVENT(kvm_hv_hypercall,
- TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx,
- __u64 ingpa, __u64 outgpa),
- TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
-
- TP_STRUCT__entry(
- __field( __u16, rep_cnt )
- __field( __u16, rep_idx )
- __field( __u64, ingpa )
- __field( __u64, outgpa )
- __field( __u16, code )
- __field( bool, fast )
- ),
-
- TP_fast_assign(
- __entry->rep_cnt = rep_cnt;
- __entry->rep_idx = rep_idx;
- __entry->ingpa = ingpa;
- __entry->outgpa = outgpa;
- __entry->code = code;
- __entry->fast = fast;
- ),
-
- TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
- __entry->code, __entry->fast ? "fast" : "slow",
- __entry->rep_cnt, __entry->rep_idx, __entry->ingpa,
- __entry->outgpa)
-);
-
-/*
- * Tracepoint for PIO.
- */
-TRACE_EVENT(kvm_pio,
- TP_PROTO(unsigned int rw, unsigned int port, unsigned int size,
- unsigned int count),
- TP_ARGS(rw, port, size, count),
-
- TP_STRUCT__entry(
- __field( unsigned int, rw )
- __field( unsigned int, port )
- __field( unsigned int, size )
- __field( unsigned int, count )
- ),
-
- TP_fast_assign(
- __entry->rw = rw;
- __entry->port = port;
- __entry->size = size;
- __entry->count = count;
- ),
-
- TP_printk("pio_%s at 0x%x size %d count %d",
- __entry->rw ? "write" : "read",
- __entry->port, __entry->size, __entry->count)
-);
-
-/*
- * Tracepoint for cpuid.
- */
-TRACE_EVENT(kvm_cpuid,
- TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx,
- unsigned long rcx, unsigned long rdx),
- TP_ARGS(function, rax, rbx, rcx, rdx),
-
- TP_STRUCT__entry(
- __field( unsigned int, function )
- __field( unsigned long, rax )
- __field( unsigned long, rbx )
- __field( unsigned long, rcx )
- __field( unsigned long, rdx )
- ),
-
- TP_fast_assign(
- __entry->function = function;
- __entry->rax = rax;
- __entry->rbx = rbx;
- __entry->rcx = rcx;
- __entry->rdx = rdx;
- ),
-
- TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx",
- __entry->function, __entry->rax,
- __entry->rbx, __entry->rcx, __entry->rdx)
-);
-
-#define AREG(x) { APIC_##x, "APIC_" #x }
-
-#define kvm_trace_symbol_apic \
- AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \
- AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \
- AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \
- AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \
- AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \
- AREG(ECTRL)
-/*
- * Tracepoint for apic access.
- */
-TRACE_EVENT(kvm_apic,
- TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val),
- TP_ARGS(rw, reg, val),
-
- TP_STRUCT__entry(
- __field( unsigned int, rw )
- __field( unsigned int, reg )
- __field( unsigned int, val )
- ),
-
- TP_fast_assign(
- __entry->rw = rw;
- __entry->reg = reg;
- __entry->val = val;
- ),
-
- TP_printk("apic_%s %s = 0x%x",
- __entry->rw ? "write" : "read",
- __print_symbolic(__entry->reg, kvm_trace_symbol_apic),
- __entry->val)
-);
-
-#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
-#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
-
-#define KVM_ISA_VMX 1
-#define KVM_ISA_SVM 2
-
-#define VMX_EXIT_REASONS \
- { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
- { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \
- { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \
- { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \
- { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \
- { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \
- { EXIT_REASON_CPUID, "CPUID" }, \
- { EXIT_REASON_HLT, "HLT" }, \
- { EXIT_REASON_INVLPG, "INVLPG" }, \
- { EXIT_REASON_RDPMC, "RDPMC" }, \
- { EXIT_REASON_RDTSC, "RDTSC" }, \
- { EXIT_REASON_VMCALL, "VMCALL" }, \
- { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \
- { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \
- { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \
- { EXIT_REASON_VMPTRST, "VMPTRST" }, \
- { EXIT_REASON_VMREAD, "VMREAD" }, \
- { EXIT_REASON_VMRESUME, "VMRESUME" }, \
- { EXIT_REASON_VMWRITE, "VMWRITE" }, \
- { EXIT_REASON_VMOFF, "VMOFF" }, \
- { EXIT_REASON_VMON, "VMON" }, \
- { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \
- { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \
- { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \
- { EXIT_REASON_MSR_READ, "MSR_READ" }, \
- { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \
- { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \
- { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \
- { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \
- { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \
- { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \
- { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \
- { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \
- { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \
- { EXIT_REASON_WBINVD, "WBINVD" }
-
-#define SVM_EXIT_REASONS \
- { SVM_EXIT_READ_CR0, "read_cr0" }, \
- { SVM_EXIT_READ_CR3, "read_cr3" }, \
- { SVM_EXIT_READ_CR4, "read_cr4" }, \
- { SVM_EXIT_READ_CR8, "read_cr8" }, \
- { SVM_EXIT_WRITE_CR0, "write_cr0" }, \
- { SVM_EXIT_WRITE_CR3, "write_cr3" }, \
- { SVM_EXIT_WRITE_CR4, "write_cr4" }, \
- { SVM_EXIT_WRITE_CR8, "write_cr8" }, \
- { SVM_EXIT_READ_DR0, "read_dr0" }, \
- { SVM_EXIT_READ_DR1, "read_dr1" }, \
- { SVM_EXIT_READ_DR2, "read_dr2" }, \
- { SVM_EXIT_READ_DR3, "read_dr3" }, \
- { SVM_EXIT_WRITE_DR0, "write_dr0" }, \
- { SVM_EXIT_WRITE_DR1, "write_dr1" }, \
- { SVM_EXIT_WRITE_DR2, "write_dr2" }, \
- { SVM_EXIT_WRITE_DR3, "write_dr3" }, \
- { SVM_EXIT_WRITE_DR5, "write_dr5" }, \
- { SVM_EXIT_WRITE_DR7, "write_dr7" }, \
- { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \
- { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \
- { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \
- { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \
- { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \
- { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \
- { SVM_EXIT_INTR, "interrupt" }, \
- { SVM_EXIT_NMI, "nmi" }, \
- { SVM_EXIT_SMI, "smi" }, \
- { SVM_EXIT_INIT, "init" }, \
- { SVM_EXIT_VINTR, "vintr" }, \
- { SVM_EXIT_CPUID, "cpuid" }, \
- { SVM_EXIT_INVD, "invd" }, \
- { SVM_EXIT_HLT, "hlt" }, \
- { SVM_EXIT_INVLPG, "invlpg" }, \
- { SVM_EXIT_INVLPGA, "invlpga" }, \
- { SVM_EXIT_IOIO, "io" }, \
- { SVM_EXIT_MSR, "msr" }, \
- { SVM_EXIT_TASK_SWITCH, "task_switch" }, \
- { SVM_EXIT_SHUTDOWN, "shutdown" }, \
- { SVM_EXIT_VMRUN, "vmrun" }, \
- { SVM_EXIT_VMMCALL, "hypercall" }, \
- { SVM_EXIT_VMLOAD, "vmload" }, \
- { SVM_EXIT_VMSAVE, "vmsave" }, \
- { SVM_EXIT_STGI, "stgi" }, \
- { SVM_EXIT_CLGI, "clgi" }, \
- { SVM_EXIT_SKINIT, "skinit" }, \
- { SVM_EXIT_WBINVD, "wbinvd" }, \
- { SVM_EXIT_MONITOR, "monitor" }, \
- { SVM_EXIT_MWAIT, "mwait" }, \
- { SVM_EXIT_XSETBV, "xsetbv" }, \
- { SVM_EXIT_NPF, "npf" }
-
-/*
- * Tracepoint for kvm guest exit:
- */
-TRACE_EVENT(kvm_exit,
- TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
- TP_ARGS(exit_reason, vcpu, isa),
-
- TP_STRUCT__entry(
- __field( unsigned int, exit_reason )
- __field( unsigned long, guest_rip )
- __field( u32, isa )
- __field( u64, info1 )
- __field( u64, info2 )
- ),
-
- TP_fast_assign(
- __entry->exit_reason = exit_reason;
- __entry->guest_rip = kvm_rip_read(vcpu);
- __entry->isa = isa;
- kvm_x86_ops->get_exit_info(vcpu, &__entry->info1,
- &__entry->info2);
- ),
-
- TP_printk("reason %s rip 0x%lx info %llx %llx",
- (__entry->isa == KVM_ISA_VMX) ?
- __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) :
- __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS),
- __entry->guest_rip, __entry->info1, __entry->info2)
-);
-
-/*
- * Tracepoint for kvm interrupt injection:
- */
-TRACE_EVENT(kvm_inj_virq,
- TP_PROTO(unsigned int irq),
- TP_ARGS(irq),
-
- TP_STRUCT__entry(
- __field( unsigned int, irq )
- ),
-
- TP_fast_assign(
- __entry->irq = irq;
- ),
-
- TP_printk("irq %u", __entry->irq)
-);
-
-#define EXS(x) { x##_VECTOR, "#" #x }
-
-#define kvm_trace_sym_exc \
- EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \
- EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \
- EXS(MF), EXS(MC)
-
-/*
- * Tracepoint for kvm interrupt injection:
- */
-TRACE_EVENT(kvm_inj_exception,
- TP_PROTO(unsigned exception, bool has_error, unsigned error_code),
- TP_ARGS(exception, has_error, error_code),
-
- TP_STRUCT__entry(
- __field( u8, exception )
- __field( u8, has_error )
- __field( u32, error_code )
- ),
-
- TP_fast_assign(
- __entry->exception = exception;
- __entry->has_error = has_error;
- __entry->error_code = error_code;
- ),
-
- TP_printk("%s (0x%x)",
- __print_symbolic(__entry->exception, kvm_trace_sym_exc),
- /* FIXME: don't print error_code if not present */
- __entry->has_error ? __entry->error_code : 0)
-);
-
-/*
- * Tracepoint for page fault.
- */
-TRACE_EVENT(kvm_page_fault,
- TP_PROTO(unsigned long fault_address, unsigned int error_code),
- TP_ARGS(fault_address, error_code),
-
- TP_STRUCT__entry(
- __field( unsigned long, fault_address )
- __field( unsigned int, error_code )
- ),
-
- TP_fast_assign(
- __entry->fault_address = fault_address;
- __entry->error_code = error_code;
- ),
-
- TP_printk("address %lx error_code %x",
- __entry->fault_address, __entry->error_code)
-);
-
-/*
- * Tracepoint for guest MSR access.
- */
-TRACE_EVENT(kvm_msr,
- TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception),
- TP_ARGS(write, ecx, data, exception),
-
- TP_STRUCT__entry(
- __field( unsigned, write )
- __field( u32, ecx )
- __field( u64, data )
- __field( u8, exception )
- ),
-
- TP_fast_assign(
- __entry->write = write;
- __entry->ecx = ecx;
- __entry->data = data;
- __entry->exception = exception;
- ),
-
- TP_printk("msr_%s %x = 0x%llx%s",
- __entry->write ? "write" : "read",
- __entry->ecx, __entry->data,
- __entry->exception ? " (#GP)" : "")
-);
-
-#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false)
-#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false)
-#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true)
-#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true)
-
-/*
- * Tracepoint for guest CR access.
- */
-TRACE_EVENT(kvm_cr,
- TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val),
- TP_ARGS(rw, cr, val),
-
- TP_STRUCT__entry(
- __field( unsigned int, rw )
- __field( unsigned int, cr )
- __field( unsigned long, val )
- ),
-
- TP_fast_assign(
- __entry->rw = rw;
- __entry->cr = cr;
- __entry->val = val;
- ),
-
- TP_printk("cr_%s %x = 0x%lx",
- __entry->rw ? "write" : "read",
- __entry->cr, __entry->val)
-);
-
-#define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val)
-#define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val)
-
-TRACE_EVENT(kvm_pic_set_irq,
- TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced),
- TP_ARGS(chip, pin, elcr, imr, coalesced),
-
- TP_STRUCT__entry(
- __field( __u8, chip )
- __field( __u8, pin )
- __field( __u8, elcr )
- __field( __u8, imr )
- __field( bool, coalesced )
- ),
-
- TP_fast_assign(
- __entry->chip = chip;
- __entry->pin = pin;
- __entry->elcr = elcr;
- __entry->imr = imr;
- __entry->coalesced = coalesced;
- ),
-
- TP_printk("chip %u pin %u (%s%s)%s",
- __entry->chip, __entry->pin,
- (__entry->elcr & (1 << __entry->pin)) ? "level":"edge",
- (__entry->imr & (1 << __entry->pin)) ? "|masked":"",
- __entry->coalesced ? " (coalesced)" : "")
-);
-
-#define kvm_apic_dst_shorthand \
- {0x0, "dst"}, \
- {0x1, "self"}, \
- {0x2, "all"}, \
- {0x3, "all-but-self"}
-
-TRACE_EVENT(kvm_apic_ipi,
- TP_PROTO(__u32 icr_low, __u32 dest_id),
- TP_ARGS(icr_low, dest_id),
-
- TP_STRUCT__entry(
- __field( __u32, icr_low )
- __field( __u32, dest_id )
- ),
-
- TP_fast_assign(
- __entry->icr_low = icr_low;
- __entry->dest_id = dest_id;
- ),
-
- TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)",
- __entry->dest_id, (u8)__entry->icr_low,
- __print_symbolic((__entry->icr_low >> 8 & 0x7),
- kvm_deliver_mode),
- (__entry->icr_low & (1<<11)) ? "logical" : "physical",
- (__entry->icr_low & (1<<14)) ? "assert" : "de-assert",
- (__entry->icr_low & (1<<15)) ? "level" : "edge",
- __print_symbolic((__entry->icr_low >> 18 & 0x3),
- kvm_apic_dst_shorthand))
-);
-
-TRACE_EVENT(kvm_apic_accept_irq,
- TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec, bool coalesced),
- TP_ARGS(apicid, dm, tm, vec, coalesced),
-
- TP_STRUCT__entry(
- __field( __u32, apicid )
- __field( __u16, dm )
- __field( __u8, tm )
- __field( __u8, vec )
- __field( bool, coalesced )
- ),
-
- TP_fast_assign(
- __entry->apicid = apicid;
- __entry->dm = dm;
- __entry->tm = tm;
- __entry->vec = vec;
- __entry->coalesced = coalesced;
- ),
-
- TP_printk("apicid %x vec %u (%s|%s)%s",
- __entry->apicid, __entry->vec,
- __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode),
- __entry->tm ? "level" : "edge",
- __entry->coalesced ? " (coalesced)" : "")
-);
-
-/*
- * Tracepoint for nested VMRUN
- */
-TRACE_EVENT(kvm_nested_vmrun,
- TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl,
- __u32 event_inj, bool npt),
- TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt),
-
- TP_STRUCT__entry(
- __field( __u64, rip )
- __field( __u64, vmcb )
- __field( __u64, nested_rip )
- __field( __u32, int_ctl )
- __field( __u32, event_inj )
- __field( bool, npt )
- ),
-
- TP_fast_assign(
- __entry->rip = rip;
- __entry->vmcb = vmcb;
- __entry->nested_rip = nested_rip;
- __entry->int_ctl = int_ctl;
- __entry->event_inj = event_inj;
- __entry->npt = npt;
- ),
-
- TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
- "event_inj: 0x%08x npt: %s",
- __entry->rip, __entry->vmcb, __entry->nested_rip,
- __entry->int_ctl, __entry->event_inj,
- __entry->npt ? "on" : "off")
-);
-
-TRACE_EVENT(kvm_nested_intercepts,
- TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept),
- TP_ARGS(cr_read, cr_write, exceptions, intercept),
-
- TP_STRUCT__entry(
- __field( __u16, cr_read )
- __field( __u16, cr_write )
- __field( __u32, exceptions )
- __field( __u64, intercept )
- ),
-
- TP_fast_assign(
- __entry->cr_read = cr_read;
- __entry->cr_write = cr_write;
- __entry->exceptions = exceptions;
- __entry->intercept = intercept;
- ),
-
- TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx",
- __entry->cr_read, __entry->cr_write, __entry->exceptions,
- __entry->intercept)
-);
-/*
- * Tracepoint for #VMEXIT while nested
- */
-TRACE_EVENT(kvm_nested_vmexit,
- TP_PROTO(__u64 rip, __u32 exit_code,
- __u64 exit_info1, __u64 exit_info2,
- __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
- TP_ARGS(rip, exit_code, exit_info1, exit_info2,
- exit_int_info, exit_int_info_err, isa),
-
- TP_STRUCT__entry(
- __field( __u64, rip )
- __field( __u32, exit_code )
- __field( __u64, exit_info1 )
- __field( __u64, exit_info2 )
- __field( __u32, exit_int_info )
- __field( __u32, exit_int_info_err )
- __field( __u32, isa )
- ),
-
- TP_fast_assign(
- __entry->rip = rip;
- __entry->exit_code = exit_code;
- __entry->exit_info1 = exit_info1;
- __entry->exit_info2 = exit_info2;
- __entry->exit_int_info = exit_int_info;
- __entry->exit_int_info_err = exit_int_info_err;
- __entry->isa = isa;
- ),
- TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
- "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
- __entry->rip,
- (__entry->isa == KVM_ISA_VMX) ?
- __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
- __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
- __entry->exit_info1, __entry->exit_info2,
- __entry->exit_int_info, __entry->exit_int_info_err)
-);
-
-/*
- * Tracepoint for #VMEXIT reinjected to the guest
- */
-TRACE_EVENT(kvm_nested_vmexit_inject,
- TP_PROTO(__u32 exit_code,
- __u64 exit_info1, __u64 exit_info2,
- __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
- TP_ARGS(exit_code, exit_info1, exit_info2,
- exit_int_info, exit_int_info_err, isa),
-
- TP_STRUCT__entry(
- __field( __u32, exit_code )
- __field( __u64, exit_info1 )
- __field( __u64, exit_info2 )
- __field( __u32, exit_int_info )
- __field( __u32, exit_int_info_err )
- __field( __u32, isa )
- ),
-
- TP_fast_assign(
- __entry->exit_code = exit_code;
- __entry->exit_info1 = exit_info1;
- __entry->exit_info2 = exit_info2;
- __entry->exit_int_info = exit_int_info;
- __entry->exit_int_info_err = exit_int_info_err;
- __entry->isa = isa;
- ),
-
- TP_printk("reason: %s ext_inf1: 0x%016llx "
- "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
- (__entry->isa == KVM_ISA_VMX) ?
- __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
- __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
- __entry->exit_info1, __entry->exit_info2,
- __entry->exit_int_info, __entry->exit_int_info_err)
-);
-
-/*
- * Tracepoint for nested #vmexit because of interrupt pending
- */
-TRACE_EVENT(kvm_nested_intr_vmexit,
- TP_PROTO(__u64 rip),
- TP_ARGS(rip),
-
- TP_STRUCT__entry(
- __field( __u64, rip )
- ),
-
- TP_fast_assign(
- __entry->rip = rip
- ),
-
- TP_printk("rip: 0x%016llx", __entry->rip)
-);
-
-/*
- * Tracepoint for nested #vmexit because of interrupt pending
- */
-TRACE_EVENT(kvm_invlpga,
- TP_PROTO(__u64 rip, int asid, u64 address),
- TP_ARGS(rip, asid, address),
-
- TP_STRUCT__entry(
- __field( __u64, rip )
- __field( int, asid )
- __field( __u64, address )
- ),
-
- TP_fast_assign(
- __entry->rip = rip;
- __entry->asid = asid;
- __entry->address = address;
- ),
-
- TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx",
- __entry->rip, __entry->asid, __entry->address)
-);
-
-/*
- * Tracepoint for nested #vmexit because of interrupt pending
- */
-TRACE_EVENT(kvm_skinit,
- TP_PROTO(__u64 rip, __u32 slb),
- TP_ARGS(rip, slb),
-
- TP_STRUCT__entry(
- __field( __u64, rip )
- __field( __u32, slb )
- ),
-
- TP_fast_assign(
- __entry->rip = rip;
- __entry->slb = slb;
- ),
-
- TP_printk("rip: 0x%016llx slb: 0x%08x",
- __entry->rip, __entry->slb)
-);
-
-#define __print_insn(insn, ilen) ({ \
- int i; \
- const char *ret = p->buffer + p->len; \
- \
- for (i = 0; i < ilen; ++i) \
- trace_seq_printf(p, " %02x", insn[i]); \
- trace_seq_printf(p, "%c", 0); \
- ret; \
- })
-
-#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
-#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
-#define KVM_EMUL_INSN_F_CS_D (1 << 2)
-#define KVM_EMUL_INSN_F_CS_L (1 << 3)
-
-#define kvm_trace_symbol_emul_flags \
- { 0, "real" }, \
- { KVM_EMUL_INSN_F_CR0_PE \
- | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \
- { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \
- { KVM_EMUL_INSN_F_CR0_PE \
- | KVM_EMUL_INSN_F_CS_D, "prot32" }, \
- { KVM_EMUL_INSN_F_CR0_PE \
- | KVM_EMUL_INSN_F_CS_L, "prot64" }
-
-#define kei_decode_mode(mode) ({ \
- u8 flags = 0xff; \
- switch (mode) { \
- case X86EMUL_MODE_REAL: \
- flags = 0; \
- break; \
- case X86EMUL_MODE_VM86: \
- flags = KVM_EMUL_INSN_F_EFL_VM; \
- break; \
- case X86EMUL_MODE_PROT16: \
- flags = KVM_EMUL_INSN_F_CR0_PE; \
- break; \
- case X86EMUL_MODE_PROT32: \
- flags = KVM_EMUL_INSN_F_CR0_PE \
- | KVM_EMUL_INSN_F_CS_D; \
- break; \
- case X86EMUL_MODE_PROT64: \
- flags = KVM_EMUL_INSN_F_CR0_PE \
- | KVM_EMUL_INSN_F_CS_L; \
- break; \
- } \
- flags; \
- })
-
-TRACE_EVENT(kvm_emulate_insn,
- TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed),
- TP_ARGS(vcpu, failed),
-
- TP_STRUCT__entry(
- __field( __u64, rip )
- __field( __u32, csbase )
- __field( __u8, len )
- __array( __u8, insn, 15 )
- __field( __u8, flags )
- __field( __u8, failed )
- ),
-
- TP_fast_assign(
- __entry->rip = vcpu->arch.emulate_ctxt.fetch.start;
- __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
- __entry->len = vcpu->arch.emulate_ctxt._eip
- - vcpu->arch.emulate_ctxt.fetch.start;
- memcpy(__entry->insn,
- vcpu->arch.emulate_ctxt.fetch.data,
- 15);
- __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
- __entry->failed = failed;
- ),
-
- TP_printk("%x:%llx:%s (%s)%s",
- __entry->csbase, __entry->rip,
- __print_insn(__entry->insn, __entry->len),
- __print_symbolic(__entry->flags,
- kvm_trace_symbol_emul_flags),
- __entry->failed ? " failed" : ""
- )
- );
-
-#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
-#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
-
-TRACE_EVENT(
- vcpu_match_mmio,
- TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match),
- TP_ARGS(gva, gpa, write, gpa_match),
-
- TP_STRUCT__entry(
- __field(gva_t, gva)
- __field(gpa_t, gpa)
- __field(bool, write)
- __field(bool, gpa_match)
- ),
-
- TP_fast_assign(
- __entry->gva = gva;
- __entry->gpa = gpa;
- __entry->write = write;
- __entry->gpa_match = gpa_match
- ),
-
- TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa,
- __entry->write ? "Write" : "Read",
- __entry->gpa_match ? "GPA" : "GVA")
-);
-#endif /* _TRACE_KVM_H */
-
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH arch/x86/kvm
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE trace
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
diff --git a/ANDROID_3.4.5/arch/x86/kvm/tss.h b/ANDROID_3.4.5/arch/x86/kvm/tss.h
deleted file mode 100644
index 622aa10f..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/tss.h
+++ /dev/null
@@ -1,59 +0,0 @@
-#ifndef __TSS_SEGMENT_H
-#define __TSS_SEGMENT_H
-
-struct tss_segment_32 {
- u32 prev_task_link;
- u32 esp0;
- u32 ss0;
- u32 esp1;
- u32 ss1;
- u32 esp2;
- u32 ss2;
- u32 cr3;
- u32 eip;
- u32 eflags;
- u32 eax;
- u32 ecx;
- u32 edx;
- u32 ebx;
- u32 esp;
- u32 ebp;
- u32 esi;
- u32 edi;
- u32 es;
- u32 cs;
- u32 ss;
- u32 ds;
- u32 fs;
- u32 gs;
- u32 ldt_selector;
- u16 t;
- u16 io_map;
-};
-
-struct tss_segment_16 {
- u16 prev_task_link;
- u16 sp0;
- u16 ss0;
- u16 sp1;
- u16 ss1;
- u16 sp2;
- u16 ss2;
- u16 ip;
- u16 flag;
- u16 ax;
- u16 cx;
- u16 dx;
- u16 bx;
- u16 sp;
- u16 bp;
- u16 si;
- u16 di;
- u16 es;
- u16 cs;
- u16 ss;
- u16 ds;
- u16 ldt;
-};
-
-#endif
diff --git a/ANDROID_3.4.5/arch/x86/kvm/vmx.c b/ANDROID_3.4.5/arch/x86/kvm/vmx.c
deleted file mode 100644
index 4ff0ab9b..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/vmx.c
+++ /dev/null
@@ -1,7272 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * Authors:
- * Avi Kivity <avi@qumranet.com>
- * Yaniv Kamay <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "irq.h"
-#include "mmu.h"
-#include "cpuid.h"
-
-#include <linux/kvm_host.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/sched.h>
-#include <linux/moduleparam.h>
-#include <linux/ftrace_event.h>
-#include <linux/slab.h>
-#include <linux/tboot.h>
-#include "kvm_cache_regs.h"
-#include "x86.h"
-
-#include <asm/io.h>
-#include <asm/desc.h>
-#include <asm/vmx.h>
-#include <asm/virtext.h>
-#include <asm/mce.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/perf_event.h>
-
-#include "trace.h"
-
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
-#define __ex_clear(x, reg) \
- ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
-
-MODULE_AUTHOR("Qumranet");
-MODULE_LICENSE("GPL");
-
-static bool __read_mostly enable_vpid = 1;
-module_param_named(vpid, enable_vpid, bool, 0444);
-
-static bool __read_mostly flexpriority_enabled = 1;
-module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
-
-static bool __read_mostly enable_ept = 1;
-module_param_named(ept, enable_ept, bool, S_IRUGO);
-
-static bool __read_mostly enable_unrestricted_guest = 1;
-module_param_named(unrestricted_guest,
- enable_unrestricted_guest, bool, S_IRUGO);
-
-static bool __read_mostly emulate_invalid_guest_state = 0;
-module_param(emulate_invalid_guest_state, bool, S_IRUGO);
-
-static bool __read_mostly vmm_exclusive = 1;
-module_param(vmm_exclusive, bool, S_IRUGO);
-
-static bool __read_mostly fasteoi = 1;
-module_param(fasteoi, bool, S_IRUGO);
-
-/*
- * If nested=1, nested virtualization is supported, i.e., guests may use
- * VMX and be a hypervisor for its own guests. If nested=0, guests may not
- * use VMX instructions.
- */
-static bool __read_mostly nested = 0;
-module_param(nested, bool, S_IRUGO);
-
-#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
- (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
-#define KVM_GUEST_CR0_MASK \
- (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
-#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
- (X86_CR0_WP | X86_CR0_NE)
-#define KVM_VM_CR0_ALWAYS_ON \
- (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
-#define KVM_CR4_GUEST_OWNED_BITS \
- (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXMMEXCPT)
-
-#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
-#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
-
-#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
-
-/*
- * These 2 parameters are used to config the controls for Pause-Loop Exiting:
- * ple_gap: upper bound on the amount of time between two successive
- * executions of PAUSE in a loop. Also indicate if ple enabled.
- * According to test, this time is usually smaller than 128 cycles.
- * ple_window: upper bound on the amount of time a guest is allowed to execute
- * in a PAUSE loop. Tests indicate that most spinlocks are held for
- * less than 2^12 cycles
- * Time is measured based on a counter that runs at the same rate as the TSC,
- * refer SDM volume 3b section 21.6.13 & 22.1.3.
- */
-#define KVM_VMX_DEFAULT_PLE_GAP 128
-#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
-static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
-module_param(ple_gap, int, S_IRUGO);
-
-static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
-module_param(ple_window, int, S_IRUGO);
-
-#define NR_AUTOLOAD_MSRS 8
-#define VMCS02_POOL_SIZE 1
-
-struct vmcs {
- u32 revision_id;
- u32 abort;
- char data[0];
-};
-
-/*
- * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
- * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
- * loaded on this CPU (so we can clear them if the CPU goes down).
- */
-struct loaded_vmcs {
- struct vmcs *vmcs;
- int cpu;
- int launched;
- struct list_head loaded_vmcss_on_cpu_link;
-};
-
-struct shared_msr_entry {
- unsigned index;
- u64 data;
- u64 mask;
-};
-
-/*
- * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
- * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
- * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
- * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
- * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
- * More than one of these structures may exist, if L1 runs multiple L2 guests.
- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
- * underlying hardware which will be used to run L2.
- * This structure is packed to ensure that its layout is identical across
- * machines (necessary for live migration).
- * If there are changes in this struct, VMCS12_REVISION must be changed.
- */
-typedef u64 natural_width;
-struct __packed vmcs12 {
- /* According to the Intel spec, a VMCS region must start with the
- * following two fields. Then follow implementation-specific data.
- */
- u32 revision_id;
- u32 abort;
-
- u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
- u32 padding[7]; /* room for future expansion */
-
- u64 io_bitmap_a;
- u64 io_bitmap_b;
- u64 msr_bitmap;
- u64 vm_exit_msr_store_addr;
- u64 vm_exit_msr_load_addr;
- u64 vm_entry_msr_load_addr;
- u64 tsc_offset;
- u64 virtual_apic_page_addr;
- u64 apic_access_addr;
- u64 ept_pointer;
- u64 guest_physical_address;
- u64 vmcs_link_pointer;
- u64 guest_ia32_debugctl;
- u64 guest_ia32_pat;
- u64 guest_ia32_efer;
- u64 guest_ia32_perf_global_ctrl;
- u64 guest_pdptr0;
- u64 guest_pdptr1;
- u64 guest_pdptr2;
- u64 guest_pdptr3;
- u64 host_ia32_pat;
- u64 host_ia32_efer;
- u64 host_ia32_perf_global_ctrl;
- u64 padding64[8]; /* room for future expansion */
- /*
- * To allow migration of L1 (complete with its L2 guests) between
- * machines of different natural widths (32 or 64 bit), we cannot have
- * unsigned long fields with no explict size. We use u64 (aliased
- * natural_width) instead. Luckily, x86 is little-endian.
- */
- natural_width cr0_guest_host_mask;
- natural_width cr4_guest_host_mask;
- natural_width cr0_read_shadow;
- natural_width cr4_read_shadow;
- natural_width cr3_target_value0;
- natural_width cr3_target_value1;
- natural_width cr3_target_value2;
- natural_width cr3_target_value3;
- natural_width exit_qualification;
- natural_width guest_linear_address;
- natural_width guest_cr0;
- natural_width guest_cr3;
- natural_width guest_cr4;
- natural_width guest_es_base;
- natural_width guest_cs_base;
- natural_width guest_ss_base;
- natural_width guest_ds_base;
- natural_width guest_fs_base;
- natural_width guest_gs_base;
- natural_width guest_ldtr_base;
- natural_width guest_tr_base;
- natural_width guest_gdtr_base;
- natural_width guest_idtr_base;
- natural_width guest_dr7;
- natural_width guest_rsp;
- natural_width guest_rip;
- natural_width guest_rflags;
- natural_width guest_pending_dbg_exceptions;
- natural_width guest_sysenter_esp;
- natural_width guest_sysenter_eip;
- natural_width host_cr0;
- natural_width host_cr3;
- natural_width host_cr4;
- natural_width host_fs_base;
- natural_width host_gs_base;
- natural_width host_tr_base;
- natural_width host_gdtr_base;
- natural_width host_idtr_base;
- natural_width host_ia32_sysenter_esp;
- natural_width host_ia32_sysenter_eip;
- natural_width host_rsp;
- natural_width host_rip;
- natural_width paddingl[8]; /* room for future expansion */
- u32 pin_based_vm_exec_control;
- u32 cpu_based_vm_exec_control;
- u32 exception_bitmap;
- u32 page_fault_error_code_mask;
- u32 page_fault_error_code_match;
- u32 cr3_target_count;
- u32 vm_exit_controls;
- u32 vm_exit_msr_store_count;
- u32 vm_exit_msr_load_count;
- u32 vm_entry_controls;
- u32 vm_entry_msr_load_count;
- u32 vm_entry_intr_info_field;
- u32 vm_entry_exception_error_code;
- u32 vm_entry_instruction_len;
- u32 tpr_threshold;
- u32 secondary_vm_exec_control;
- u32 vm_instruction_error;
- u32 vm_exit_reason;
- u32 vm_exit_intr_info;
- u32 vm_exit_intr_error_code;
- u32 idt_vectoring_info_field;
- u32 idt_vectoring_error_code;
- u32 vm_exit_instruction_len;
- u32 vmx_instruction_info;
- u32 guest_es_limit;
- u32 guest_cs_limit;
- u32 guest_ss_limit;
- u32 guest_ds_limit;
- u32 guest_fs_limit;
- u32 guest_gs_limit;
- u32 guest_ldtr_limit;
- u32 guest_tr_limit;
- u32 guest_gdtr_limit;
- u32 guest_idtr_limit;
- u32 guest_es_ar_bytes;
- u32 guest_cs_ar_bytes;
- u32 guest_ss_ar_bytes;
- u32 guest_ds_ar_bytes;
- u32 guest_fs_ar_bytes;
- u32 guest_gs_ar_bytes;
- u32 guest_ldtr_ar_bytes;
- u32 guest_tr_ar_bytes;
- u32 guest_interruptibility_info;
- u32 guest_activity_state;
- u32 guest_sysenter_cs;
- u32 host_ia32_sysenter_cs;
- u32 padding32[8]; /* room for future expansion */
- u16 virtual_processor_id;
- u16 guest_es_selector;
- u16 guest_cs_selector;
- u16 guest_ss_selector;
- u16 guest_ds_selector;
- u16 guest_fs_selector;
- u16 guest_gs_selector;
- u16 guest_ldtr_selector;
- u16 guest_tr_selector;
- u16 host_es_selector;
- u16 host_cs_selector;
- u16 host_ss_selector;
- u16 host_ds_selector;
- u16 host_fs_selector;
- u16 host_gs_selector;
- u16 host_tr_selector;
-};
-
-/*
- * VMCS12_REVISION is an arbitrary id that should be changed if the content or
- * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
- * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
- */
-#define VMCS12_REVISION 0x11e57ed0
-
-/*
- * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
- * and any VMCS region. Although only sizeof(struct vmcs12) are used by the
- * current implementation, 4K are reserved to avoid future complications.
- */
-#define VMCS12_SIZE 0x1000
-
-/* Used to remember the last vmcs02 used for some recently used vmcs12s */
-struct vmcs02_list {
- struct list_head list;
- gpa_t vmptr;
- struct loaded_vmcs vmcs02;
-};
-
-/*
- * The nested_vmx structure is part of vcpu_vmx, and holds information we need
- * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
- */
-struct nested_vmx {
- /* Has the level1 guest done vmxon? */
- bool vmxon;
-
- /* The guest-physical address of the current VMCS L1 keeps for L2 */
- gpa_t current_vmptr;
- /* The host-usable pointer to the above */
- struct page *current_vmcs12_page;
- struct vmcs12 *current_vmcs12;
-
- /* vmcs02_list cache of VMCSs recently used to run L2 guests */
- struct list_head vmcs02_pool;
- int vmcs02_num;
- u64 vmcs01_tsc_offset;
- /* L2 must run next, and mustn't decide to exit to L1. */
- bool nested_run_pending;
- /*
- * Guest pages referred to in vmcs02 with host-physical pointers, so
- * we must keep them pinned while L2 runs.
- */
- struct page *apic_access_page;
-};
-
-struct vcpu_vmx {
- struct kvm_vcpu vcpu;
- unsigned long host_rsp;
- u8 fail;
- u8 cpl;
- bool nmi_known_unmasked;
- u32 exit_intr_info;
- u32 idt_vectoring_info;
- ulong rflags;
- struct shared_msr_entry *guest_msrs;
- int nmsrs;
- int save_nmsrs;
-#ifdef CONFIG_X86_64
- u64 msr_host_kernel_gs_base;
- u64 msr_guest_kernel_gs_base;
-#endif
- /*
- * loaded_vmcs points to the VMCS currently used in this vcpu. For a
- * non-nested (L1) guest, it always points to vmcs01. For a nested
- * guest (L2), it points to a different VMCS.
- */
- struct loaded_vmcs vmcs01;
- struct loaded_vmcs *loaded_vmcs;
- bool __launched; /* temporary, used in vmx_vcpu_run */
- struct msr_autoload {
- unsigned nr;
- struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
- struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
- } msr_autoload;
- struct {
- int loaded;
- u16 fs_sel, gs_sel, ldt_sel;
- int gs_ldt_reload_needed;
- int fs_reload_needed;
- } host_state;
- struct {
- int vm86_active;
- ulong save_rflags;
- struct kvm_save_segment {
- u16 selector;
- unsigned long base;
- u32 limit;
- u32 ar;
- } tr, es, ds, fs, gs;
- } rmode;
- struct {
- u32 bitmask; /* 4 bits per segment (1 bit per field) */
- struct kvm_save_segment seg[8];
- } segment_cache;
- int vpid;
- bool emulation_required;
-
- /* Support for vnmi-less CPUs */
- int soft_vnmi_blocked;
- ktime_t entry_time;
- s64 vnmi_blocked_time;
- u32 exit_reason;
-
- bool rdtscp_enabled;
-
- /* Support for a guest hypervisor (nested VMX) */
- struct nested_vmx nested;
-};
-
-enum segment_cache_field {
- SEG_FIELD_SEL = 0,
- SEG_FIELD_BASE = 1,
- SEG_FIELD_LIMIT = 2,
- SEG_FIELD_AR = 3,
-
- SEG_FIELD_NR = 4
-};
-
-static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
-{
- return container_of(vcpu, struct vcpu_vmx, vcpu);
-}
-
-#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
-#define FIELD(number, name) [number] = VMCS12_OFFSET(name)
-#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
- [number##_HIGH] = VMCS12_OFFSET(name)+4
-
-static unsigned short vmcs_field_to_offset_table[] = {
- FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
- FIELD(GUEST_ES_SELECTOR, guest_es_selector),
- FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
- FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
- FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
- FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
- FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
- FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
- FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
- FIELD(HOST_ES_SELECTOR, host_es_selector),
- FIELD(HOST_CS_SELECTOR, host_cs_selector),
- FIELD(HOST_SS_SELECTOR, host_ss_selector),
- FIELD(HOST_DS_SELECTOR, host_ds_selector),
- FIELD(HOST_FS_SELECTOR, host_fs_selector),
- FIELD(HOST_GS_SELECTOR, host_gs_selector),
- FIELD(HOST_TR_SELECTOR, host_tr_selector),
- FIELD64(IO_BITMAP_A, io_bitmap_a),
- FIELD64(IO_BITMAP_B, io_bitmap_b),
- FIELD64(MSR_BITMAP, msr_bitmap),
- FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
- FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
- FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
- FIELD64(TSC_OFFSET, tsc_offset),
- FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
- FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
- FIELD64(EPT_POINTER, ept_pointer),
- FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
- FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
- FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
- FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
- FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
- FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
- FIELD64(GUEST_PDPTR0, guest_pdptr0),
- FIELD64(GUEST_PDPTR1, guest_pdptr1),
- FIELD64(GUEST_PDPTR2, guest_pdptr2),
- FIELD64(GUEST_PDPTR3, guest_pdptr3),
- FIELD64(HOST_IA32_PAT, host_ia32_pat),
- FIELD64(HOST_IA32_EFER, host_ia32_efer),
- FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
- FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
- FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
- FIELD(EXCEPTION_BITMAP, exception_bitmap),
- FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
- FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
- FIELD(CR3_TARGET_COUNT, cr3_target_count),
- FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
- FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
- FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
- FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
- FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
- FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
- FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
- FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
- FIELD(TPR_THRESHOLD, tpr_threshold),
- FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
- FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
- FIELD(VM_EXIT_REASON, vm_exit_reason),
- FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
- FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
- FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
- FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
- FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
- FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
- FIELD(GUEST_ES_LIMIT, guest_es_limit),
- FIELD(GUEST_CS_LIMIT, guest_cs_limit),
- FIELD(GUEST_SS_LIMIT, guest_ss_limit),
- FIELD(GUEST_DS_LIMIT, guest_ds_limit),
- FIELD(GUEST_FS_LIMIT, guest_fs_limit),
- FIELD(GUEST_GS_LIMIT, guest_gs_limit),
- FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
- FIELD(GUEST_TR_LIMIT, guest_tr_limit),
- FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
- FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
- FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
- FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
- FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
- FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
- FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
- FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
- FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
- FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
- FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
- FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
- FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
- FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
- FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
- FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
- FIELD(CR0_READ_SHADOW, cr0_read_shadow),
- FIELD(CR4_READ_SHADOW, cr4_read_shadow),
- FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
- FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
- FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
- FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
- FIELD(EXIT_QUALIFICATION, exit_qualification),
- FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
- FIELD(GUEST_CR0, guest_cr0),
- FIELD(GUEST_CR3, guest_cr3),
- FIELD(GUEST_CR4, guest_cr4),
- FIELD(GUEST_ES_BASE, guest_es_base),
- FIELD(GUEST_CS_BASE, guest_cs_base),
- FIELD(GUEST_SS_BASE, guest_ss_base),
- FIELD(GUEST_DS_BASE, guest_ds_base),
- FIELD(GUEST_FS_BASE, guest_fs_base),
- FIELD(GUEST_GS_BASE, guest_gs_base),
- FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
- FIELD(GUEST_TR_BASE, guest_tr_base),
- FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
- FIELD(GUEST_IDTR_BASE, guest_idtr_base),
- FIELD(GUEST_DR7, guest_dr7),
- FIELD(GUEST_RSP, guest_rsp),
- FIELD(GUEST_RIP, guest_rip),
- FIELD(GUEST_RFLAGS, guest_rflags),
- FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
- FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
- FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
- FIELD(HOST_CR0, host_cr0),
- FIELD(HOST_CR3, host_cr3),
- FIELD(HOST_CR4, host_cr4),
- FIELD(HOST_FS_BASE, host_fs_base),
- FIELD(HOST_GS_BASE, host_gs_base),
- FIELD(HOST_TR_BASE, host_tr_base),
- FIELD(HOST_GDTR_BASE, host_gdtr_base),
- FIELD(HOST_IDTR_BASE, host_idtr_base),
- FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
- FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
- FIELD(HOST_RSP, host_rsp),
- FIELD(HOST_RIP, host_rip),
-};
-static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);
-
-static inline short vmcs_field_to_offset(unsigned long field)
-{
- if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
- return -1;
- return vmcs_field_to_offset_table[field];
-}
-
-static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
-{
- return to_vmx(vcpu)->nested.current_vmcs12;
-}
-
-static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
-{
- struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
- if (is_error_page(page)) {
- kvm_release_page_clean(page);
- return NULL;
- }
- return page;
-}
-
-static void nested_release_page(struct page *page)
-{
- kvm_release_page_dirty(page);
-}
-
-static void nested_release_page_clean(struct page *page)
-{
- kvm_release_page_clean(page);
-}
-
-static u64 construct_eptp(unsigned long root_hpa);
-static void kvm_cpu_vmxon(u64 addr);
-static void kvm_cpu_vmxoff(void);
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
-static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
-
-static DEFINE_PER_CPU(struct vmcs *, vmxarea);
-static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
-/*
- * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
- * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
- */
-static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
-static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
-
-static unsigned long *vmx_io_bitmap_a;
-static unsigned long *vmx_io_bitmap_b;
-static unsigned long *vmx_msr_bitmap_legacy;
-static unsigned long *vmx_msr_bitmap_longmode;
-
-static bool cpu_has_load_ia32_efer;
-static bool cpu_has_load_perf_global_ctrl;
-
-static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
-static DEFINE_SPINLOCK(vmx_vpid_lock);
-
-static struct vmcs_config {
- int size;
- int order;
- u32 revision_id;
- u32 pin_based_exec_ctrl;
- u32 cpu_based_exec_ctrl;
- u32 cpu_based_2nd_exec_ctrl;
- u32 vmexit_ctrl;
- u32 vmentry_ctrl;
-} vmcs_config;
-
-static struct vmx_capability {
- u32 ept;
- u32 vpid;
-} vmx_capability;
-
-#define VMX_SEGMENT_FIELD(seg) \
- [VCPU_SREG_##seg] = { \
- .selector = GUEST_##seg##_SELECTOR, \
- .base = GUEST_##seg##_BASE, \
- .limit = GUEST_##seg##_LIMIT, \
- .ar_bytes = GUEST_##seg##_AR_BYTES, \
- }
-
-static struct kvm_vmx_segment_field {
- unsigned selector;
- unsigned base;
- unsigned limit;
- unsigned ar_bytes;
-} kvm_vmx_segment_fields[] = {
- VMX_SEGMENT_FIELD(CS),
- VMX_SEGMENT_FIELD(DS),
- VMX_SEGMENT_FIELD(ES),
- VMX_SEGMENT_FIELD(FS),
- VMX_SEGMENT_FIELD(GS),
- VMX_SEGMENT_FIELD(SS),
- VMX_SEGMENT_FIELD(TR),
- VMX_SEGMENT_FIELD(LDTR),
-};
-
-static u64 host_efer;
-
-static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
-
-/*
- * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
- * away by decrementing the array size.
- */
-static const u32 vmx_msr_index[] = {
-#ifdef CONFIG_X86_64
- MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
-#endif
- MSR_EFER, MSR_TSC_AUX, MSR_STAR,
-};
-#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
-
-static inline bool is_page_fault(u32 intr_info)
-{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
-}
-
-static inline bool is_no_device(u32 intr_info)
-{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
-}
-
-static inline bool is_invalid_opcode(u32 intr_info)
-{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
-}
-
-static inline bool is_external_interrupt(u32 intr_info)
-{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
- == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
-}
-
-static inline bool is_machine_check(u32 intr_info)
-{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
-}
-
-static inline bool cpu_has_vmx_msr_bitmap(void)
-{
- return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
-}
-
-static inline bool cpu_has_vmx_tpr_shadow(void)
-{
- return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
-}
-
-static inline bool vm_need_tpr_shadow(struct kvm *kvm)
-{
- return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
-}
-
-static inline bool cpu_has_secondary_exec_ctrls(void)
-{
- return vmcs_config.cpu_based_exec_ctrl &
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
-}
-
-static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
-{
- return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-}
-
-static inline bool cpu_has_vmx_flexpriority(void)
-{
- return cpu_has_vmx_tpr_shadow() &&
- cpu_has_vmx_virtualize_apic_accesses();
-}
-
-static inline bool cpu_has_vmx_ept_execute_only(void)
-{
- return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
-}
-
-static inline bool cpu_has_vmx_eptp_uncacheable(void)
-{
- return vmx_capability.ept & VMX_EPTP_UC_BIT;
-}
-
-static inline bool cpu_has_vmx_eptp_writeback(void)
-{
- return vmx_capability.ept & VMX_EPTP_WB_BIT;
-}
-
-static inline bool cpu_has_vmx_ept_2m_page(void)
-{
- return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
-}
-
-static inline bool cpu_has_vmx_ept_1g_page(void)
-{
- return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
-}
-
-static inline bool cpu_has_vmx_ept_4levels(void)
-{
- return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
-}
-
-static inline bool cpu_has_vmx_invept_individual_addr(void)
-{
- return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
-}
-
-static inline bool cpu_has_vmx_invept_context(void)
-{
- return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
-}
-
-static inline bool cpu_has_vmx_invept_global(void)
-{
- return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
-}
-
-static inline bool cpu_has_vmx_invvpid_single(void)
-{
- return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
-}
-
-static inline bool cpu_has_vmx_invvpid_global(void)
-{
- return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
-}
-
-static inline bool cpu_has_vmx_ept(void)
-{
- return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_ENABLE_EPT;
-}
-
-static inline bool cpu_has_vmx_unrestricted_guest(void)
-{
- return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_UNRESTRICTED_GUEST;
-}
-
-static inline bool cpu_has_vmx_ple(void)
-{
- return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_PAUSE_LOOP_EXITING;
-}
-
-static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
-{
- return flexpriority_enabled && irqchip_in_kernel(kvm);
-}
-
-static inline bool cpu_has_vmx_vpid(void)
-{
- return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_ENABLE_VPID;
-}
-
-static inline bool cpu_has_vmx_rdtscp(void)
-{
- return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_RDTSCP;
-}
-
-static inline bool cpu_has_virtual_nmis(void)
-{
- return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
-}
-
-static inline bool cpu_has_vmx_wbinvd_exit(void)
-{
- return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_WBINVD_EXITING;
-}
-
-static inline bool report_flexpriority(void)
-{
- return flexpriority_enabled;
-}
-
-static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
-{
- return vmcs12->cpu_based_vm_exec_control & bit;
-}
-
-static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
-{
- return (vmcs12->cpu_based_vm_exec_control &
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
- (vmcs12->secondary_vm_exec_control & bit);
-}
-
-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
- struct kvm_vcpu *vcpu)
-{
- return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
-}
-
-static inline bool is_exception(u32 intr_info)
-{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
- == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
-}
-
-static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
-static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12,
- u32 reason, unsigned long qualification);
-
-static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
-{
- int i;
-
- for (i = 0; i < vmx->nmsrs; ++i)
- if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
- return i;
- return -1;
-}
-
-static inline void __invvpid(int ext, u16 vpid, gva_t gva)
-{
- struct {
- u64 vpid : 16;
- u64 rsvd : 48;
- u64 gva;
- } operand = { vpid, 0, gva };
-
- asm volatile (__ex(ASM_VMX_INVVPID)
- /* CF==1 or ZF==1 --> rc = -1 */
- "; ja 1f ; ud2 ; 1:"
- : : "a"(&operand), "c"(ext) : "cc", "memory");
-}
-
-static inline void __invept(int ext, u64 eptp, gpa_t gpa)
-{
- struct {
- u64 eptp, gpa;
- } operand = {eptp, gpa};
-
- asm volatile (__ex(ASM_VMX_INVEPT)
- /* CF==1 or ZF==1 --> rc = -1 */
- "; ja 1f ; ud2 ; 1:\n"
- : : "a" (&operand), "c" (ext) : "cc", "memory");
-}
-
-static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
-{
- int i;
-
- i = __find_msr_index(vmx, msr);
- if (i >= 0)
- return &vmx->guest_msrs[i];
- return NULL;
-}
-
-static void vmcs_clear(struct vmcs *vmcs)
-{
- u64 phys_addr = __pa(vmcs);
- u8 error;
-
- asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
- : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
- : "cc", "memory");
- if (error)
- printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
- vmcs, phys_addr);
-}
-
-static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
-{
- vmcs_clear(loaded_vmcs->vmcs);
- loaded_vmcs->cpu = -1;
- loaded_vmcs->launched = 0;
-}
-
-static void vmcs_load(struct vmcs *vmcs)
-{
- u64 phys_addr = __pa(vmcs);
- u8 error;
-
- asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
- : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
- : "cc", "memory");
- if (error)
- printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
- vmcs, phys_addr);
-}
-
-static void __loaded_vmcs_clear(void *arg)
-{
- struct loaded_vmcs *loaded_vmcs = arg;
- int cpu = raw_smp_processor_id();
-
- if (loaded_vmcs->cpu != cpu)
- return; /* vcpu migration can race with cpu offline */
- if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
- per_cpu(current_vmcs, cpu) = NULL;
- list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
- loaded_vmcs_init(loaded_vmcs);
-}
-
-static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
-{
- if (loaded_vmcs->cpu != -1)
- smp_call_function_single(
- loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
-}
-
-static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
-{
- if (vmx->vpid == 0)
- return;
-
- if (cpu_has_vmx_invvpid_single())
- __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
-}
-
-static inline void vpid_sync_vcpu_global(void)
-{
- if (cpu_has_vmx_invvpid_global())
- __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
-}
-
-static inline void vpid_sync_context(struct vcpu_vmx *vmx)
-{
- if (cpu_has_vmx_invvpid_single())
- vpid_sync_vcpu_single(vmx);
- else
- vpid_sync_vcpu_global();
-}
-
-static inline void ept_sync_global(void)
-{
- if (cpu_has_vmx_invept_global())
- __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
-}
-
-static inline void ept_sync_context(u64 eptp)
-{
- if (enable_ept) {
- if (cpu_has_vmx_invept_context())
- __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
- else
- ept_sync_global();
- }
-}
-
-static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
-{
- if (enable_ept) {
- if (cpu_has_vmx_invept_individual_addr())
- __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
- eptp, gpa);
- else
- ept_sync_context(eptp);
- }
-}
-
-static __always_inline unsigned long vmcs_readl(unsigned long field)
-{
- unsigned long value;
-
- asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
- : "=a"(value) : "d"(field) : "cc");
- return value;
-}
-
-static __always_inline u16 vmcs_read16(unsigned long field)
-{
- return vmcs_readl(field);
-}
-
-static __always_inline u32 vmcs_read32(unsigned long field)
-{
- return vmcs_readl(field);
-}
-
-static __always_inline u64 vmcs_read64(unsigned long field)
-{
-#ifdef CONFIG_X86_64
- return vmcs_readl(field);
-#else
- return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
-#endif
-}
-
-static noinline void vmwrite_error(unsigned long field, unsigned long value)
-{
- printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
- field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
- dump_stack();
-}
-
-static void vmcs_writel(unsigned long field, unsigned long value)
-{
- u8 error;
-
- asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
- : "=q"(error) : "a"(value), "d"(field) : "cc");
- if (unlikely(error))
- vmwrite_error(field, value);
-}
-
-static void vmcs_write16(unsigned long field, u16 value)
-{
- vmcs_writel(field, value);
-}
-
-static void vmcs_write32(unsigned long field, u32 value)
-{
- vmcs_writel(field, value);
-}
-
-static void vmcs_write64(unsigned long field, u64 value)
-{
- vmcs_writel(field, value);
-#ifndef CONFIG_X86_64
- asm volatile ("");
- vmcs_writel(field+1, value >> 32);
-#endif
-}
-
-static void vmcs_clear_bits(unsigned long field, u32 mask)
-{
- vmcs_writel(field, vmcs_readl(field) & ~mask);
-}
-
-static void vmcs_set_bits(unsigned long field, u32 mask)
-{
- vmcs_writel(field, vmcs_readl(field) | mask);
-}
-
-static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
-{
- vmx->segment_cache.bitmask = 0;
-}
-
-static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
- unsigned field)
-{
- bool ret;
- u32 mask = 1 << (seg * SEG_FIELD_NR + field);
-
- if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
- vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
- vmx->segment_cache.bitmask = 0;
- }
- ret = vmx->segment_cache.bitmask & mask;
- vmx->segment_cache.bitmask |= mask;
- return ret;
-}
-
-static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
-{
- u16 *p = &vmx->segment_cache.seg[seg].selector;
-
- if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
- *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
- return *p;
-}
-
-static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
-{
- ulong *p = &vmx->segment_cache.seg[seg].base;
-
- if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
- *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
- return *p;
-}
-
-static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
-{
- u32 *p = &vmx->segment_cache.seg[seg].limit;
-
- if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
- *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
- return *p;
-}
-
-static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
-{
- u32 *p = &vmx->segment_cache.seg[seg].ar;
-
- if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
- *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
- return *p;
-}
-
-static void update_exception_bitmap(struct kvm_vcpu *vcpu)
-{
- u32 eb;
-
- eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
- (1u << NM_VECTOR) | (1u << DB_VECTOR);
- if ((vcpu->guest_debug &
- (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
- (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
- eb |= 1u << BP_VECTOR;
- if (to_vmx(vcpu)->rmode.vm86_active)
- eb = ~0;
- if (enable_ept)
- eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
- if (vcpu->fpu_active)
- eb &= ~(1u << NM_VECTOR);
-
- /* When we are running a nested L2 guest and L1 specified for it a
- * certain exception bitmap, we must trap the same exceptions and pass
- * them to L1. When running L2, we will only handle the exceptions
- * specified above if L1 did not want them.
- */
- if (is_guest_mode(vcpu))
- eb |= get_vmcs12(vcpu)->exception_bitmap;
-
- vmcs_write32(EXCEPTION_BITMAP, eb);
-}
-
-static void clear_atomic_switch_msr_special(unsigned long entry,
- unsigned long exit)
-{
- vmcs_clear_bits(VM_ENTRY_CONTROLS, entry);
- vmcs_clear_bits(VM_EXIT_CONTROLS, exit);
-}
-
-static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
-{
- unsigned i;
- struct msr_autoload *m = &vmx->msr_autoload;
-
- switch (msr) {
- case MSR_EFER:
- if (cpu_has_load_ia32_efer) {
- clear_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER,
- VM_EXIT_LOAD_IA32_EFER);
- return;
- }
- break;
- case MSR_CORE_PERF_GLOBAL_CTRL:
- if (cpu_has_load_perf_global_ctrl) {
- clear_atomic_switch_msr_special(
- VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
- VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
- return;
- }
- break;
- }
-
- for (i = 0; i < m->nr; ++i)
- if (m->guest[i].index == msr)
- break;
-
- if (i == m->nr)
- return;
- --m->nr;
- m->guest[i] = m->guest[m->nr];
- m->host[i] = m->host[m->nr];
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
-}
-
-static void add_atomic_switch_msr_special(unsigned long entry,
- unsigned long exit, unsigned long guest_val_vmcs,
- unsigned long host_val_vmcs, u64 guest_val, u64 host_val)
-{
- vmcs_write64(guest_val_vmcs, guest_val);
- vmcs_write64(host_val_vmcs, host_val);
- vmcs_set_bits(VM_ENTRY_CONTROLS, entry);
- vmcs_set_bits(VM_EXIT_CONTROLS, exit);
-}
-
-static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
- u64 guest_val, u64 host_val)
-{
- unsigned i;
- struct msr_autoload *m = &vmx->msr_autoload;
-
- switch (msr) {
- case MSR_EFER:
- if (cpu_has_load_ia32_efer) {
- add_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER,
- VM_EXIT_LOAD_IA32_EFER,
- GUEST_IA32_EFER,
- HOST_IA32_EFER,
- guest_val, host_val);
- return;
- }
- break;
- case MSR_CORE_PERF_GLOBAL_CTRL:
- if (cpu_has_load_perf_global_ctrl) {
- add_atomic_switch_msr_special(
- VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
- VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
- GUEST_IA32_PERF_GLOBAL_CTRL,
- HOST_IA32_PERF_GLOBAL_CTRL,
- guest_val, host_val);
- return;
- }
- break;
- }
-
- for (i = 0; i < m->nr; ++i)
- if (m->guest[i].index == msr)
- break;
-
- if (i == NR_AUTOLOAD_MSRS) {
- printk_once(KERN_WARNING"Not enough mst switch entries. "
- "Can't add msr %x\n", msr);
- return;
- } else if (i == m->nr) {
- ++m->nr;
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
- }
-
- m->guest[i].index = msr;
- m->guest[i].value = guest_val;
- m->host[i].index = msr;
- m->host[i].value = host_val;
-}
-
-static void reload_tss(void)
-{
- /*
- * VT restores TR but not its size. Useless.
- */
- struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
- struct desc_struct *descs;
-
- descs = (void *)gdt->address;
- descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
- load_TR_desc();
-}
-
-static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
-{
- u64 guest_efer;
- u64 ignore_bits;
-
- guest_efer = vmx->vcpu.arch.efer;
-
- /*
- * NX is emulated; LMA and LME handled by hardware; SCE meaninless
- * outside long mode
- */
- ignore_bits = EFER_NX | EFER_SCE;
-#ifdef CONFIG_X86_64
- ignore_bits |= EFER_LMA | EFER_LME;
- /* SCE is meaningful only in long mode on Intel */
- if (guest_efer & EFER_LMA)
- ignore_bits &= ~(u64)EFER_SCE;
-#endif
- guest_efer &= ~ignore_bits;
- guest_efer |= host_efer & ignore_bits;
- vmx->guest_msrs[efer_offset].data = guest_efer;
- vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
-
- clear_atomic_switch_msr(vmx, MSR_EFER);
- /* On ept, can't emulate nx, and must switch nx atomically */
- if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
- guest_efer = vmx->vcpu.arch.efer;
- if (!(guest_efer & EFER_LMA))
- guest_efer &= ~EFER_LME;
- add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
- return false;
- }
-
- return true;
-}
-
-static unsigned long segment_base(u16 selector)
-{
- struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
- struct desc_struct *d;
- unsigned long table_base;
- unsigned long v;
-
- if (!(selector & ~3))
- return 0;
-
- table_base = gdt->address;
-
- if (selector & 4) { /* from ldt */
- u16 ldt_selector = kvm_read_ldt();
-
- if (!(ldt_selector & ~3))
- return 0;
-
- table_base = segment_base(ldt_selector);
- }
- d = (struct desc_struct *)(table_base + (selector & ~7));
- v = get_desc_base(d);
-#ifdef CONFIG_X86_64
- if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
- v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
-#endif
- return v;
-}
-
-static inline unsigned long kvm_read_tr_base(void)
-{
- u16 tr;
- asm("str %0" : "=g"(tr));
- return segment_base(tr);
-}
-
-static void vmx_save_host_state(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- int i;
-
- if (vmx->host_state.loaded)
- return;
-
- vmx->host_state.loaded = 1;
- /*
- * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
- * allow segment selectors with cpl > 0 or ti == 1.
- */
- vmx->host_state.ldt_sel = kvm_read_ldt();
- vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
- savesegment(fs, vmx->host_state.fs_sel);
- if (!(vmx->host_state.fs_sel & 7)) {
- vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
- vmx->host_state.fs_reload_needed = 0;
- } else {
- vmcs_write16(HOST_FS_SELECTOR, 0);
- vmx->host_state.fs_reload_needed = 1;
- }
- savesegment(gs, vmx->host_state.gs_sel);
- if (!(vmx->host_state.gs_sel & 7))
- vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
- else {
- vmcs_write16(HOST_GS_SELECTOR, 0);
- vmx->host_state.gs_ldt_reload_needed = 1;
- }
-
-#ifdef CONFIG_X86_64
- vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
- vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
-#else
- vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
- vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
-#endif
-
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
- if (is_long_mode(&vmx->vcpu))
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
-#endif
- for (i = 0; i < vmx->save_nmsrs; ++i)
- kvm_set_shared_msr(vmx->guest_msrs[i].index,
- vmx->guest_msrs[i].data,
- vmx->guest_msrs[i].mask);
-}
-
-static void __vmx_load_host_state(struct vcpu_vmx *vmx)
-{
- if (!vmx->host_state.loaded)
- return;
-
- ++vmx->vcpu.stat.host_state_reload;
- vmx->host_state.loaded = 0;
-#ifdef CONFIG_X86_64
- if (is_long_mode(&vmx->vcpu))
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
-#endif
- if (vmx->host_state.gs_ldt_reload_needed) {
- kvm_load_ldt(vmx->host_state.ldt_sel);
-#ifdef CONFIG_X86_64
- load_gs_index(vmx->host_state.gs_sel);
-#else
- loadsegment(gs, vmx->host_state.gs_sel);
-#endif
- }
- if (vmx->host_state.fs_reload_needed)
- loadsegment(fs, vmx->host_state.fs_sel);
- reload_tss();
-#ifdef CONFIG_X86_64
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
-#endif
- if (user_has_fpu())
- clts();
- load_gdt(&__get_cpu_var(host_gdt));
-}
-
-static void vmx_load_host_state(struct vcpu_vmx *vmx)
-{
- preempt_disable();
- __vmx_load_host_state(vmx);
- preempt_enable();
-}
-
-/*
- * Switches to specified vcpu, until a matching vcpu_put(), but assumes
- * vcpu mutex is already taken.
- */
-static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
-
- if (!vmm_exclusive)
- kvm_cpu_vmxon(phys_addr);
- else if (vmx->loaded_vmcs->cpu != cpu)
- loaded_vmcs_clear(vmx->loaded_vmcs);
-
- if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
- per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
- vmcs_load(vmx->loaded_vmcs->vmcs);
- }
-
- if (vmx->loaded_vmcs->cpu != cpu) {
- struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
- unsigned long sysenter_esp;
-
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- local_irq_disable();
- list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
- &per_cpu(loaded_vmcss_on_cpu, cpu));
- local_irq_enable();
-
- /*
- * Linux uses per-cpu TSS and GDT, so set these when switching
- * processors.
- */
- vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
- vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */
-
- rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
- vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
- vmx->loaded_vmcs->cpu = cpu;
- }
-}
-
-static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
-{
- __vmx_load_host_state(to_vmx(vcpu));
- if (!vmm_exclusive) {
- __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
- vcpu->cpu = -1;
- kvm_cpu_vmxoff();
- }
-}
-
-static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
-{
- ulong cr0;
-
- if (vcpu->fpu_active)
- return;
- vcpu->fpu_active = 1;
- cr0 = vmcs_readl(GUEST_CR0);
- cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
- cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
- vmcs_writel(GUEST_CR0, cr0);
- update_exception_bitmap(vcpu);
- vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
- if (is_guest_mode(vcpu))
- vcpu->arch.cr0_guest_owned_bits &=
- ~get_vmcs12(vcpu)->cr0_guest_host_mask;
- vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
-}
-
-static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
-
-/*
- * Return the cr0 value that a nested guest would read. This is a combination
- * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
- * its hypervisor (cr0_read_shadow).
- */
-static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
-{
- return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
- (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
-}
-static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
-{
- return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
- (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
-}
-
-static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
-{
- /* Note that there is no vcpu->fpu_active = 0 here. The caller must
- * set this *before* calling this function.
- */
- vmx_decache_cr0_guest_bits(vcpu);
- vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
- update_exception_bitmap(vcpu);
- vcpu->arch.cr0_guest_owned_bits = 0;
- vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
- if (is_guest_mode(vcpu)) {
- /*
- * L1's specified read shadow might not contain the TS bit,
- * so now that we turned on shadowing of this bit, we need to
- * set this bit of the shadow. Like in nested_vmx_run we need
- * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
- * up-to-date here because we just decached cr0.TS (and we'll
- * only update vmcs12->guest_cr0 on nested exit).
- */
- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
- (vcpu->arch.cr0 & X86_CR0_TS);
- vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
- } else
- vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
-}
-
-static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
-{
- unsigned long rflags, save_rflags;
-
- if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
- __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
- rflags = vmcs_readl(GUEST_RFLAGS);
- if (to_vmx(vcpu)->rmode.vm86_active) {
- rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
- save_rflags = to_vmx(vcpu)->rmode.save_rflags;
- rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
- }
- to_vmx(vcpu)->rflags = rflags;
- }
- return to_vmx(vcpu)->rflags;
-}
-
-static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
- __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
- __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
- to_vmx(vcpu)->rflags = rflags;
- if (to_vmx(vcpu)->rmode.vm86_active) {
- to_vmx(vcpu)->rmode.save_rflags = rflags;
- rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
- }
- vmcs_writel(GUEST_RFLAGS, rflags);
-}
-
-static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
-{
- u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
- int ret = 0;
-
- if (interruptibility & GUEST_INTR_STATE_STI)
- ret |= KVM_X86_SHADOW_INT_STI;
- if (interruptibility & GUEST_INTR_STATE_MOV_SS)
- ret |= KVM_X86_SHADOW_INT_MOV_SS;
-
- return ret & mask;
-}
-
-static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
-{
- u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
- u32 interruptibility = interruptibility_old;
-
- interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
-
- if (mask & KVM_X86_SHADOW_INT_MOV_SS)
- interruptibility |= GUEST_INTR_STATE_MOV_SS;
- else if (mask & KVM_X86_SHADOW_INT_STI)
- interruptibility |= GUEST_INTR_STATE_STI;
-
- if ((interruptibility != interruptibility_old))
- vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
-}
-
-static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
-{
- unsigned long rip;
-
- rip = kvm_rip_read(vcpu);
- rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
- kvm_rip_write(vcpu, rip);
-
- /* skipping an emulated instruction also counts */
- vmx_set_interrupt_shadow(vcpu, 0);
-}
-
-/*
- * KVM wants to inject page-faults which it got to the guest. This function
- * checks whether in a nested guest, we need to inject them to L1 or L2.
- * This function assumes it is called with the exit reason in vmcs02 being
- * a #PF exception (this is the only case in which KVM injects a #PF when L2
- * is running).
- */
-static int nested_pf_handled(struct kvm_vcpu *vcpu)
-{
- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
- /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
- if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
- return 0;
-
- nested_vmx_vmexit(vcpu);
- return 1;
-}
-
-static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
- bool has_error_code, u32 error_code,
- bool reinject)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 intr_info = nr | INTR_INFO_VALID_MASK;
-
- if (nr == PF_VECTOR && is_guest_mode(vcpu) &&
- nested_pf_handled(vcpu))
- return;
-
- if (has_error_code) {
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
- intr_info |= INTR_INFO_DELIVER_CODE_MASK;
- }
-
- if (vmx->rmode.vm86_active) {
- int inc_eip = 0;
- if (kvm_exception_is_soft(nr))
- inc_eip = vcpu->arch.event_exit_inst_len;
- if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
- return;
- }
-
- if (kvm_exception_is_soft(nr)) {
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
- vmx->vcpu.arch.event_exit_inst_len);
- intr_info |= INTR_TYPE_SOFT_EXCEPTION;
- } else
- intr_info |= INTR_TYPE_HARD_EXCEPTION;
-
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
-}
-
-static bool vmx_rdtscp_supported(void)
-{
- return cpu_has_vmx_rdtscp();
-}
-
-/*
- * Swap MSR entry in host/guest MSR entry array.
- */
-static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
-{
- struct shared_msr_entry tmp;
-
- tmp = vmx->guest_msrs[to];
- vmx->guest_msrs[to] = vmx->guest_msrs[from];
- vmx->guest_msrs[from] = tmp;
-}
-
-/*
- * Set up the vmcs to automatically save and restore system
- * msrs. Don't touch the 64-bit msrs if the guest is in legacy
- * mode, as fiddling with msrs is very expensive.
- */
-static void setup_msrs(struct vcpu_vmx *vmx)
-{
- int save_nmsrs, index;
- unsigned long *msr_bitmap;
-
- save_nmsrs = 0;
-#ifdef CONFIG_X86_64
- if (is_long_mode(&vmx->vcpu)) {
- index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_LSTAR);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_CSTAR);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_TSC_AUX);
- if (index >= 0 && vmx->rdtscp_enabled)
- move_msr_up(vmx, index, save_nmsrs++);
- /*
- * MSR_STAR is only needed on long mode guests, and only
- * if efer.sce is enabled.
- */
- index = __find_msr_index(vmx, MSR_STAR);
- if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
- move_msr_up(vmx, index, save_nmsrs++);
- }
-#endif
- index = __find_msr_index(vmx, MSR_EFER);
- if (index >= 0 && update_transition_efer(vmx, index))
- move_msr_up(vmx, index, save_nmsrs++);
-
- vmx->save_nmsrs = save_nmsrs;
-
- if (cpu_has_vmx_msr_bitmap()) {
- if (is_long_mode(&vmx->vcpu))
- msr_bitmap = vmx_msr_bitmap_longmode;
- else
- msr_bitmap = vmx_msr_bitmap_legacy;
-
- vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
- }
-}
-
-/*
- * reads and returns guest's timestamp counter "register"
- * guest_tsc = host_tsc + tsc_offset -- 21.3
- */
-static u64 guest_read_tsc(void)
-{
- u64 host_tsc, tsc_offset;
-
- rdtscll(host_tsc);
- tsc_offset = vmcs_read64(TSC_OFFSET);
- return host_tsc + tsc_offset;
-}
-
-/*
- * Like guest_read_tsc, but always returns L1's notion of the timestamp
- * counter, even if a nested guest (L2) is currently running.
- */
-u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
-{
- u64 host_tsc, tsc_offset;
-
- rdtscll(host_tsc);
- tsc_offset = is_guest_mode(vcpu) ?
- to_vmx(vcpu)->nested.vmcs01_tsc_offset :
- vmcs_read64(TSC_OFFSET);
- return host_tsc + tsc_offset;
-}
-
-/*
- * Engage any workarounds for mis-matched TSC rates. Currently limited to
- * software catchup for faster rates on slower CPUs.
- */
-static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
-{
- if (!scale)
- return;
-
- if (user_tsc_khz > tsc_khz) {
- vcpu->arch.tsc_catchup = 1;
- vcpu->arch.tsc_always_catchup = 1;
- } else
- WARN(1, "user requested TSC rate below hardware speed\n");
-}
-
-/*
- * writes 'offset' into guest's timestamp counter offset register
- */
-static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
-{
- if (is_guest_mode(vcpu)) {
- /*
- * We're here if L1 chose not to trap WRMSR to TSC. According
- * to the spec, this should set L1's TSC; The offset that L1
- * set for L2 remains unchanged, and still needs to be added
- * to the newly set TSC to get L2's TSC.
- */
- struct vmcs12 *vmcs12;
- to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset;
- /* recalculate vmcs02.TSC_OFFSET: */
- vmcs12 = get_vmcs12(vcpu);
- vmcs_write64(TSC_OFFSET, offset +
- (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
- vmcs12->tsc_offset : 0));
- } else {
- vmcs_write64(TSC_OFFSET, offset);
- }
-}
-
-static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
-{
- u64 offset = vmcs_read64(TSC_OFFSET);
- vmcs_write64(TSC_OFFSET, offset + adjustment);
- if (is_guest_mode(vcpu)) {
- /* Even when running L2, the adjustment needs to apply to L1 */
- to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
- }
-}
-
-static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
-{
- return target_tsc - native_read_tsc();
-}
-
-static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
- return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
-}
-
-/*
- * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
- * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
- * all guests if the "nested" module option is off, and can also be disabled
- * for a single guest by disabling its VMX cpuid bit.
- */
-static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
-{
- return nested && guest_cpuid_has_vmx(vcpu);
-}
-
-/*
- * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
- * returned for the various VMX controls MSRs when nested VMX is enabled.
- * The same values should also be used to verify that vmcs12 control fields are
- * valid during nested entry from L1 to L2.
- * Each of these control msrs has a low and high 32-bit half: A low bit is on
- * if the corresponding bit in the (32-bit) control field *must* be on, and a
- * bit in the high half is on if the corresponding bit in the control field
- * may be on. See also vmx_control_verify().
- * TODO: allow these variables to be modified (downgraded) by module options
- * or other means.
- */
-static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
-static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
-static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
-static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
-static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
-static __init void nested_vmx_setup_ctls_msrs(void)
-{
- /*
- * Note that as a general rule, the high half of the MSRs (bits in
- * the control fields which may be 1) should be initialized by the
- * intersection of the underlying hardware's MSR (i.e., features which
- * can be supported) and the list of features we want to expose -
- * because they are known to be properly supported in our code.
- * Also, usually, the low half of the MSRs (bits which must be 1) can
- * be set to 0, meaning that L1 may turn off any of these bits. The
- * reason is that if one of these bits is necessary, it will appear
- * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
- * fields of vmcs01 and vmcs02, will turn these bits off - and
- * nested_vmx_exit_handled() will not pass related exits to L1.
- * These rules have exceptions below.
- */
-
- /* pin-based controls */
- /*
- * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
- * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
- */
- nested_vmx_pinbased_ctls_low = 0x16 ;
- nested_vmx_pinbased_ctls_high = 0x16 |
- PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
- PIN_BASED_VIRTUAL_NMIS;
-
- /* exit controls */
- nested_vmx_exit_ctls_low = 0;
- /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
-#ifdef CONFIG_X86_64
- nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
-#else
- nested_vmx_exit_ctls_high = 0;
-#endif
-
- /* entry controls */
- rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
- nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
- nested_vmx_entry_ctls_low = 0;
- nested_vmx_entry_ctls_high &=
- VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
-
- /* cpu-based controls */
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
- nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
- nested_vmx_procbased_ctls_low = 0;
- nested_vmx_procbased_ctls_high &=
- CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING |
- CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
- CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING |
-#ifdef CONFIG_X86_64
- CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
-#endif
- CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
- CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
- CPU_BASED_RDPMC_EXITING |
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
- /*
- * We can allow some features even when not supported by the
- * hardware. For example, L1 can specify an MSR bitmap - and we
- * can use it to avoid exits to L1 - even when L0 runs L2
- * without MSR bitmaps.
- */
- nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS;
-
- /* secondary cpu-based controls */
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
- nested_vmx_secondary_ctls_low = 0;
- nested_vmx_secondary_ctls_high &=
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-}
-
-static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
-{
- /*
- * Bits 0 in high must be 0, and bits 1 in low must be 1.
- */
- return ((control & high) | low) == control;
-}
-
-static inline u64 vmx_control_msr(u32 low, u32 high)
-{
- return low | ((u64)high << 32);
-}
-
-/*
- * If we allow our guest to use VMX instructions (i.e., nested VMX), we should
- * also let it use VMX-specific MSRs.
- * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a
- * VMX-specific MSR, or 0 when we haven't (and the caller should handle it
- * like all other MSRs).
- */
-static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
-{
- if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC &&
- msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) {
- /*
- * According to the spec, processors which do not support VMX
- * should throw a #GP(0) when VMX capability MSRs are read.
- */
- kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
- return 1;
- }
-
- switch (msr_index) {
- case MSR_IA32_FEATURE_CONTROL:
- *pdata = 0;
- break;
- case MSR_IA32_VMX_BASIC:
- /*
- * This MSR reports some information about VMX support. We
- * should return information about the VMX we emulate for the
- * guest, and the VMCS structure we give it - not about the
- * VMX support of the underlying hardware.
- */
- *pdata = VMCS12_REVISION |
- ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
- (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
- break;
- case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
- case MSR_IA32_VMX_PINBASED_CTLS:
- *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
- nested_vmx_pinbased_ctls_high);
- break;
- case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
- case MSR_IA32_VMX_PROCBASED_CTLS:
- *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
- nested_vmx_procbased_ctls_high);
- break;
- case MSR_IA32_VMX_TRUE_EXIT_CTLS:
- case MSR_IA32_VMX_EXIT_CTLS:
- *pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
- nested_vmx_exit_ctls_high);
- break;
- case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
- case MSR_IA32_VMX_ENTRY_CTLS:
- *pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
- nested_vmx_entry_ctls_high);
- break;
- case MSR_IA32_VMX_MISC:
- *pdata = 0;
- break;
- /*
- * These MSRs specify bits which the guest must keep fixed (on or off)
- * while L1 is in VMXON mode (in L1's root mode, or running an L2).
- * We picked the standard core2 setting.
- */
-#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
-#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
- case MSR_IA32_VMX_CR0_FIXED0:
- *pdata = VMXON_CR0_ALWAYSON;
- break;
- case MSR_IA32_VMX_CR0_FIXED1:
- *pdata = -1ULL;
- break;
- case MSR_IA32_VMX_CR4_FIXED0:
- *pdata = VMXON_CR4_ALWAYSON;
- break;
- case MSR_IA32_VMX_CR4_FIXED1:
- *pdata = -1ULL;
- break;
- case MSR_IA32_VMX_VMCS_ENUM:
- *pdata = 0x1f;
- break;
- case MSR_IA32_VMX_PROCBASED_CTLS2:
- *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
- nested_vmx_secondary_ctls_high);
- break;
- case MSR_IA32_VMX_EPT_VPID_CAP:
- /* Currently, no nested ept or nested vpid */
- *pdata = 0;
- break;
- default:
- return 0;
- }
-
- return 1;
-}
-
-static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
-{
- if (!nested_vmx_allowed(vcpu))
- return 0;
-
- if (msr_index == MSR_IA32_FEATURE_CONTROL)
- /* TODO: the right thing. */
- return 1;
- /*
- * No need to treat VMX capability MSRs specially: If we don't handle
- * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
- */
- return 0;
-}
-
-/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
-{
- u64 data;
- struct shared_msr_entry *msr;
-
- if (!pdata) {
- printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
- return -EINVAL;
- }
-
- switch (msr_index) {
-#ifdef CONFIG_X86_64
- case MSR_FS_BASE:
- data = vmcs_readl(GUEST_FS_BASE);
- break;
- case MSR_GS_BASE:
- data = vmcs_readl(GUEST_GS_BASE);
- break;
- case MSR_KERNEL_GS_BASE:
- vmx_load_host_state(to_vmx(vcpu));
- data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
- break;
-#endif
- case MSR_EFER:
- return kvm_get_msr_common(vcpu, msr_index, pdata);
- case MSR_IA32_TSC:
- data = guest_read_tsc();
- break;
- case MSR_IA32_SYSENTER_CS:
- data = vmcs_read32(GUEST_SYSENTER_CS);
- break;
- case MSR_IA32_SYSENTER_EIP:
- data = vmcs_readl(GUEST_SYSENTER_EIP);
- break;
- case MSR_IA32_SYSENTER_ESP:
- data = vmcs_readl(GUEST_SYSENTER_ESP);
- break;
- case MSR_TSC_AUX:
- if (!to_vmx(vcpu)->rdtscp_enabled)
- return 1;
- /* Otherwise falls through */
- default:
- if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
- return 0;
- msr = find_msr_entry(to_vmx(vcpu), msr_index);
- if (msr) {
- data = msr->data;
- break;
- }
- return kvm_get_msr_common(vcpu, msr_index, pdata);
- }
-
- *pdata = data;
- return 0;
-}
-
-/*
- * Writes msr value into into the appropriate "register".
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct shared_msr_entry *msr;
- int ret = 0;
-
- switch (msr_index) {
- case MSR_EFER:
- ret = kvm_set_msr_common(vcpu, msr_index, data);
- break;
-#ifdef CONFIG_X86_64
- case MSR_FS_BASE:
- vmx_segment_cache_clear(vmx);
- vmcs_writel(GUEST_FS_BASE, data);
- break;
- case MSR_GS_BASE:
- vmx_segment_cache_clear(vmx);
- vmcs_writel(GUEST_GS_BASE, data);
- break;
- case MSR_KERNEL_GS_BASE:
- vmx_load_host_state(vmx);
- vmx->msr_guest_kernel_gs_base = data;
- break;
-#endif
- case MSR_IA32_SYSENTER_CS:
- vmcs_write32(GUEST_SYSENTER_CS, data);
- break;
- case MSR_IA32_SYSENTER_EIP:
- vmcs_writel(GUEST_SYSENTER_EIP, data);
- break;
- case MSR_IA32_SYSENTER_ESP:
- vmcs_writel(GUEST_SYSENTER_ESP, data);
- break;
- case MSR_IA32_TSC:
- kvm_write_tsc(vcpu, data);
- break;
- case MSR_IA32_CR_PAT:
- if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
- vmcs_write64(GUEST_IA32_PAT, data);
- vcpu->arch.pat = data;
- break;
- }
- ret = kvm_set_msr_common(vcpu, msr_index, data);
- break;
- case MSR_TSC_AUX:
- if (!vmx->rdtscp_enabled)
- return 1;
- /* Check reserved bit, higher 32 bits should be zero */
- if ((data >> 32) != 0)
- return 1;
- /* Otherwise falls through */
- default:
- if (vmx_set_vmx_msr(vcpu, msr_index, data))
- break;
- msr = find_msr_entry(vmx, msr_index);
- if (msr) {
- msr->data = data;
- if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
- preempt_disable();
- kvm_set_shared_msr(msr->index, msr->data,
- msr->mask);
- preempt_enable();
- }
- break;
- }
- ret = kvm_set_msr_common(vcpu, msr_index, data);
- }
-
- return ret;
-}
-
-static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
-{
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
- switch (reg) {
- case VCPU_REGS_RSP:
- vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
- break;
- case VCPU_REGS_RIP:
- vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
- break;
- case VCPU_EXREG_PDPTR:
- if (enable_ept)
- ept_save_pdptrs(vcpu);
- break;
- default:
- break;
- }
-}
-
-static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
-{
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
- else
- vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
-
- update_exception_bitmap(vcpu);
-}
-
-static __init int cpu_has_kvm_support(void)
-{
- return cpu_has_vmx();
-}
-
-static __init int vmx_disabled_by_bios(void)
-{
- u64 msr;
-
- rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
- if (msr & FEATURE_CONTROL_LOCKED) {
- /* launched w/ TXT and VMX disabled */
- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
- && tboot_enabled())
- return 1;
- /* launched w/o TXT and VMX only enabled w/ TXT */
- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
- && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
- && !tboot_enabled()) {
- printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
- "activate TXT before enabling KVM\n");
- return 1;
- }
- /* launched w/o TXT and VMX disabled */
- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
- && !tboot_enabled())
- return 1;
- }
-
- return 0;
-}
-
-static void kvm_cpu_vmxon(u64 addr)
-{
- asm volatile (ASM_VMX_VMXON_RAX
- : : "a"(&addr), "m"(addr)
- : "memory", "cc");
-}
-
-static int hardware_enable(void *garbage)
-{
- int cpu = raw_smp_processor_id();
- u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- u64 old, test_bits;
-
- if (read_cr4() & X86_CR4_VMXE)
- return -EBUSY;
-
- INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
- rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
-
- test_bits = FEATURE_CONTROL_LOCKED;
- test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
- if (tboot_enabled())
- test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
-
- if ((old & test_bits) != test_bits) {
- /* enable and lock */
- wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
- }
- write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
-
- if (vmm_exclusive) {
- kvm_cpu_vmxon(phys_addr);
- ept_sync_global();
- }
-
- store_gdt(&__get_cpu_var(host_gdt));
-
- return 0;
-}
-
-static void vmclear_local_loaded_vmcss(void)
-{
- int cpu = raw_smp_processor_id();
- struct loaded_vmcs *v, *n;
-
- list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
- loaded_vmcss_on_cpu_link)
- __loaded_vmcs_clear(v);
-}
-
-
-/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
- * tricks.
- */
-static void kvm_cpu_vmxoff(void)
-{
- asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
-}
-
-static void hardware_disable(void *garbage)
-{
- if (vmm_exclusive) {
- vmclear_local_loaded_vmcss();
- kvm_cpu_vmxoff();
- }
- write_cr4(read_cr4() & ~X86_CR4_VMXE);
-}
-
-static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
- u32 msr, u32 *result)
-{
- u32 vmx_msr_low, vmx_msr_high;
- u32 ctl = ctl_min | ctl_opt;
-
- rdmsr(msr, vmx_msr_low, vmx_msr_high);
-
- ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
- ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
-
- /* Ensure minimum (required) set of control bits are supported. */
- if (ctl_min & ~ctl)
- return -EIO;
-
- *result = ctl;
- return 0;
-}
-
-static __init bool allow_1_setting(u32 msr, u32 ctl)
-{
- u32 vmx_msr_low, vmx_msr_high;
-
- rdmsr(msr, vmx_msr_low, vmx_msr_high);
- return vmx_msr_high & ctl;
-}
-
-static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
-{
- u32 vmx_msr_low, vmx_msr_high;
- u32 min, opt, min2, opt2;
- u32 _pin_based_exec_control = 0;
- u32 _cpu_based_exec_control = 0;
- u32 _cpu_based_2nd_exec_control = 0;
- u32 _vmexit_control = 0;
- u32 _vmentry_control = 0;
-
- min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
- opt = PIN_BASED_VIRTUAL_NMIS;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
- &_pin_based_exec_control) < 0)
- return -EIO;
-
- min = CPU_BASED_HLT_EXITING |
-#ifdef CONFIG_X86_64
- CPU_BASED_CR8_LOAD_EXITING |
- CPU_BASED_CR8_STORE_EXITING |
-#endif
- CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_USE_IO_BITMAPS |
- CPU_BASED_MOV_DR_EXITING |
- CPU_BASED_USE_TSC_OFFSETING |
- CPU_BASED_MWAIT_EXITING |
- CPU_BASED_MONITOR_EXITING |
- CPU_BASED_INVLPG_EXITING |
- CPU_BASED_RDPMC_EXITING;
-
- opt = CPU_BASED_TPR_SHADOW |
- CPU_BASED_USE_MSR_BITMAPS |
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
- &_cpu_based_exec_control) < 0)
- return -EIO;
-#ifdef CONFIG_X86_64
- if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
- _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
- ~CPU_BASED_CR8_STORE_EXITING;
-#endif
- if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
- min2 = 0;
- opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
- SECONDARY_EXEC_WBINVD_EXITING |
- SECONDARY_EXEC_ENABLE_VPID |
- SECONDARY_EXEC_ENABLE_EPT |
- SECONDARY_EXEC_UNRESTRICTED_GUEST |
- SECONDARY_EXEC_PAUSE_LOOP_EXITING |
- SECONDARY_EXEC_RDTSCP;
- if (adjust_vmx_controls(min2, opt2,
- MSR_IA32_VMX_PROCBASED_CTLS2,
- &_cpu_based_2nd_exec_control) < 0)
- return -EIO;
- }
-#ifndef CONFIG_X86_64
- if (!(_cpu_based_2nd_exec_control &
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
- _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
-#endif
- if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
- /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
- enabled */
- _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_INVLPG_EXITING);
- rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
- vmx_capability.ept, vmx_capability.vpid);
- }
-
- min = 0;
-#ifdef CONFIG_X86_64
- min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
-#endif
- opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
- &_vmexit_control) < 0)
- return -EIO;
-
- min = 0;
- opt = VM_ENTRY_LOAD_IA32_PAT;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
- &_vmentry_control) < 0)
- return -EIO;
-
- rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
-
- /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
- if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
- return -EIO;
-
-#ifdef CONFIG_X86_64
- /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
- if (vmx_msr_high & (1u<<16))
- return -EIO;
-#endif
-
- /* Require Write-Back (WB) memory type for VMCS accesses. */
- if (((vmx_msr_high >> 18) & 15) != 6)
- return -EIO;
-
- vmcs_conf->size = vmx_msr_high & 0x1fff;
- vmcs_conf->order = get_order(vmcs_config.size);
- vmcs_conf->revision_id = vmx_msr_low;
-
- vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
- vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
- vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
- vmcs_conf->vmexit_ctrl = _vmexit_control;
- vmcs_conf->vmentry_ctrl = _vmentry_control;
-
- cpu_has_load_ia32_efer =
- allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
- VM_ENTRY_LOAD_IA32_EFER)
- && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
- VM_EXIT_LOAD_IA32_EFER);
-
- cpu_has_load_perf_global_ctrl =
- allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
- VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
- && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
- VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
-
- /*
- * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
- * but due to arrata below it can't be used. Workaround is to use
- * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
- *
- * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
- *
- * AAK155 (model 26)
- * AAP115 (model 30)
- * AAT100 (model 37)
- * BC86,AAY89,BD102 (model 44)
- * BA97 (model 46)
- *
- */
- if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
- switch (boot_cpu_data.x86_model) {
- case 26:
- case 30:
- case 37:
- case 44:
- case 46:
- cpu_has_load_perf_global_ctrl = false;
- printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
- "does not work properly. Using workaround\n");
- break;
- default:
- break;
- }
- }
-
- return 0;
-}
-
-static struct vmcs *alloc_vmcs_cpu(int cpu)
-{
- int node = cpu_to_node(cpu);
- struct page *pages;
- struct vmcs *vmcs;
-
- pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
- if (!pages)
- return NULL;
- vmcs = page_address(pages);
- memset(vmcs, 0, vmcs_config.size);
- vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
- return vmcs;
-}
-
-static struct vmcs *alloc_vmcs(void)
-{
- return alloc_vmcs_cpu(raw_smp_processor_id());
-}
-
-static void free_vmcs(struct vmcs *vmcs)
-{
- free_pages((unsigned long)vmcs, vmcs_config.order);
-}
-
-/*
- * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
- */
-static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
-{
- if (!loaded_vmcs->vmcs)
- return;
- loaded_vmcs_clear(loaded_vmcs);
- free_vmcs(loaded_vmcs->vmcs);
- loaded_vmcs->vmcs = NULL;
-}
-
-static void free_kvm_area(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- free_vmcs(per_cpu(vmxarea, cpu));
- per_cpu(vmxarea, cpu) = NULL;
- }
-}
-
-static __init int alloc_kvm_area(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct vmcs *vmcs;
-
- vmcs = alloc_vmcs_cpu(cpu);
- if (!vmcs) {
- free_kvm_area();
- return -ENOMEM;
- }
-
- per_cpu(vmxarea, cpu) = vmcs;
- }
- return 0;
-}
-
-static __init int hardware_setup(void)
-{
- if (setup_vmcs_config(&vmcs_config) < 0)
- return -EIO;
-
- if (boot_cpu_has(X86_FEATURE_NX))
- kvm_enable_efer_bits(EFER_NX);
-
- if (!cpu_has_vmx_vpid())
- enable_vpid = 0;
-
- if (!cpu_has_vmx_ept() ||
- !cpu_has_vmx_ept_4levels()) {
- enable_ept = 0;
- enable_unrestricted_guest = 0;
- }
-
- if (!cpu_has_vmx_unrestricted_guest())
- enable_unrestricted_guest = 0;
-
- if (!cpu_has_vmx_flexpriority())
- flexpriority_enabled = 0;
-
- if (!cpu_has_vmx_tpr_shadow())
- kvm_x86_ops->update_cr8_intercept = NULL;
-
- if (enable_ept && !cpu_has_vmx_ept_2m_page())
- kvm_disable_largepages();
-
- if (!cpu_has_vmx_ple())
- ple_gap = 0;
-
- if (nested)
- nested_vmx_setup_ctls_msrs();
-
- return alloc_kvm_area();
-}
-
-static __exit void hardware_unsetup(void)
-{
- free_kvm_area();
-}
-
-static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
-{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
- if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
- vmcs_write16(sf->selector, save->selector);
- vmcs_writel(sf->base, save->base);
- vmcs_write32(sf->limit, save->limit);
- vmcs_write32(sf->ar_bytes, save->ar);
- } else {
- u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
- << AR_DPL_SHIFT;
- vmcs_write32(sf->ar_bytes, 0x93 | dpl);
- }
-}
-
-static void enter_pmode(struct kvm_vcpu *vcpu)
-{
- unsigned long flags;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- vmx->emulation_required = 1;
- vmx->rmode.vm86_active = 0;
-
- vmx_segment_cache_clear(vmx);
-
- vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
- vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
- vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
- vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
-
- flags = vmcs_readl(GUEST_RFLAGS);
- flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
- flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
- vmcs_writel(GUEST_RFLAGS, flags);
-
- vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
- (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
-
- update_exception_bitmap(vcpu);
-
- if (emulate_invalid_guest_state)
- return;
-
- fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es);
- fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds);
- fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
- fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
-
- vmx_segment_cache_clear(vmx);
-
- vmcs_write16(GUEST_SS_SELECTOR, 0);
- vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
-
- vmcs_write16(GUEST_CS_SELECTOR,
- vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
- vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
-}
-
-static gva_t rmode_tss_base(struct kvm *kvm)
-{
- if (!kvm->arch.tss_addr) {
- struct kvm_memslots *slots;
- struct kvm_memory_slot *slot;
- gfn_t base_gfn;
-
- slots = kvm_memslots(kvm);
- slot = id_to_memslot(slots, 0);
- base_gfn = slot->base_gfn + slot->npages - 3;
-
- return base_gfn << PAGE_SHIFT;
- }
- return kvm->arch.tss_addr;
-}
-
-static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
-{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
- save->selector = vmcs_read16(sf->selector);
- save->base = vmcs_readl(sf->base);
- save->limit = vmcs_read32(sf->limit);
- save->ar = vmcs_read32(sf->ar_bytes);
- vmcs_write16(sf->selector, save->base >> 4);
- vmcs_write32(sf->base, save->base & 0xffff0);
- vmcs_write32(sf->limit, 0xffff);
- vmcs_write32(sf->ar_bytes, 0xf3);
- if (save->base & 0xf)
- printk_once(KERN_WARNING "kvm: segment base is not paragraph"
- " aligned when entering protected mode (seg=%d)",
- seg);
-}
-
-static void enter_rmode(struct kvm_vcpu *vcpu)
-{
- unsigned long flags;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (enable_unrestricted_guest)
- return;
-
- vmx->emulation_required = 1;
- vmx->rmode.vm86_active = 1;
-
- /*
- * Very old userspace does not call KVM_SET_TSS_ADDR before entering
- * vcpu. Call it here with phys address pointing 16M below 4G.
- */
- if (!vcpu->kvm->arch.tss_addr) {
- printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
- "called before entering vcpu\n");
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- vmx_set_tss_addr(vcpu->kvm, 0xfeffd000);
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- }
-
- vmx_segment_cache_clear(vmx);
-
- vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
- vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
- vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
-
- vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
- vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
-
- vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
- vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
-
- flags = vmcs_readl(GUEST_RFLAGS);
- vmx->rmode.save_rflags = flags;
-
- flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
-
- vmcs_writel(GUEST_RFLAGS, flags);
- vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
- update_exception_bitmap(vcpu);
-
- if (emulate_invalid_guest_state)
- goto continue_rmode;
-
- vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
- vmcs_write32(GUEST_SS_LIMIT, 0xffff);
- vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
-
- vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
- vmcs_write32(GUEST_CS_LIMIT, 0xffff);
- if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
- vmcs_writel(GUEST_CS_BASE, 0xf0000);
- vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
-
- fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
- fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
- fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
- fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
-
-continue_rmode:
- kvm_mmu_reset_context(vcpu);
-}
-
-static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
-
- if (!msr)
- return;
-
- /*
- * Force kernel_gs_base reloading before EFER changes, as control
- * of this msr depends on is_long_mode().
- */
- vmx_load_host_state(to_vmx(vcpu));
- vcpu->arch.efer = efer;
- if (efer & EFER_LMA) {
- vmcs_write32(VM_ENTRY_CONTROLS,
- vmcs_read32(VM_ENTRY_CONTROLS) |
- VM_ENTRY_IA32E_MODE);
- msr->data = efer;
- } else {
- vmcs_write32(VM_ENTRY_CONTROLS,
- vmcs_read32(VM_ENTRY_CONTROLS) &
- ~VM_ENTRY_IA32E_MODE);
-
- msr->data = efer & ~EFER_LME;
- }
- setup_msrs(vmx);
-}
-
-#ifdef CONFIG_X86_64
-
-static void enter_lmode(struct kvm_vcpu *vcpu)
-{
- u32 guest_tr_ar;
-
- vmx_segment_cache_clear(to_vmx(vcpu));
-
- guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
- if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
- pr_debug_ratelimited("%s: tss fixup for long mode. \n",
- __func__);
- vmcs_write32(GUEST_TR_AR_BYTES,
- (guest_tr_ar & ~AR_TYPE_MASK)
- | AR_TYPE_BUSY_64_TSS);
- }
- vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
-}
-
-static void exit_lmode(struct kvm_vcpu *vcpu)
-{
- vmcs_write32(VM_ENTRY_CONTROLS,
- vmcs_read32(VM_ENTRY_CONTROLS)
- & ~VM_ENTRY_IA32E_MODE);
- vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
-}
-
-#endif
-
-static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
-{
- vpid_sync_context(to_vmx(vcpu));
- if (enable_ept) {
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
- ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
- }
-}
-
-static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
-{
- ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
-
- vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
- vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
-}
-
-static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
-{
- if (enable_ept && is_paging(vcpu))
- vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
-}
-
-static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
-{
- ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
-
- vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
- vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
-}
-
-static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
-{
- if (!test_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_dirty))
- return;
-
- if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
- vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]);
- vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]);
- vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]);
- vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]);
- }
-}
-
-static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
-{
- if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
- vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
- vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
- vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
- vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
- }
-
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_avail);
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_dirty);
-}
-
-static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-
-static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
- unsigned long cr0,
- struct kvm_vcpu *vcpu)
-{
- if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
- vmx_decache_cr3(vcpu);
- if (!(cr0 & X86_CR0_PG)) {
- /* From paging/starting to nonpaging */
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
- vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
- (CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING));
- vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
- } else if (!is_paging(vcpu)) {
- /* From nonpaging to paging */
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
- vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
- ~(CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING));
- vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
- }
-
- if (!(cr0 & X86_CR0_WP))
- *hw_cr0 &= ~X86_CR0_WP;
-}
-
-static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long hw_cr0;
-
- if (enable_unrestricted_guest)
- hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST)
- | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
- else
- hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
-
- if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
- enter_pmode(vcpu);
-
- if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
- enter_rmode(vcpu);
-
-#ifdef CONFIG_X86_64
- if (vcpu->arch.efer & EFER_LME) {
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
- enter_lmode(vcpu);
- if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
- exit_lmode(vcpu);
- }
-#endif
-
- if (enable_ept)
- ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
-
- if (!vcpu->fpu_active)
- hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
-
- vmcs_writel(CR0_READ_SHADOW, cr0);
- vmcs_writel(GUEST_CR0, hw_cr0);
- vcpu->arch.cr0 = cr0;
- __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
-}
-
-static u64 construct_eptp(unsigned long root_hpa)
-{
- u64 eptp;
-
- /* TODO write the value reading from MSR */
- eptp = VMX_EPT_DEFAULT_MT |
- VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
- eptp |= (root_hpa & PAGE_MASK);
-
- return eptp;
-}
-
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
-{
- unsigned long guest_cr3;
- u64 eptp;
-
- guest_cr3 = cr3;
- if (enable_ept) {
- eptp = construct_eptp(cr3);
- vmcs_write64(EPT_POINTER, eptp);
- guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
- vcpu->kvm->arch.ept_identity_map_addr;
- ept_load_pdptrs(vcpu);
- }
-
- vmx_flush_tlb(vcpu);
- vmcs_writel(GUEST_CR3, guest_cr3);
-}
-
-static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
- unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
- KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
-
- if (cr4 & X86_CR4_VMXE) {
- /*
- * To use VMXON (and later other VMX instructions), a guest
- * must first be able to turn on cr4.VMXE (see handle_vmon()).
- * So basically the check on whether to allow nested VMX
- * is here.
- */
- if (!nested_vmx_allowed(vcpu))
- return 1;
- } else if (to_vmx(vcpu)->nested.vmxon)
- return 1;
-
- vcpu->arch.cr4 = cr4;
- if (enable_ept) {
- if (!is_paging(vcpu)) {
- hw_cr4 &= ~X86_CR4_PAE;
- hw_cr4 |= X86_CR4_PSE;
- } else if (!(cr4 & X86_CR4_PAE)) {
- hw_cr4 &= ~X86_CR4_PAE;
- }
- }
-
- vmcs_writel(CR4_READ_SHADOW, cr4);
- vmcs_writel(GUEST_CR4, hw_cr4);
- return 0;
-}
-
-static void vmx_get_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_save_segment *save;
- u32 ar;
-
- if (vmx->rmode.vm86_active
- && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
- || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
- || seg == VCPU_SREG_GS)
- && !emulate_invalid_guest_state) {
- switch (seg) {
- case VCPU_SREG_TR: save = &vmx->rmode.tr; break;
- case VCPU_SREG_ES: save = &vmx->rmode.es; break;
- case VCPU_SREG_DS: save = &vmx->rmode.ds; break;
- case VCPU_SREG_FS: save = &vmx->rmode.fs; break;
- case VCPU_SREG_GS: save = &vmx->rmode.gs; break;
- default: BUG();
- }
- var->selector = save->selector;
- var->base = save->base;
- var->limit = save->limit;
- ar = save->ar;
- if (seg == VCPU_SREG_TR
- || var->selector == vmx_read_guest_seg_selector(vmx, seg))
- goto use_saved_rmode_seg;
- }
- var->base = vmx_read_guest_seg_base(vmx, seg);
- var->limit = vmx_read_guest_seg_limit(vmx, seg);
- var->selector = vmx_read_guest_seg_selector(vmx, seg);
- ar = vmx_read_guest_seg_ar(vmx, seg);
-use_saved_rmode_seg:
- if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
- ar = 0;
- var->type = ar & 15;
- var->s = (ar >> 4) & 1;
- var->dpl = (ar >> 5) & 3;
- var->present = (ar >> 7) & 1;
- var->avl = (ar >> 12) & 1;
- var->l = (ar >> 13) & 1;
- var->db = (ar >> 14) & 1;
- var->g = (ar >> 15) & 1;
- var->unusable = (ar >> 16) & 1;
-}
-
-static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
- struct kvm_segment s;
-
- if (to_vmx(vcpu)->rmode.vm86_active) {
- vmx_get_segment(vcpu, &s, seg);
- return s.base;
- }
- return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
-}
-
-static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
-{
- if (!is_protmode(vcpu))
- return 0;
-
- if (!is_long_mode(vcpu)
- && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
- return 3;
-
- return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3;
-}
-
-static int vmx_get_cpl(struct kvm_vcpu *vcpu)
-{
- if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
- __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
- to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu);
- }
- return to_vmx(vcpu)->cpl;
-}
-
-
-static u32 vmx_segment_access_rights(struct kvm_segment *var)
-{
- u32 ar;
-
- if (var->unusable)
- ar = 1 << 16;
- else {
- ar = var->type & 15;
- ar |= (var->s & 1) << 4;
- ar |= (var->dpl & 3) << 5;
- ar |= (var->present & 1) << 7;
- ar |= (var->avl & 1) << 12;
- ar |= (var->l & 1) << 13;
- ar |= (var->db & 1) << 14;
- ar |= (var->g & 1) << 15;
- }
- if (ar == 0) /* a 0 value means unusable */
- ar = AR_UNUSABLE_MASK;
-
- return ar;
-}
-
-static void vmx_set_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
- u32 ar;
-
- vmx_segment_cache_clear(vmx);
-
- if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
- vmcs_write16(sf->selector, var->selector);
- vmx->rmode.tr.selector = var->selector;
- vmx->rmode.tr.base = var->base;
- vmx->rmode.tr.limit = var->limit;
- vmx->rmode.tr.ar = vmx_segment_access_rights(var);
- return;
- }
- vmcs_writel(sf->base, var->base);
- vmcs_write32(sf->limit, var->limit);
- vmcs_write16(sf->selector, var->selector);
- if (vmx->rmode.vm86_active && var->s) {
- /*
- * Hack real-mode segments into vm86 compatibility.
- */
- if (var->base == 0xffff0000 && var->selector == 0xf000)
- vmcs_writel(sf->base, 0xf0000);
- ar = 0xf3;
- } else
- ar = vmx_segment_access_rights(var);
-
- /*
- * Fix the "Accessed" bit in AR field of segment registers for older
- * qemu binaries.
- * IA32 arch specifies that at the time of processor reset the
- * "Accessed" bit in the AR field of segment registers is 1. And qemu
- * is setting it to 0 in the usedland code. This causes invalid guest
- * state vmexit when "unrestricted guest" mode is turned on.
- * Fix for this setup issue in cpu_reset is being pushed in the qemu
- * tree. Newer qemu binaries with that qemu fix would not need this
- * kvm hack.
- */
- if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
- ar |= 0x1; /* Accessed */
-
- vmcs_write32(sf->ar_bytes, ar);
- __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
-}
-
-static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
-{
- u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
-
- *db = (ar >> 14) & 1;
- *l = (ar >> 13) & 1;
-}
-
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
-{
- dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
- dt->address = vmcs_readl(GUEST_IDTR_BASE);
-}
-
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
-{
- vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
- vmcs_writel(GUEST_IDTR_BASE, dt->address);
-}
-
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
-{
- dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
- dt->address = vmcs_readl(GUEST_GDTR_BASE);
-}
-
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
-{
- vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
- vmcs_writel(GUEST_GDTR_BASE, dt->address);
-}
-
-static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
-{
- struct kvm_segment var;
- u32 ar;
-
- vmx_get_segment(vcpu, &var, seg);
- ar = vmx_segment_access_rights(&var);
-
- if (var.base != (var.selector << 4))
- return false;
- if (var.limit != 0xffff)
- return false;
- if (ar != 0xf3)
- return false;
-
- return true;
-}
-
-static bool code_segment_valid(struct kvm_vcpu *vcpu)
-{
- struct kvm_segment cs;
- unsigned int cs_rpl;
-
- vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
- cs_rpl = cs.selector & SELECTOR_RPL_MASK;
-
- if (cs.unusable)
- return false;
- if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
- return false;
- if (!cs.s)
- return false;
- if (cs.type & AR_TYPE_WRITEABLE_MASK) {
- if (cs.dpl > cs_rpl)
- return false;
- } else {
- if (cs.dpl != cs_rpl)
- return false;
- }
- if (!cs.present)
- return false;
-
- /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
- return true;
-}
-
-static bool stack_segment_valid(struct kvm_vcpu *vcpu)
-{
- struct kvm_segment ss;
- unsigned int ss_rpl;
-
- vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
- ss_rpl = ss.selector & SELECTOR_RPL_MASK;
-
- if (ss.unusable)
- return true;
- if (ss.type != 3 && ss.type != 7)
- return false;
- if (!ss.s)
- return false;
- if (ss.dpl != ss_rpl) /* DPL != RPL */
- return false;
- if (!ss.present)
- return false;
-
- return true;
-}
-
-static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
-{
- struct kvm_segment var;
- unsigned int rpl;
-
- vmx_get_segment(vcpu, &var, seg);
- rpl = var.selector & SELECTOR_RPL_MASK;
-
- if (var.unusable)
- return true;
- if (!var.s)
- return false;
- if (!var.present)
- return false;
- if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
- if (var.dpl < rpl) /* DPL < RPL */
- return false;
- }
-
- /* TODO: Add other members to kvm_segment_field to allow checking for other access
- * rights flags
- */
- return true;
-}
-
-static bool tr_valid(struct kvm_vcpu *vcpu)
-{
- struct kvm_segment tr;
-
- vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
-
- if (tr.unusable)
- return false;
- if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */
- return false;
- if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
- return false;
- if (!tr.present)
- return false;
-
- return true;
-}
-
-static bool ldtr_valid(struct kvm_vcpu *vcpu)
-{
- struct kvm_segment ldtr;
-
- vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
-
- if (ldtr.unusable)
- return true;
- if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */
- return false;
- if (ldtr.type != 2)
- return false;
- if (!ldtr.present)
- return false;
-
- return true;
-}
-
-static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
-{
- struct kvm_segment cs, ss;
-
- vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
- vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
-
- return ((cs.selector & SELECTOR_RPL_MASK) ==
- (ss.selector & SELECTOR_RPL_MASK));
-}
-
-/*
- * Check if guest state is valid. Returns true if valid, false if
- * not.
- * We assume that registers are always usable
- */
-static bool guest_state_valid(struct kvm_vcpu *vcpu)
-{
- /* real mode guest state checks */
- if (!is_protmode(vcpu)) {
- if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
- return false;
- if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
- return false;
- if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
- return false;
- if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
- return false;
- if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
- return false;
- if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
- return false;
- } else {
- /* protected mode guest state checks */
- if (!cs_ss_rpl_check(vcpu))
- return false;
- if (!code_segment_valid(vcpu))
- return false;
- if (!stack_segment_valid(vcpu))
- return false;
- if (!data_segment_valid(vcpu, VCPU_SREG_DS))
- return false;
- if (!data_segment_valid(vcpu, VCPU_SREG_ES))
- return false;
- if (!data_segment_valid(vcpu, VCPU_SREG_FS))
- return false;
- if (!data_segment_valid(vcpu, VCPU_SREG_GS))
- return false;
- if (!tr_valid(vcpu))
- return false;
- if (!ldtr_valid(vcpu))
- return false;
- }
- /* TODO:
- * - Add checks on RIP
- * - Add checks on RFLAGS
- */
-
- return true;
-}
-
-static int init_rmode_tss(struct kvm *kvm)
-{
- gfn_t fn;
- u16 data = 0;
- int r, idx, ret = 0;
-
- idx = srcu_read_lock(&kvm->srcu);
- fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
- r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
- if (r < 0)
- goto out;
- data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
- r = kvm_write_guest_page(kvm, fn++, &data,
- TSS_IOPB_BASE_OFFSET, sizeof(u16));
- if (r < 0)
- goto out;
- r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
- if (r < 0)
- goto out;
- r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
- if (r < 0)
- goto out;
- data = ~0;
- r = kvm_write_guest_page(kvm, fn, &data,
- RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
- sizeof(u8));
- if (r < 0)
- goto out;
-
- ret = 1;
-out:
- srcu_read_unlock(&kvm->srcu, idx);
- return ret;
-}
-
-static int init_rmode_identity_map(struct kvm *kvm)
-{
- int i, idx, r, ret;
- pfn_t identity_map_pfn;
- u32 tmp;
-
- if (!enable_ept)
- return 1;
- if (unlikely(!kvm->arch.ept_identity_pagetable)) {
- printk(KERN_ERR "EPT: identity-mapping pagetable "
- "haven't been allocated!\n");
- return 0;
- }
- if (likely(kvm->arch.ept_identity_pagetable_done))
- return 1;
- ret = 0;
- identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
- idx = srcu_read_lock(&kvm->srcu);
- r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
- if (r < 0)
- goto out;
- /* Set up identity-mapping pagetable for EPT in real mode */
- for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
- tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
- _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
- r = kvm_write_guest_page(kvm, identity_map_pfn,
- &tmp, i * sizeof(tmp), sizeof(tmp));
- if (r < 0)
- goto out;
- }
- kvm->arch.ept_identity_pagetable_done = true;
- ret = 1;
-out:
- srcu_read_unlock(&kvm->srcu, idx);
- return ret;
-}
-
-static void seg_setup(int seg)
-{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
- unsigned int ar;
-
- vmcs_write16(sf->selector, 0);
- vmcs_writel(sf->base, 0);
- vmcs_write32(sf->limit, 0xffff);
- if (enable_unrestricted_guest) {
- ar = 0x93;
- if (seg == VCPU_SREG_CS)
- ar |= 0x08; /* code segment */
- } else
- ar = 0xf3;
-
- vmcs_write32(sf->ar_bytes, ar);
-}
-
-static int alloc_apic_access_page(struct kvm *kvm)
-{
- struct kvm_userspace_memory_region kvm_userspace_mem;
- int r = 0;
-
- mutex_lock(&kvm->slots_lock);
- if (kvm->arch.apic_access_page)
- goto out;
- kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
- kvm_userspace_mem.flags = 0;
- kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
- kvm_userspace_mem.memory_size = PAGE_SIZE;
- r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
- if (r)
- goto out;
-
- kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
-out:
- mutex_unlock(&kvm->slots_lock);
- return r;
-}
-
-static int alloc_identity_pagetable(struct kvm *kvm)
-{
- struct kvm_userspace_memory_region kvm_userspace_mem;
- int r = 0;
-
- mutex_lock(&kvm->slots_lock);
- if (kvm->arch.ept_identity_pagetable)
- goto out;
- kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
- kvm_userspace_mem.flags = 0;
- kvm_userspace_mem.guest_phys_addr =
- kvm->arch.ept_identity_map_addr;
- kvm_userspace_mem.memory_size = PAGE_SIZE;
- r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
- if (r)
- goto out;
-
- kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
- kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
-out:
- mutex_unlock(&kvm->slots_lock);
- return r;
-}
-
-static void allocate_vpid(struct vcpu_vmx *vmx)
-{
- int vpid;
-
- vmx->vpid = 0;
- if (!enable_vpid)
- return;
- spin_lock(&vmx_vpid_lock);
- vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
- if (vpid < VMX_NR_VPIDS) {
- vmx->vpid = vpid;
- __set_bit(vpid, vmx_vpid_bitmap);
- }
- spin_unlock(&vmx_vpid_lock);
-}
-
-static void free_vpid(struct vcpu_vmx *vmx)
-{
- if (!enable_vpid)
- return;
- spin_lock(&vmx_vpid_lock);
- if (vmx->vpid != 0)
- __clear_bit(vmx->vpid, vmx_vpid_bitmap);
- spin_unlock(&vmx_vpid_lock);
-}
-
-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
-{
- int f = sizeof(unsigned long);
-
- if (!cpu_has_vmx_msr_bitmap())
- return;
-
- /*
- * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
- * have the write-low and read-high bitmap offsets the wrong way round.
- * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
- */
- if (msr <= 0x1fff) {
- __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
- __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
- __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
- }
-}
-
-static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
-{
- if (!longmode_only)
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
-}
-
-/*
- * Set up the vmcs's constant host-state fields, i.e., host-state fields that
- * will not change in the lifetime of the guest.
- * Note that host-state that does change is set elsewhere. E.g., host-state
- * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
- */
-static void vmx_set_constant_host_state(void)
-{
- u32 low32, high32;
- unsigned long tmpl;
- struct desc_ptr dt;
-
- vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
- vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
- vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
-
- vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
- vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
- vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
- vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
- vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
-
- native_store_idt(&dt);
- vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
-
- asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
- vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
-
- rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
- vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
- rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
- vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
-
- if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
- rdmsr(MSR_IA32_CR_PAT, low32, high32);
- vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
- }
-}
-
-static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
-{
- vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
- if (enable_ept)
- vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
- if (is_guest_mode(&vmx->vcpu))
- vmx->vcpu.arch.cr4_guest_owned_bits &=
- ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
- vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
-}
-
-static u32 vmx_exec_control(struct vcpu_vmx *vmx)
-{
- u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
- if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
- exec_control &= ~CPU_BASED_TPR_SHADOW;
-#ifdef CONFIG_X86_64
- exec_control |= CPU_BASED_CR8_STORE_EXITING |
- CPU_BASED_CR8_LOAD_EXITING;
-#endif
- }
- if (!enable_ept)
- exec_control |= CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_INVLPG_EXITING;
- return exec_control;
-}
-
-static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
-{
- u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
- if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
- exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
- if (vmx->vpid == 0)
- exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
- if (!enable_ept) {
- exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
- enable_unrestricted_guest = 0;
- }
- if (!enable_unrestricted_guest)
- exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
- if (!ple_gap)
- exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
- return exec_control;
-}
-
-static void ept_set_mmio_spte_mask(void)
-{
- /*
- * EPT Misconfigurations can be generated if the value of bits 2:0
- * of an EPT paging-structure entry is 110b (write/execute).
- * Also, magic bits (0xffull << 49) is set to quickly identify mmio
- * spte.
- */
- kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
-}
-
-/*
- * Sets up the vmcs for emulated real mode.
- */
-static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
-{
-#ifdef CONFIG_X86_64
- unsigned long a;
-#endif
- int i;
-
- /* I/O */
- vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
- vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
-
- if (cpu_has_vmx_msr_bitmap())
- vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
-
- vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
-
- /* Control */
- vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
- vmcs_config.pin_based_exec_ctrl);
-
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
-
- if (cpu_has_secondary_exec_ctrls()) {
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
- vmx_secondary_exec_control(vmx));
- }
-
- if (ple_gap) {
- vmcs_write32(PLE_GAP, ple_gap);
- vmcs_write32(PLE_WINDOW, ple_window);
- }
-
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
- vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
-
- vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
- vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
- vmx_set_constant_host_state();
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_FS_BASE, a);
- vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
- rdmsrl(MSR_GS_BASE, a);
- vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
-#else
- vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
- vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
-#endif
-
- vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
- vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
- vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
-
- if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
- u32 msr_low, msr_high;
- u64 host_pat;
- rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
- host_pat = msr_low | ((u64) msr_high << 32);
- /* Write the default value follow host pat */
- vmcs_write64(GUEST_IA32_PAT, host_pat);
- /* Keep arch.pat sync with GUEST_IA32_PAT */
- vmx->vcpu.arch.pat = host_pat;
- }
-
- for (i = 0; i < NR_VMX_MSR; ++i) {
- u32 index = vmx_msr_index[i];
- u32 data_low, data_high;
- int j = vmx->nmsrs;
-
- if (rdmsr_safe(index, &data_low, &data_high) < 0)
- continue;
- if (wrmsr_safe(index, data_low, data_high) < 0)
- continue;
- vmx->guest_msrs[j].index = i;
- vmx->guest_msrs[j].data = 0;
- vmx->guest_msrs[j].mask = -1ull;
- ++vmx->nmsrs;
- }
-
- vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
-
- /* 22.2.1, 20.8.1 */
- vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
-
- vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
- set_cr4_guest_host_mask(vmx);
-
- kvm_write_tsc(&vmx->vcpu, 0);
-
- return 0;
-}
-
-static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u64 msr;
- int ret;
-
- vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
-
- vmx->rmode.vm86_active = 0;
-
- vmx->soft_vnmi_blocked = 0;
-
- vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
- kvm_set_cr8(&vmx->vcpu, 0);
- msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_bsp(&vmx->vcpu))
- msr |= MSR_IA32_APICBASE_BSP;
- kvm_set_apic_base(&vmx->vcpu, msr);
-
- ret = fx_init(&vmx->vcpu);
- if (ret != 0)
- goto out;
-
- vmx_segment_cache_clear(vmx);
-
- seg_setup(VCPU_SREG_CS);
- /*
- * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
- * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
- */
- if (kvm_vcpu_is_bsp(&vmx->vcpu)) {
- vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
- vmcs_writel(GUEST_CS_BASE, 0x000f0000);
- } else {
- vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
- vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
- }
-
- seg_setup(VCPU_SREG_DS);
- seg_setup(VCPU_SREG_ES);
- seg_setup(VCPU_SREG_FS);
- seg_setup(VCPU_SREG_GS);
- seg_setup(VCPU_SREG_SS);
-
- vmcs_write16(GUEST_TR_SELECTOR, 0);
- vmcs_writel(GUEST_TR_BASE, 0);
- vmcs_write32(GUEST_TR_LIMIT, 0xffff);
- vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
-
- vmcs_write16(GUEST_LDTR_SELECTOR, 0);
- vmcs_writel(GUEST_LDTR_BASE, 0);
- vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
- vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
-
- vmcs_write32(GUEST_SYSENTER_CS, 0);
- vmcs_writel(GUEST_SYSENTER_ESP, 0);
- vmcs_writel(GUEST_SYSENTER_EIP, 0);
-
- vmcs_writel(GUEST_RFLAGS, 0x02);
- if (kvm_vcpu_is_bsp(&vmx->vcpu))
- kvm_rip_write(vcpu, 0xfff0);
- else
- kvm_rip_write(vcpu, 0);
- kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
-
- vmcs_writel(GUEST_DR7, 0x400);
-
- vmcs_writel(GUEST_GDTR_BASE, 0);
- vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
-
- vmcs_writel(GUEST_IDTR_BASE, 0);
- vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
-
- vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
- vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
- vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
-
- /* Special registers */
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
-
- setup_msrs(vmx);
-
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
-
- if (cpu_has_vmx_tpr_shadow()) {
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
- if (vm_need_tpr_shadow(vmx->vcpu.kvm))
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
- __pa(vmx->vcpu.arch.apic->regs));
- vmcs_write32(TPR_THRESHOLD, 0);
- }
-
- if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
- vmcs_write64(APIC_ACCESS_ADDR,
- page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
-
- if (vmx->vpid != 0)
- vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
-
- vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- vmx_set_cr4(&vmx->vcpu, 0);
- vmx_set_efer(&vmx->vcpu, 0);
- vmx_fpu_activate(&vmx->vcpu);
- update_exception_bitmap(&vmx->vcpu);
-
- vpid_sync_context(vmx);
-
- ret = 0;
-
- /* HACK: Don't enable emulation on guest boot/reset */
- vmx->emulation_required = 0;
-
-out:
- return ret;
-}
-
-/*
- * In nested virtualization, check if L1 asked to exit on external interrupts.
- * For most existing hypervisors, this will always return true.
- */
-static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
-{
- return get_vmcs12(vcpu)->pin_based_vm_exec_control &
- PIN_BASED_EXT_INTR_MASK;
-}
-
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
- u32 cpu_based_vm_exec_control;
- if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
- /*
- * We get here if vmx_interrupt_allowed() said we can't
- * inject to L1 now because L2 must run. Ask L2 to exit
- * right after entry, so we can inject to L1 more promptly.
- */
- kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
- return;
- }
-
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
-{
- u32 cpu_based_vm_exec_control;
-
- if (!cpu_has_virtual_nmis()) {
- enable_irq_window(vcpu);
- return;
- }
-
- if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
- enable_irq_window(vcpu);
- return;
- }
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static void vmx_inject_irq(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- uint32_t intr;
- int irq = vcpu->arch.interrupt.nr;
-
- trace_kvm_inj_virq(irq);
-
- ++vcpu->stat.irq_injections;
- if (vmx->rmode.vm86_active) {
- int inc_eip = 0;
- if (vcpu->arch.interrupt.soft)
- inc_eip = vcpu->arch.event_exit_inst_len;
- if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
- return;
- }
- intr = irq | INTR_INFO_VALID_MASK;
- if (vcpu->arch.interrupt.soft) {
- intr |= INTR_TYPE_SOFT_INTR;
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
- vmx->vcpu.arch.event_exit_inst_len);
- } else
- intr |= INTR_TYPE_EXT_INTR;
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
-}
-
-static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (is_guest_mode(vcpu))
- return;
-
- if (!cpu_has_virtual_nmis()) {
- /*
- * Tracking the NMI-blocked state in software is built upon
- * finding the next open IRQ window. This, in turn, depends on
- * well-behaving guests: They have to keep IRQs disabled at
- * least as long as the NMI handler runs. Otherwise we may
- * cause NMI nesting, maybe breaking the guest. But as this is
- * highly unlikely, we can live with the residual risk.
- */
- vmx->soft_vnmi_blocked = 1;
- vmx->vnmi_blocked_time = 0;
- }
-
- ++vcpu->stat.nmi_injections;
- vmx->nmi_known_unmasked = false;
- if (vmx->rmode.vm86_active) {
- if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
- return;
- }
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
-}
-
-static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
-{
- if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
- return 0;
-
- return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
- | GUEST_INTR_STATE_NMI));
-}
-
-static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
-{
- if (!cpu_has_virtual_nmis())
- return to_vmx(vcpu)->soft_vnmi_blocked;
- if (to_vmx(vcpu)->nmi_known_unmasked)
- return false;
- return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
-}
-
-static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (!cpu_has_virtual_nmis()) {
- if (vmx->soft_vnmi_blocked != masked) {
- vmx->soft_vnmi_blocked = masked;
- vmx->vnmi_blocked_time = 0;
- }
- } else {
- vmx->nmi_known_unmasked = !masked;
- if (masked)
- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
- else
- vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
- }
-}
-
-static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
-{
- if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- if (to_vmx(vcpu)->nested.nested_run_pending ||
- (vmcs12->idt_vectoring_info_field &
- VECTORING_INFO_VALID_MASK))
- return 0;
- nested_vmx_vmexit(vcpu);
- vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
- vmcs12->vm_exit_intr_info = 0;
- /* fall through to normal code, but now in L1, not L2 */
- }
-
- return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
- !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
-}
-
-static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
-{
- int ret;
- struct kvm_userspace_memory_region tss_mem = {
- .slot = TSS_PRIVATE_MEMSLOT,
- .guest_phys_addr = addr,
- .memory_size = PAGE_SIZE * 3,
- .flags = 0,
- };
-
- ret = kvm_set_memory_region(kvm, &tss_mem, 0);
- if (ret)
- return ret;
- kvm->arch.tss_addr = addr;
- if (!init_rmode_tss(kvm))
- return -ENOMEM;
-
- return 0;
-}
-
-static int handle_rmode_exception(struct kvm_vcpu *vcpu,
- int vec, u32 err_code)
-{
- /*
- * Instruction with address size override prefix opcode 0x67
- * Cause the #SS fault with 0 error code in VM86 mode.
- */
- if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
- if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
- return 1;
- /*
- * Forward all other exceptions that are valid in real mode.
- * FIXME: Breaks guest debugging in real mode, needs to be fixed with
- * the required debugging infrastructure rework.
- */
- switch (vec) {
- case DB_VECTOR:
- if (vcpu->guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- return 0;
- kvm_queue_exception(vcpu, vec);
- return 1;
- case BP_VECTOR:
- /*
- * Update instruction length as we may reinject the exception
- * from user space while in guest debugging mode.
- */
- to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
- vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
- return 0;
- /* fall through */
- case DE_VECTOR:
- case OF_VECTOR:
- case BR_VECTOR:
- case UD_VECTOR:
- case DF_VECTOR:
- case SS_VECTOR:
- case GP_VECTOR:
- case MF_VECTOR:
- kvm_queue_exception(vcpu, vec);
- return 1;
- }
- return 0;
-}
-
-/*
- * Trigger machine check on the host. We assume all the MSRs are already set up
- * by the CPU and that we still run on the same CPU as the MCE occurred on.
- * We pass a fake environment to the machine check handler because we want
- * the guest to be always treated like user space, no matter what context
- * it used internally.
- */
-static void kvm_machine_check(void)
-{
-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
- struct pt_regs regs = {
- .cs = 3, /* Fake ring 3 no matter what the guest ran on */
- .flags = X86_EFLAGS_IF,
- };
-
- do_machine_check(&regs, 0);
-#endif
-}
-
-static int handle_machine_check(struct kvm_vcpu *vcpu)
-{
- /* already handled by vcpu_run */
- return 1;
-}
-
-static int handle_exception(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_run *kvm_run = vcpu->run;
- u32 intr_info, ex_no, error_code;
- unsigned long cr2, rip, dr6;
- u32 vect_info;
- enum emulation_result er;
-
- vect_info = vmx->idt_vectoring_info;
- intr_info = vmx->exit_intr_info;
-
- if (is_machine_check(intr_info))
- return handle_machine_check(vcpu);
-
- if ((vect_info & VECTORING_INFO_VALID_MASK) &&
- !is_page_fault(intr_info)) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = vect_info;
- vcpu->run->internal.data[1] = intr_info;
- return 0;
- }
-
- if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
- return 1; /* already handled by vmx_vcpu_run() */
-
- if (is_no_device(intr_info)) {
- vmx_fpu_activate(vcpu);
- return 1;
- }
-
- if (is_invalid_opcode(intr_info)) {
- er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
- if (er != EMULATE_DONE)
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
-
- error_code = 0;
- if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
- error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
- if (is_page_fault(intr_info)) {
- /* EPT won't cause page fault directly */
- BUG_ON(enable_ept);
- cr2 = vmcs_readl(EXIT_QUALIFICATION);
- trace_kvm_page_fault(cr2, error_code);
-
- if (kvm_event_needs_reinjection(vcpu))
- kvm_mmu_unprotect_page_virt(vcpu, cr2);
- return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
- }
-
- if (vmx->rmode.vm86_active &&
- handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
- error_code)) {
- if (vcpu->arch.halt_request) {
- vcpu->arch.halt_request = 0;
- return kvm_emulate_halt(vcpu);
- }
- return 1;
- }
-
- ex_no = intr_info & INTR_INFO_VECTOR_MASK;
- switch (ex_no) {
- case DB_VECTOR:
- dr6 = vmcs_readl(EXIT_QUALIFICATION);
- if (!(vcpu->guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
- vcpu->arch.dr6 = dr6 | DR6_FIXED_1;
- kvm_queue_exception(vcpu, DB_VECTOR);
- return 1;
- }
- kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
- kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
- /* fall through */
- case BP_VECTOR:
- /*
- * Update instruction length as we may reinject #BP from
- * user space while in guest debugging mode. Reading it for
- * #DB as well causes no harm, it is not used in that case.
- */
- vmx->vcpu.arch.event_exit_inst_len =
- vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- rip = kvm_rip_read(vcpu);
- kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
- kvm_run->debug.arch.exception = ex_no;
- break;
- default:
- kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
- kvm_run->ex.exception = ex_no;
- kvm_run->ex.error_code = error_code;
- break;
- }
- return 0;
-}
-
-static int handle_external_interrupt(struct kvm_vcpu *vcpu)
-{
- ++vcpu->stat.irq_exits;
- return 1;
-}
-
-static int handle_triple_fault(struct kvm_vcpu *vcpu)
-{
- vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
- return 0;
-}
-
-static int handle_io(struct kvm_vcpu *vcpu)
-{
- unsigned long exit_qualification;
- int size, in, string;
- unsigned port;
-
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- string = (exit_qualification & 16) != 0;
- in = (exit_qualification & 8) != 0;
-
- ++vcpu->stat.io_exits;
-
- if (string || in)
- return emulate_instruction(vcpu, 0) == EMULATE_DONE;
-
- port = exit_qualification >> 16;
- size = (exit_qualification & 7) + 1;
- skip_emulated_instruction(vcpu);
-
- return kvm_fast_pio_out(vcpu, size, port);
-}
-
-static void
-vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
-{
- /*
- * Patch in the VMCALL instruction:
- */
- hypercall[0] = 0x0f;
- hypercall[1] = 0x01;
- hypercall[2] = 0xc1;
-}
-
-/* called to set cr0 as approriate for a mov-to-cr0 exit. */
-static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
-{
- if (to_vmx(vcpu)->nested.vmxon &&
- ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
- return 1;
-
- if (is_guest_mode(vcpu)) {
- /*
- * We get here when L2 changed cr0 in a way that did not change
- * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
- * but did change L0 shadowed bits. This can currently happen
- * with the TS bit: L0 may want to leave TS on (for lazy fpu
- * loading) while pretending to allow the guest to change it.
- */
- if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) |
- (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits)))
- return 1;
- vmcs_writel(CR0_READ_SHADOW, val);
- return 0;
- } else
- return kvm_set_cr0(vcpu, val);
-}
-
-static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
-{
- if (is_guest_mode(vcpu)) {
- if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) |
- (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits)))
- return 1;
- vmcs_writel(CR4_READ_SHADOW, val);
- return 0;
- } else
- return kvm_set_cr4(vcpu, val);
-}
-
-/* called to set cr0 as approriate for clts instruction exit. */
-static void handle_clts(struct kvm_vcpu *vcpu)
-{
- if (is_guest_mode(vcpu)) {
- /*
- * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
- * but we did (!fpu_active). We need to keep GUEST_CR0.TS on,
- * just pretend it's off (also in arch.cr0 for fpu_activate).
- */
- vmcs_writel(CR0_READ_SHADOW,
- vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
- vcpu->arch.cr0 &= ~X86_CR0_TS;
- } else
- vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
-}
-
-static int handle_cr(struct kvm_vcpu *vcpu)
-{
- unsigned long exit_qualification, val;
- int cr;
- int reg;
- int err;
-
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- cr = exit_qualification & 15;
- reg = (exit_qualification >> 8) & 15;
- switch ((exit_qualification >> 4) & 3) {
- case 0: /* mov to cr */
- val = kvm_register_read(vcpu, reg);
- trace_kvm_cr_write(cr, val);
- switch (cr) {
- case 0:
- err = handle_set_cr0(vcpu, val);
- kvm_complete_insn_gp(vcpu, err);
- return 1;
- case 3:
- err = kvm_set_cr3(vcpu, val);
- kvm_complete_insn_gp(vcpu, err);
- return 1;
- case 4:
- err = handle_set_cr4(vcpu, val);
- kvm_complete_insn_gp(vcpu, err);
- return 1;
- case 8: {
- u8 cr8_prev = kvm_get_cr8(vcpu);
- u8 cr8 = kvm_register_read(vcpu, reg);
- err = kvm_set_cr8(vcpu, cr8);
- kvm_complete_insn_gp(vcpu, err);
- if (irqchip_in_kernel(vcpu->kvm))
- return 1;
- if (cr8_prev <= cr8)
- return 1;
- vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
- return 0;
- }
- };
- break;
- case 2: /* clts */
- handle_clts(vcpu);
- trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
- skip_emulated_instruction(vcpu);
- vmx_fpu_activate(vcpu);
- return 1;
- case 1: /*mov from cr*/
- switch (cr) {
- case 3:
- val = kvm_read_cr3(vcpu);
- kvm_register_write(vcpu, reg, val);
- trace_kvm_cr_read(cr, val);
- skip_emulated_instruction(vcpu);
- return 1;
- case 8:
- val = kvm_get_cr8(vcpu);
- kvm_register_write(vcpu, reg, val);
- trace_kvm_cr_read(cr, val);
- skip_emulated_instruction(vcpu);
- return 1;
- }
- break;
- case 3: /* lmsw */
- val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
- trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
- kvm_lmsw(vcpu, val);
-
- skip_emulated_instruction(vcpu);
- return 1;
- default:
- break;
- }
- vcpu->run->exit_reason = 0;
- pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
- (int)(exit_qualification >> 4) & 3, cr);
- return 0;
-}
-
-static int handle_dr(struct kvm_vcpu *vcpu)
-{
- unsigned long exit_qualification;
- int dr, reg;
-
- /* Do not handle if the CPL > 0, will trigger GP on re-entry */
- if (!kvm_require_cpl(vcpu, 0))
- return 1;
- dr = vmcs_readl(GUEST_DR7);
- if (dr & DR7_GD) {
- /*
- * As the vm-exit takes precedence over the debug trap, we
- * need to emulate the latter, either for the host or the
- * guest debugging itself.
- */
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
- vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
- vcpu->run->debug.arch.dr7 = dr;
- vcpu->run->debug.arch.pc =
- vmcs_readl(GUEST_CS_BASE) +
- vmcs_readl(GUEST_RIP);
- vcpu->run->debug.arch.exception = DB_VECTOR;
- vcpu->run->exit_reason = KVM_EXIT_DEBUG;
- return 0;
- } else {
- vcpu->arch.dr7 &= ~DR7_GD;
- vcpu->arch.dr6 |= DR6_BD;
- vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
- kvm_queue_exception(vcpu, DB_VECTOR);
- return 1;
- }
- }
-
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
- reg = DEBUG_REG_ACCESS_REG(exit_qualification);
- if (exit_qualification & TYPE_MOV_FROM_DR) {
- unsigned long val;
- if (!kvm_get_dr(vcpu, dr, &val))
- kvm_register_write(vcpu, reg, val);
- } else
- kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
- skip_emulated_instruction(vcpu);
- return 1;
-}
-
-static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
-{
- vmcs_writel(GUEST_DR7, val);
-}
-
-static int handle_cpuid(struct kvm_vcpu *vcpu)
-{
- kvm_emulate_cpuid(vcpu);
- return 1;
-}
-
-static int handle_rdmsr(struct kvm_vcpu *vcpu)
-{
- u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
- u64 data;
-
- if (vmx_get_msr(vcpu, ecx, &data)) {
- trace_kvm_msr_read_ex(ecx);
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
- trace_kvm_msr_read(ecx, data);
-
- /* FIXME: handling of bits 32:63 of rax, rdx */
- vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
- vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
- skip_emulated_instruction(vcpu);
- return 1;
-}
-
-static int handle_wrmsr(struct kvm_vcpu *vcpu)
-{
- u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
- u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
- | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
-
- if (vmx_set_msr(vcpu, ecx, data) != 0) {
- trace_kvm_msr_write_ex(ecx, data);
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
- trace_kvm_msr_write(ecx, data);
- skip_emulated_instruction(vcpu);
- return 1;
-}
-
-static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
-{
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- return 1;
-}
-
-static int handle_interrupt_window(struct kvm_vcpu *vcpu)
-{
- u32 cpu_based_vm_exec_control;
-
- /* clear pending irq */
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-
- ++vcpu->stat.irq_window_exits;
-
- /*
- * If the user space waits to inject interrupts, exit as soon as
- * possible
- */
- if (!irqchip_in_kernel(vcpu->kvm) &&
- vcpu->run->request_interrupt_window &&
- !kvm_cpu_has_interrupt(vcpu)) {
- vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
- return 0;
- }
- return 1;
-}
-
-static int handle_halt(struct kvm_vcpu *vcpu)
-{
- skip_emulated_instruction(vcpu);
- return kvm_emulate_halt(vcpu);
-}
-
-static int handle_vmcall(struct kvm_vcpu *vcpu)
-{
- skip_emulated_instruction(vcpu);
- kvm_emulate_hypercall(vcpu);
- return 1;
-}
-
-static int handle_invd(struct kvm_vcpu *vcpu)
-{
- return emulate_instruction(vcpu, 0) == EMULATE_DONE;
-}
-
-static int handle_invlpg(struct kvm_vcpu *vcpu)
-{
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-
- kvm_mmu_invlpg(vcpu, exit_qualification);
- skip_emulated_instruction(vcpu);
- return 1;
-}
-
-static int handle_rdpmc(struct kvm_vcpu *vcpu)
-{
- int err;
-
- err = kvm_rdpmc(vcpu);
- kvm_complete_insn_gp(vcpu, err);
-
- return 1;
-}
-
-static int handle_wbinvd(struct kvm_vcpu *vcpu)
-{
- skip_emulated_instruction(vcpu);
- kvm_emulate_wbinvd(vcpu);
- return 1;
-}
-
-static int handle_xsetbv(struct kvm_vcpu *vcpu)
-{
- u64 new_bv = kvm_read_edx_eax(vcpu);
- u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
-
- if (kvm_set_xcr(vcpu, index, new_bv) == 0)
- skip_emulated_instruction(vcpu);
- return 1;
-}
-
-static int handle_apic_access(struct kvm_vcpu *vcpu)
-{
- if (likely(fasteoi)) {
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- int access_type, offset;
-
- access_type = exit_qualification & APIC_ACCESS_TYPE;
- offset = exit_qualification & APIC_ACCESS_OFFSET;
- /*
- * Sane guest uses MOV to write EOI, with written value
- * not cared. So make a short-circuit here by avoiding
- * heavy instruction emulation.
- */
- if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
- (offset == APIC_EOI)) {
- kvm_lapic_set_eoi(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
- }
- }
- return emulate_instruction(vcpu, 0) == EMULATE_DONE;
-}
-
-static int handle_task_switch(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long exit_qualification;
- bool has_error_code = false;
- u32 error_code = 0;
- u16 tss_selector;
- int reason, type, idt_v, idt_index;
-
- idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
- idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
- type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
-
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-
- reason = (u32)exit_qualification >> 30;
- if (reason == TASK_SWITCH_GATE && idt_v) {
- switch (type) {
- case INTR_TYPE_NMI_INTR:
- vcpu->arch.nmi_injected = false;
- vmx_set_nmi_mask(vcpu, true);
- break;
- case INTR_TYPE_EXT_INTR:
- case INTR_TYPE_SOFT_INTR:
- kvm_clear_interrupt_queue(vcpu);
- break;
- case INTR_TYPE_HARD_EXCEPTION:
- if (vmx->idt_vectoring_info &
- VECTORING_INFO_DELIVER_CODE_MASK) {
- has_error_code = true;
- error_code =
- vmcs_read32(IDT_VECTORING_ERROR_CODE);
- }
- /* fall through */
- case INTR_TYPE_SOFT_EXCEPTION:
- kvm_clear_exception_queue(vcpu);
- break;
- default:
- break;
- }
- }
- tss_selector = exit_qualification;
-
- if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
- type != INTR_TYPE_EXT_INTR &&
- type != INTR_TYPE_NMI_INTR))
- skip_emulated_instruction(vcpu);
-
- if (kvm_task_switch(vcpu, tss_selector,
- type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
- has_error_code, error_code) == EMULATE_FAIL) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
- return 0;
- }
-
- /* clear all local breakpoint enable flags */
- vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
-
- /*
- * TODO: What about debug traps on tss switch?
- * Are we supposed to inject them and update dr6?
- */
-
- return 1;
-}
-
-static int handle_ept_violation(struct kvm_vcpu *vcpu)
-{
- unsigned long exit_qualification;
- gpa_t gpa;
- int gla_validity;
-
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-
- if (exit_qualification & (1 << 6)) {
- printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
- return -EINVAL;
- }
-
- gla_validity = (exit_qualification >> 7) & 0x3;
- if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
- printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
- printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
- (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
- vmcs_readl(GUEST_LINEAR_ADDRESS));
- printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
- (long unsigned int)exit_qualification);
- vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
- vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
- return 0;
- }
-
- gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
- trace_kvm_page_fault(gpa, exit_qualification);
- return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
-}
-
-static u64 ept_rsvd_mask(u64 spte, int level)
-{
- int i;
- u64 mask = 0;
-
- for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
- mask |= (1ULL << i);
-
- if (level > 2)
- /* bits 7:3 reserved */
- mask |= 0xf8;
- else if (level == 2) {
- if (spte & (1ULL << 7))
- /* 2MB ref, bits 20:12 reserved */
- mask |= 0x1ff000;
- else
- /* bits 6:3 reserved */
- mask |= 0x78;
- }
-
- return mask;
-}
-
-static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
- int level)
-{
- printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level);
-
- /* 010b (write-only) */
- WARN_ON((spte & 0x7) == 0x2);
-
- /* 110b (write/execute) */
- WARN_ON((spte & 0x7) == 0x6);
-
- /* 100b (execute-only) and value not supported by logical processor */
- if (!cpu_has_vmx_ept_execute_only())
- WARN_ON((spte & 0x7) == 0x4);
-
- /* not 000b */
- if ((spte & 0x7)) {
- u64 rsvd_bits = spte & ept_rsvd_mask(spte, level);
-
- if (rsvd_bits != 0) {
- printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n",
- __func__, rsvd_bits);
- WARN_ON(1);
- }
-
- if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) {
- u64 ept_mem_type = (spte & 0x38) >> 3;
-
- if (ept_mem_type == 2 || ept_mem_type == 3 ||
- ept_mem_type == 7) {
- printk(KERN_ERR "%s: ept_mem_type=0x%llx\n",
- __func__, ept_mem_type);
- WARN_ON(1);
- }
- }
- }
-}
-
-static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
-{
- u64 sptes[4];
- int nr_sptes, i, ret;
- gpa_t gpa;
-
- gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
-
- ret = handle_mmio_page_fault_common(vcpu, gpa, true);
- if (likely(ret == 1))
- return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
- EMULATE_DONE;
- if (unlikely(!ret))
- return 1;
-
- /* It is the real ept misconfig */
- printk(KERN_ERR "EPT: Misconfiguration.\n");
- printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
-
- nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
-
- for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
- ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
-
- vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
- vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
-
- return 0;
-}
-
-static int handle_nmi_window(struct kvm_vcpu *vcpu)
-{
- u32 cpu_based_vm_exec_control;
-
- /* clear pending NMI */
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
- ++vcpu->stat.nmi_window_exits;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-
- return 1;
-}
-
-static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- enum emulation_result err = EMULATE_DONE;
- int ret = 1;
- u32 cpu_exec_ctrl;
- bool intr_window_requested;
-
- cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
-
- while (!guest_state_valid(vcpu)) {
- if (intr_window_requested
- && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
- return handle_interrupt_window(&vmx->vcpu);
-
- err = emulate_instruction(vcpu, 0);
-
- if (err == EMULATE_DO_MMIO) {
- ret = 0;
- goto out;
- }
-
- if (err != EMULATE_DONE)
- return 0;
-
- if (signal_pending(current))
- goto out;
- if (need_resched())
- schedule();
- }
-
- vmx->emulation_required = 0;
-out:
- return ret;
-}
-
-/*
- * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
- * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
- */
-static int handle_pause(struct kvm_vcpu *vcpu)
-{
- skip_emulated_instruction(vcpu);
- kvm_vcpu_on_spin(vcpu);
-
- return 1;
-}
-
-static int handle_invalid_op(struct kvm_vcpu *vcpu)
-{
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
-}
-
-/*
- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
- * We could reuse a single VMCS for all the L2 guests, but we also want the
- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
- * allows keeping them loaded on the processor, and in the future will allow
- * optimizations where prepare_vmcs02 doesn't need to set all the fields on
- * every entry if they never change.
- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
- *
- * The following functions allocate and free a vmcs02 in this pool.
- */
-
-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
-{
- struct vmcs02_list *item;
- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
- if (item->vmptr == vmx->nested.current_vmptr) {
- list_move(&item->list, &vmx->nested.vmcs02_pool);
- return &item->vmcs02;
- }
-
- if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
- /* Recycle the least recently used VMCS. */
- item = list_entry(vmx->nested.vmcs02_pool.prev,
- struct vmcs02_list, list);
- item->vmptr = vmx->nested.current_vmptr;
- list_move(&item->list, &vmx->nested.vmcs02_pool);
- return &item->vmcs02;
- }
-
- /* Create a new VMCS */
- item = (struct vmcs02_list *)
- kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
- if (!item)
- return NULL;
- item->vmcs02.vmcs = alloc_vmcs();
- if (!item->vmcs02.vmcs) {
- kfree(item);
- return NULL;
- }
- loaded_vmcs_init(&item->vmcs02);
- item->vmptr = vmx->nested.current_vmptr;
- list_add(&(item->list), &(vmx->nested.vmcs02_pool));
- vmx->nested.vmcs02_num++;
- return &item->vmcs02;
-}
-
-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
-{
- struct vmcs02_list *item;
- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
- if (item->vmptr == vmptr) {
- free_loaded_vmcs(&item->vmcs02);
- list_del(&item->list);
- kfree(item);
- vmx->nested.vmcs02_num--;
- return;
- }
-}
-
-/*
- * Free all VMCSs saved for this vcpu, except the one pointed by
- * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one
- * currently used, if running L2), and vmcs01 when running L2.
- */
-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
-{
- struct vmcs02_list *item, *n;
- list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
- if (vmx->loaded_vmcs != &item->vmcs02)
- free_loaded_vmcs(&item->vmcs02);
- list_del(&item->list);
- kfree(item);
- }
- vmx->nested.vmcs02_num = 0;
-
- if (vmx->loaded_vmcs != &vmx->vmcs01)
- free_loaded_vmcs(&vmx->vmcs01);
-}
-
-/*
- * Emulate the VMXON instruction.
- * Currently, we just remember that VMX is active, and do not save or even
- * inspect the argument to VMXON (the so-called "VMXON pointer") because we
- * do not currently need to store anything in that guest-allocated memory
- * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their
- * argument is different from the VMXON pointer (which the spec says they do).
- */
-static int handle_vmon(struct kvm_vcpu *vcpu)
-{
- struct kvm_segment cs;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- /* The Intel VMX Instruction Reference lists a bunch of bits that
- * are prerequisite to running VMXON, most notably cr4.VMXE must be
- * set to 1 (see vmx_set_cr4() for when we allow the guest to set this).
- * Otherwise, we should fail with #UD. We test these now:
- */
- if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) ||
- !kvm_read_cr0_bits(vcpu, X86_CR0_PE) ||
- (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
-
- vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
- if (is_long_mode(vcpu) && !cs.l) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
-
- if (vmx_get_cpl(vcpu)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
- INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
- vmx->nested.vmcs02_num = 0;
-
- vmx->nested.vmxon = true;
-
- skip_emulated_instruction(vcpu);
- return 1;
-}
-
-/*
- * Intel's VMX Instruction Reference specifies a common set of prerequisites
- * for running VMX instructions (except VMXON, whose prerequisites are
- * slightly different). It also specifies what exception to inject otherwise.
- */
-static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
-{
- struct kvm_segment cs;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (!vmx->nested.vmxon) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 0;
- }
-
- vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
- if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
- (is_long_mode(vcpu) && !cs.l)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 0;
- }
-
- if (vmx_get_cpl(vcpu)) {
- kvm_inject_gp(vcpu, 0);
- return 0;
- }
-
- return 1;
-}
-
-/*
- * Free whatever needs to be freed from vmx->nested when L1 goes down, or
- * just stops using VMX.
- */
-static void free_nested(struct vcpu_vmx *vmx)
-{
- if (!vmx->nested.vmxon)
- return;
- vmx->nested.vmxon = false;
- if (vmx->nested.current_vmptr != -1ull) {
- kunmap(vmx->nested.current_vmcs12_page);
- nested_release_page(vmx->nested.current_vmcs12_page);
- vmx->nested.current_vmptr = -1ull;
- vmx->nested.current_vmcs12 = NULL;
- }
- /* Unpin physical memory we referred to in current vmcs02 */
- if (vmx->nested.apic_access_page) {
- nested_release_page(vmx->nested.apic_access_page);
- vmx->nested.apic_access_page = 0;
- }
-
- nested_free_all_saved_vmcss(vmx);
-}
-
-/* Emulate the VMXOFF instruction */
-static int handle_vmoff(struct kvm_vcpu *vcpu)
-{
- if (!nested_vmx_check_permission(vcpu))
- return 1;
- free_nested(to_vmx(vcpu));
- skip_emulated_instruction(vcpu);
- return 1;
-}
-
-/*
- * Decode the memory-address operand of a vmx instruction, as recorded on an
- * exit caused by such an instruction (run by a guest hypervisor).
- * On success, returns 0. When the operand is invalid, returns 1 and throws
- * #UD or #GP.
- */
-static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
- unsigned long exit_qualification,
- u32 vmx_instruction_info, gva_t *ret)
-{
- /*
- * According to Vol. 3B, "Information for VM Exits Due to Instruction
- * Execution", on an exit, vmx_instruction_info holds most of the
- * addressing components of the operand. Only the displacement part
- * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
- * For how an actual address is calculated from all these components,
- * refer to Vol. 1, "Operand Addressing".
- */
- int scaling = vmx_instruction_info & 3;
- int addr_size = (vmx_instruction_info >> 7) & 7;
- bool is_reg = vmx_instruction_info & (1u << 10);
- int seg_reg = (vmx_instruction_info >> 15) & 7;
- int index_reg = (vmx_instruction_info >> 18) & 0xf;
- bool index_is_valid = !(vmx_instruction_info & (1u << 22));
- int base_reg = (vmx_instruction_info >> 23) & 0xf;
- bool base_is_valid = !(vmx_instruction_info & (1u << 27));
-
- if (is_reg) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
-
- /* Addr = segment_base + offset */
- /* offset = base + [index * scale] + displacement */
- *ret = vmx_get_segment_base(vcpu, seg_reg);
- if (base_is_valid)
- *ret += kvm_register_read(vcpu, base_reg);
- if (index_is_valid)
- *ret += kvm_register_read(vcpu, index_reg)<<scaling;
- *ret += exit_qualification; /* holds the displacement */
-
- if (addr_size == 1) /* 32 bit */
- *ret &= 0xffffffff;
-
- /*
- * TODO: throw #GP (and return 1) in various cases that the VM*
- * instructions require it - e.g., offset beyond segment limit,
- * unusable or unreadable/unwritable segment, non-canonical 64-bit
- * address, and so on. Currently these are not checked.
- */
- return 0;
-}
-
-/*
- * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
- * set the success or error code of an emulated VMX instruction, as specified
- * by Vol 2B, VMX Instruction Reference, "Conventions".
- */
-static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
-{
- vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
- & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
- X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
-}
-
-static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
-{
- vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
- & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
- X86_EFLAGS_SF | X86_EFLAGS_OF))
- | X86_EFLAGS_CF);
-}
-
-static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
- u32 vm_instruction_error)
-{
- if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
- /*
- * failValid writes the error number to the current VMCS, which
- * can't be done there isn't a current VMCS.
- */
- nested_vmx_failInvalid(vcpu);
- return;
- }
- vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
- & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
- X86_EFLAGS_SF | X86_EFLAGS_OF))
- | X86_EFLAGS_ZF);
- get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
-}
-
-/* Emulate the VMCLEAR instruction */
-static int handle_vmclear(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- gva_t gva;
- gpa_t vmptr;
- struct vmcs12 *vmcs12;
- struct page *page;
- struct x86_exception e;
-
- if (!nested_vmx_check_permission(vcpu))
- return 1;
-
- if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
- vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
- return 1;
-
- if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
- sizeof(vmptr), &e)) {
- kvm_inject_page_fault(vcpu, &e);
- return 1;
- }
-
- if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
- nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
- skip_emulated_instruction(vcpu);
- return 1;
- }
-
- if (vmptr == vmx->nested.current_vmptr) {
- kunmap(vmx->nested.current_vmcs12_page);
- nested_release_page(vmx->nested.current_vmcs12_page);
- vmx->nested.current_vmptr = -1ull;
- vmx->nested.current_vmcs12 = NULL;
- }
-
- page = nested_get_page(vcpu, vmptr);
- if (page == NULL) {
- /*
- * For accurate processor emulation, VMCLEAR beyond available
- * physical memory should do nothing at all. However, it is
- * possible that a nested vmx bug, not a guest hypervisor bug,
- * resulted in this case, so let's shut down before doing any
- * more damage:
- */
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
- return 1;
- }
- vmcs12 = kmap(page);
- vmcs12->launch_state = 0;
- kunmap(page);
- nested_release_page(page);
-
- nested_free_vmcs02(vmx, vmptr);
-
- skip_emulated_instruction(vcpu);
- nested_vmx_succeed(vcpu);
- return 1;
-}
-
-static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
-
-/* Emulate the VMLAUNCH instruction */
-static int handle_vmlaunch(struct kvm_vcpu *vcpu)
-{
- return nested_vmx_run(vcpu, true);
-}
-
-/* Emulate the VMRESUME instruction */
-static int handle_vmresume(struct kvm_vcpu *vcpu)
-{
-
- return nested_vmx_run(vcpu, false);
-}
-
-enum vmcs_field_type {
- VMCS_FIELD_TYPE_U16 = 0,
- VMCS_FIELD_TYPE_U64 = 1,
- VMCS_FIELD_TYPE_U32 = 2,
- VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
-};
-
-static inline int vmcs_field_type(unsigned long field)
-{
- if (0x1 & field) /* the *_HIGH fields are all 32 bit */
- return VMCS_FIELD_TYPE_U32;
- return (field >> 13) & 0x3 ;
-}
-
-static inline int vmcs_field_readonly(unsigned long field)
-{
- return (((field >> 10) & 0x3) == 1);
-}
-
-/*
- * Read a vmcs12 field. Since these can have varying lengths and we return
- * one type, we chose the biggest type (u64) and zero-extend the return value
- * to that size. Note that the caller, handle_vmread, might need to use only
- * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
- * 64-bit fields are to be returned).
- */
-static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
- unsigned long field, u64 *ret)
-{
- short offset = vmcs_field_to_offset(field);
- char *p;
-
- if (offset < 0)
- return 0;
-
- p = ((char *)(get_vmcs12(vcpu))) + offset;
-
- switch (vmcs_field_type(field)) {
- case VMCS_FIELD_TYPE_NATURAL_WIDTH:
- *ret = *((natural_width *)p);
- return 1;
- case VMCS_FIELD_TYPE_U16:
- *ret = *((u16 *)p);
- return 1;
- case VMCS_FIELD_TYPE_U32:
- *ret = *((u32 *)p);
- return 1;
- case VMCS_FIELD_TYPE_U64:
- *ret = *((u64 *)p);
- return 1;
- default:
- return 0; /* can never happen. */
- }
-}
-
-/*
- * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
- * used before) all generate the same failure when it is missing.
- */
-static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (vmx->nested.current_vmptr == -1ull) {
- nested_vmx_failInvalid(vcpu);
- skip_emulated_instruction(vcpu);
- return 0;
- }
- return 1;
-}
-
-static int handle_vmread(struct kvm_vcpu *vcpu)
-{
- unsigned long field;
- u64 field_value;
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- gva_t gva = 0;
-
- if (!nested_vmx_check_permission(vcpu) ||
- !nested_vmx_check_vmcs12(vcpu))
- return 1;
-
- /* Decode instruction info and find the field to read */
- field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
- /* Read the field, zero-extended to a u64 field_value */
- if (!vmcs12_read_any(vcpu, field, &field_value)) {
- nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
- skip_emulated_instruction(vcpu);
- return 1;
- }
- /*
- * Now copy part of this value to register or memory, as requested.
- * Note that the number of bits actually copied is 32 or 64 depending
- * on the guest's mode (32 or 64 bit), not on the given field's length.
- */
- if (vmx_instruction_info & (1u << 10)) {
- kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
- field_value);
- } else {
- if (get_vmx_mem_address(vcpu, exit_qualification,
- vmx_instruction_info, &gva))
- return 1;
- /* _system ok, as nested_vmx_check_permission verified cpl=0 */
- kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
- &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
- }
-
- nested_vmx_succeed(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
-}
-
-
-static int handle_vmwrite(struct kvm_vcpu *vcpu)
-{
- unsigned long field;
- gva_t gva;
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- char *p;
- short offset;
- /* The value to write might be 32 or 64 bits, depending on L1's long
- * mode, and eventually we need to write that into a field of several
- * possible lengths. The code below first zero-extends the value to 64
- * bit (field_value), and then copies only the approriate number of
- * bits into the vmcs12 field.
- */
- u64 field_value = 0;
- struct x86_exception e;
-
- if (!nested_vmx_check_permission(vcpu) ||
- !nested_vmx_check_vmcs12(vcpu))
- return 1;
-
- if (vmx_instruction_info & (1u << 10))
- field_value = kvm_register_read(vcpu,
- (((vmx_instruction_info) >> 3) & 0xf));
- else {
- if (get_vmx_mem_address(vcpu, exit_qualification,
- vmx_instruction_info, &gva))
- return 1;
- if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
- &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) {
- kvm_inject_page_fault(vcpu, &e);
- return 1;
- }
- }
-
-
- field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
- if (vmcs_field_readonly(field)) {
- nested_vmx_failValid(vcpu,
- VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
- skip_emulated_instruction(vcpu);
- return 1;
- }
-
- offset = vmcs_field_to_offset(field);
- if (offset < 0) {
- nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
- skip_emulated_instruction(vcpu);
- return 1;
- }
- p = ((char *) get_vmcs12(vcpu)) + offset;
-
- switch (vmcs_field_type(field)) {
- case VMCS_FIELD_TYPE_U16:
- *(u16 *)p = field_value;
- break;
- case VMCS_FIELD_TYPE_U32:
- *(u32 *)p = field_value;
- break;
- case VMCS_FIELD_TYPE_U64:
- *(u64 *)p = field_value;
- break;
- case VMCS_FIELD_TYPE_NATURAL_WIDTH:
- *(natural_width *)p = field_value;
- break;
- default:
- nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
- skip_emulated_instruction(vcpu);
- return 1;
- }
-
- nested_vmx_succeed(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
-}
-
-/* Emulate the VMPTRLD instruction */
-static int handle_vmptrld(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- gva_t gva;
- gpa_t vmptr;
- struct x86_exception e;
-
- if (!nested_vmx_check_permission(vcpu))
- return 1;
-
- if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
- vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
- return 1;
-
- if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
- sizeof(vmptr), &e)) {
- kvm_inject_page_fault(vcpu, &e);
- return 1;
- }
-
- if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
- nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
- skip_emulated_instruction(vcpu);
- return 1;
- }
-
- if (vmx->nested.current_vmptr != vmptr) {
- struct vmcs12 *new_vmcs12;
- struct page *page;
- page = nested_get_page(vcpu, vmptr);
- if (page == NULL) {
- nested_vmx_failInvalid(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
- }
- new_vmcs12 = kmap(page);
- if (new_vmcs12->revision_id != VMCS12_REVISION) {
- kunmap(page);
- nested_release_page_clean(page);
- nested_vmx_failValid(vcpu,
- VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
- skip_emulated_instruction(vcpu);
- return 1;
- }
- if (vmx->nested.current_vmptr != -1ull) {
- kunmap(vmx->nested.current_vmcs12_page);
- nested_release_page(vmx->nested.current_vmcs12_page);
- }
-
- vmx->nested.current_vmptr = vmptr;
- vmx->nested.current_vmcs12 = new_vmcs12;
- vmx->nested.current_vmcs12_page = page;
- }
-
- nested_vmx_succeed(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
-}
-
-/* Emulate the VMPTRST instruction */
-static int handle_vmptrst(struct kvm_vcpu *vcpu)
-{
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- gva_t vmcs_gva;
- struct x86_exception e;
-
- if (!nested_vmx_check_permission(vcpu))
- return 1;
-
- if (get_vmx_mem_address(vcpu, exit_qualification,
- vmx_instruction_info, &vmcs_gva))
- return 1;
- /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */
- if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
- (void *)&to_vmx(vcpu)->nested.current_vmptr,
- sizeof(u64), &e)) {
- kvm_inject_page_fault(vcpu, &e);
- return 1;
- }
- nested_vmx_succeed(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
-}
-
-/*
- * The exit handlers return 1 if the exit was handled fully and guest execution
- * may resume. Otherwise they set the kvm_run parameter to indicate what needs
- * to be done to userspace and return 0.
- */
-static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
- [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
- [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
- [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
- [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
- [EXIT_REASON_IO_INSTRUCTION] = handle_io,
- [EXIT_REASON_CR_ACCESS] = handle_cr,
- [EXIT_REASON_DR_ACCESS] = handle_dr,
- [EXIT_REASON_CPUID] = handle_cpuid,
- [EXIT_REASON_MSR_READ] = handle_rdmsr,
- [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
- [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
- [EXIT_REASON_HLT] = handle_halt,
- [EXIT_REASON_INVD] = handle_invd,
- [EXIT_REASON_INVLPG] = handle_invlpg,
- [EXIT_REASON_RDPMC] = handle_rdpmc,
- [EXIT_REASON_VMCALL] = handle_vmcall,
- [EXIT_REASON_VMCLEAR] = handle_vmclear,
- [EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
- [EXIT_REASON_VMPTRLD] = handle_vmptrld,
- [EXIT_REASON_VMPTRST] = handle_vmptrst,
- [EXIT_REASON_VMREAD] = handle_vmread,
- [EXIT_REASON_VMRESUME] = handle_vmresume,
- [EXIT_REASON_VMWRITE] = handle_vmwrite,
- [EXIT_REASON_VMOFF] = handle_vmoff,
- [EXIT_REASON_VMON] = handle_vmon,
- [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
- [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
- [EXIT_REASON_WBINVD] = handle_wbinvd,
- [EXIT_REASON_XSETBV] = handle_xsetbv,
- [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
- [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
- [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
- [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
- [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
- [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
- [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
-};
-
-static const int kvm_vmx_max_exit_handlers =
- ARRAY_SIZE(kvm_vmx_exit_handlers);
-
-/*
- * Return 1 if we should exit from L2 to L1 to handle an MSR access access,
- * rather than handle it ourselves in L0. I.e., check whether L1 expressed
- * disinterest in the current event (read or write a specific MSR) by using an
- * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
- */
-static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12, u32 exit_reason)
-{
- u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
- gpa_t bitmap;
-
- if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS))
- return 1;
-
- /*
- * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
- * for the four combinations of read/write and low/high MSR numbers.
- * First we need to figure out which of the four to use:
- */
- bitmap = vmcs12->msr_bitmap;
- if (exit_reason == EXIT_REASON_MSR_WRITE)
- bitmap += 2048;
- if (msr_index >= 0xc0000000) {
- msr_index -= 0xc0000000;
- bitmap += 1024;
- }
-
- /* Then read the msr_index'th bit from this bitmap: */
- if (msr_index < 1024*8) {
- unsigned char b;
- kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1);
- return 1 & (b >> (msr_index & 7));
- } else
- return 1; /* let L1 handle the wrong parameter */
-}
-
-/*
- * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
- * rather than handle it ourselves in L0. I.e., check if L1 wanted to
- * intercept (via guest_host_mask etc.) the current event.
- */
-static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12)
-{
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- int cr = exit_qualification & 15;
- int reg = (exit_qualification >> 8) & 15;
- unsigned long val = kvm_register_read(vcpu, reg);
-
- switch ((exit_qualification >> 4) & 3) {
- case 0: /* mov to cr */
- switch (cr) {
- case 0:
- if (vmcs12->cr0_guest_host_mask &
- (val ^ vmcs12->cr0_read_shadow))
- return 1;
- break;
- case 3:
- if ((vmcs12->cr3_target_count >= 1 &&
- vmcs12->cr3_target_value0 == val) ||
- (vmcs12->cr3_target_count >= 2 &&
- vmcs12->cr3_target_value1 == val) ||
- (vmcs12->cr3_target_count >= 3 &&
- vmcs12->cr3_target_value2 == val) ||
- (vmcs12->cr3_target_count >= 4 &&
- vmcs12->cr3_target_value3 == val))
- return 0;
- if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
- return 1;
- break;
- case 4:
- if (vmcs12->cr4_guest_host_mask &
- (vmcs12->cr4_read_shadow ^ val))
- return 1;
- break;
- case 8:
- if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
- return 1;
- break;
- }
- break;
- case 2: /* clts */
- if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
- (vmcs12->cr0_read_shadow & X86_CR0_TS))
- return 1;
- break;
- case 1: /* mov from cr */
- switch (cr) {
- case 3:
- if (vmcs12->cpu_based_vm_exec_control &
- CPU_BASED_CR3_STORE_EXITING)
- return 1;
- break;
- case 8:
- if (vmcs12->cpu_based_vm_exec_control &
- CPU_BASED_CR8_STORE_EXITING)
- return 1;
- break;
- }
- break;
- case 3: /* lmsw */
- /*
- * lmsw can change bits 1..3 of cr0, and only set bit 0 of
- * cr0. Other attempted changes are ignored, with no exit.
- */
- if (vmcs12->cr0_guest_host_mask & 0xe &
- (val ^ vmcs12->cr0_read_shadow))
- return 1;
- if ((vmcs12->cr0_guest_host_mask & 0x1) &&
- !(vmcs12->cr0_read_shadow & 0x1) &&
- (val & 0x1))
- return 1;
- break;
- }
- return 0;
-}
-
-/*
- * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
- * should handle it ourselves in L0 (and then continue L2). Only call this
- * when in is_guest_mode (L2).
- */
-static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
-{
- u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
- u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
- if (vmx->nested.nested_run_pending)
- return 0;
-
- if (unlikely(vmx->fail)) {
- pr_info_ratelimited("%s failed vm entry %x\n", __func__,
- vmcs_read32(VM_INSTRUCTION_ERROR));
- return 1;
- }
-
- switch (exit_reason) {
- case EXIT_REASON_EXCEPTION_NMI:
- if (!is_exception(intr_info))
- return 0;
- else if (is_page_fault(intr_info))
- return enable_ept;
- return vmcs12->exception_bitmap &
- (1u << (intr_info & INTR_INFO_VECTOR_MASK));
- case EXIT_REASON_EXTERNAL_INTERRUPT:
- return 0;
- case EXIT_REASON_TRIPLE_FAULT:
- return 1;
- case EXIT_REASON_PENDING_INTERRUPT:
- case EXIT_REASON_NMI_WINDOW:
- /*
- * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit
- * (aka Interrupt Window Exiting) only when L1 turned it on,
- * so if we got a PENDING_INTERRUPT exit, this must be for L1.
- * Same for NMI Window Exiting.
- */
- return 1;
- case EXIT_REASON_TASK_SWITCH:
- return 1;
- case EXIT_REASON_CPUID:
- return 1;
- case EXIT_REASON_HLT:
- return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
- case EXIT_REASON_INVD:
- return 1;
- case EXIT_REASON_INVLPG:
- return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
- case EXIT_REASON_RDPMC:
- return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
- case EXIT_REASON_RDTSC:
- return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
- case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
- case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
- case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
- case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
- case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
- /*
- * VMX instructions trap unconditionally. This allows L1 to
- * emulate them for its L2 guest, i.e., allows 3-level nesting!
- */
- return 1;
- case EXIT_REASON_CR_ACCESS:
- return nested_vmx_exit_handled_cr(vcpu, vmcs12);
- case EXIT_REASON_DR_ACCESS:
- return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
- case EXIT_REASON_IO_INSTRUCTION:
- /* TODO: support IO bitmaps */
- return 1;
- case EXIT_REASON_MSR_READ:
- case EXIT_REASON_MSR_WRITE:
- return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
- case EXIT_REASON_INVALID_STATE:
- return 1;
- case EXIT_REASON_MWAIT_INSTRUCTION:
- return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
- case EXIT_REASON_MONITOR_INSTRUCTION:
- return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
- case EXIT_REASON_PAUSE_INSTRUCTION:
- return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
- nested_cpu_has2(vmcs12,
- SECONDARY_EXEC_PAUSE_LOOP_EXITING);
- case EXIT_REASON_MCE_DURING_VMENTRY:
- return 0;
- case EXIT_REASON_TPR_BELOW_THRESHOLD:
- return 1;
- case EXIT_REASON_APIC_ACCESS:
- return nested_cpu_has2(vmcs12,
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
- case EXIT_REASON_EPT_VIOLATION:
- case EXIT_REASON_EPT_MISCONFIG:
- return 0;
- case EXIT_REASON_WBINVD:
- return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
- case EXIT_REASON_XSETBV:
- return 1;
- default:
- return 1;
- }
-}
-
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
-{
- *info1 = vmcs_readl(EXIT_QUALIFICATION);
- *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
-}
-
-/*
- * The guest has exited. See if we can fix it or if we need userspace
- * assistance.
- */
-static int vmx_handle_exit(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exit_reason = vmx->exit_reason;
- u32 vectoring_info = vmx->idt_vectoring_info;
-
- /* If guest state is invalid, start emulating */
- if (vmx->emulation_required && emulate_invalid_guest_state)
- return handle_invalid_guest_state(vcpu);
-
- /*
- * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
- * we did not inject a still-pending event to L1 now because of
- * nested_run_pending, we need to re-enable this bit.
- */
- if (vmx->nested.nested_run_pending)
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-
- if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH ||
- exit_reason == EXIT_REASON_VMRESUME))
- vmx->nested.nested_run_pending = 1;
- else
- vmx->nested.nested_run_pending = 0;
-
- if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
- nested_vmx_vmexit(vcpu);
- return 1;
- }
-
- if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
- vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
- vcpu->run->fail_entry.hardware_entry_failure_reason
- = exit_reason;
- return 0;
- }
-
- if (unlikely(vmx->fail)) {
- vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
- vcpu->run->fail_entry.hardware_entry_failure_reason
- = vmcs_read32(VM_INSTRUCTION_ERROR);
- return 0;
- }
-
- if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
- (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
- exit_reason != EXIT_REASON_EPT_VIOLATION &&
- exit_reason != EXIT_REASON_TASK_SWITCH))
- printk(KERN_WARNING "%s: unexpected, valid vectoring info "
- "(0x%x) and exit reason is 0x%x\n",
- __func__, vectoring_info, exit_reason);
-
- if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
- !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
- get_vmcs12(vcpu), vcpu)))) {
- if (vmx_interrupt_allowed(vcpu)) {
- vmx->soft_vnmi_blocked = 0;
- } else if (vmx->vnmi_blocked_time > 1000000000LL &&
- vcpu->arch.nmi_pending) {
- /*
- * This CPU don't support us in finding the end of an
- * NMI-blocked window if the guest runs with IRQs
- * disabled. So we pull the trigger after 1 s of
- * futile waiting, but inform the user about this.
- */
- printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
- "state on VCPU %d after 1 s timeout\n",
- __func__, vcpu->vcpu_id);
- vmx->soft_vnmi_blocked = 0;
- }
- }
-
- if (exit_reason < kvm_vmx_max_exit_handlers
- && kvm_vmx_exit_handlers[exit_reason])
- return kvm_vmx_exit_handlers[exit_reason](vcpu);
- else {
- vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
- vcpu->run->hw.hardware_exit_reason = exit_reason;
- }
- return 0;
-}
-
-static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
-{
- if (irr == -1 || tpr < irr) {
- vmcs_write32(TPR_THRESHOLD, 0);
- return;
- }
-
- vmcs_write32(TPR_THRESHOLD, irr);
-}
-
-static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
-{
- u32 exit_intr_info;
-
- if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
- || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI))
- return;
-
- vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
- exit_intr_info = vmx->exit_intr_info;
-
- /* Handle machine checks before interrupts are enabled */
- if (is_machine_check(exit_intr_info))
- kvm_machine_check();
-
- /* We need to handle NMIs before interrupts are enabled */
- if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
- (exit_intr_info & INTR_INFO_VALID_MASK)) {
- kvm_before_handle_nmi(&vmx->vcpu);
- asm("int $2");
- kvm_after_handle_nmi(&vmx->vcpu);
- }
-}
-
-static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
-{
- u32 exit_intr_info;
- bool unblock_nmi;
- u8 vector;
- bool idtv_info_valid;
-
- idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
-
- if (cpu_has_virtual_nmis()) {
- if (vmx->nmi_known_unmasked)
- return;
- /*
- * Can't use vmx->exit_intr_info since we're not sure what
- * the exit reason is.
- */
- exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
- unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
- vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
- /*
- * SDM 3: 27.7.1.2 (September 2008)
- * Re-set bit "block by NMI" before VM entry if vmexit caused by
- * a guest IRET fault.
- * SDM 3: 23.2.2 (September 2008)
- * Bit 12 is undefined in any of the following cases:
- * If the VM exit sets the valid bit in the IDT-vectoring
- * information field.
- * If the VM exit is due to a double fault.
- */
- if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
- vector != DF_VECTOR && !idtv_info_valid)
- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
- else
- vmx->nmi_known_unmasked =
- !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
- & GUEST_INTR_STATE_NMI);
- } else if (unlikely(vmx->soft_vnmi_blocked))
- vmx->vnmi_blocked_time +=
- ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
-}
-
-static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
- u32 idt_vectoring_info,
- int instr_len_field,
- int error_code_field)
-{
- u8 vector;
- int type;
- bool idtv_info_valid;
-
- idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
-
- vmx->vcpu.arch.nmi_injected = false;
- kvm_clear_exception_queue(&vmx->vcpu);
- kvm_clear_interrupt_queue(&vmx->vcpu);
-
- if (!idtv_info_valid)
- return;
-
- kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
-
- vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
- type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
-
- switch (type) {
- case INTR_TYPE_NMI_INTR:
- vmx->vcpu.arch.nmi_injected = true;
- /*
- * SDM 3: 27.7.1.2 (September 2008)
- * Clear bit "block by NMI" before VM entry if a NMI
- * delivery faulted.
- */
- vmx_set_nmi_mask(&vmx->vcpu, false);
- break;
- case INTR_TYPE_SOFT_EXCEPTION:
- vmx->vcpu.arch.event_exit_inst_len =
- vmcs_read32(instr_len_field);
- /* fall through */
- case INTR_TYPE_HARD_EXCEPTION:
- if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
- u32 err = vmcs_read32(error_code_field);
- kvm_queue_exception_e(&vmx->vcpu, vector, err);
- } else
- kvm_queue_exception(&vmx->vcpu, vector);
- break;
- case INTR_TYPE_SOFT_INTR:
- vmx->vcpu.arch.event_exit_inst_len =
- vmcs_read32(instr_len_field);
- /* fall through */
- case INTR_TYPE_EXT_INTR:
- kvm_queue_interrupt(&vmx->vcpu, vector,
- type == INTR_TYPE_SOFT_INTR);
- break;
- default:
- break;
- }
-}
-
-static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
-{
- if (is_guest_mode(&vmx->vcpu))
- return;
- __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
- VM_EXIT_INSTRUCTION_LEN,
- IDT_VECTORING_ERROR_CODE);
-}
-
-static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
-{
- if (is_guest_mode(vcpu))
- return;
- __vmx_complete_interrupts(to_vmx(vcpu),
- vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
- VM_ENTRY_INSTRUCTION_LEN,
- VM_ENTRY_EXCEPTION_ERROR_CODE);
-
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
-}
-
-static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
-{
- int i, nr_msrs;
- struct perf_guest_switch_msr *msrs;
-
- msrs = perf_guest_get_msrs(&nr_msrs);
-
- if (!msrs)
- return;
-
- for (i = 0; i < nr_msrs; i++)
- if (msrs[i].host == msrs[i].guest)
- clear_atomic_switch_msr(vmx, msrs[i].msr);
- else
- add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
- msrs[i].host);
-}
-
-#ifdef CONFIG_X86_64
-#define R "r"
-#define Q "q"
-#else
-#define R "e"
-#define Q "l"
-#endif
-
-static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- if (vmcs12->idt_vectoring_info_field &
- VECTORING_INFO_VALID_MASK) {
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- vmcs12->idt_vectoring_info_field);
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
- vmcs12->vm_exit_instruction_len);
- if (vmcs12->idt_vectoring_info_field &
- VECTORING_INFO_DELIVER_CODE_MASK)
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
- vmcs12->idt_vectoring_error_code);
- }
- }
-
- /* Record the guest's net vcpu time for enforced NMI injections. */
- if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
- vmx->entry_time = ktime_get();
-
- /* Don't enter VMX if guest state is invalid, let the exit handler
- start emulation until we arrive back to a valid state */
- if (vmx->emulation_required && emulate_invalid_guest_state)
- return;
-
- if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
- vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
- if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
- vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
-
- /* When single-stepping over STI and MOV SS, we must clear the
- * corresponding interruptibility bits in the guest state. Otherwise
- * vmentry fails as it then expects bit 14 (BS) in pending debug
- * exceptions being set, but that's not correct for the guest debugging
- * case. */
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- vmx_set_interrupt_shadow(vcpu, 0);
-
- atomic_switch_perf_msrs(vmx);
-
- vmx->__launched = vmx->loaded_vmcs->launched;
- asm(
- /* Store host registers */
- "push %%"R"dx; push %%"R"bp;"
- "push %%"R"cx \n\t" /* placeholder for guest rcx */
- "push %%"R"cx \n\t"
- "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
- "je 1f \n\t"
- "mov %%"R"sp, %c[host_rsp](%0) \n\t"
- __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
- "1: \n\t"
- /* Reload cr2 if changed */
- "mov %c[cr2](%0), %%"R"ax \n\t"
- "mov %%cr2, %%"R"dx \n\t"
- "cmp %%"R"ax, %%"R"dx \n\t"
- "je 2f \n\t"
- "mov %%"R"ax, %%cr2 \n\t"
- "2: \n\t"
- /* Check if vmlaunch of vmresume is needed */
- "cmpl $0, %c[launched](%0) \n\t"
- /* Load guest registers. Don't clobber flags. */
- "mov %c[rax](%0), %%"R"ax \n\t"
- "mov %c[rbx](%0), %%"R"bx \n\t"
- "mov %c[rdx](%0), %%"R"dx \n\t"
- "mov %c[rsi](%0), %%"R"si \n\t"
- "mov %c[rdi](%0), %%"R"di \n\t"
- "mov %c[rbp](%0), %%"R"bp \n\t"
-#ifdef CONFIG_X86_64
- "mov %c[r8](%0), %%r8 \n\t"
- "mov %c[r9](%0), %%r9 \n\t"
- "mov %c[r10](%0), %%r10 \n\t"
- "mov %c[r11](%0), %%r11 \n\t"
- "mov %c[r12](%0), %%r12 \n\t"
- "mov %c[r13](%0), %%r13 \n\t"
- "mov %c[r14](%0), %%r14 \n\t"
- "mov %c[r15](%0), %%r15 \n\t"
-#endif
- "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
-
- /* Enter guest mode */
- "jne .Llaunched \n\t"
- __ex(ASM_VMX_VMLAUNCH) "\n\t"
- "jmp .Lkvm_vmx_return \n\t"
- ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
- ".Lkvm_vmx_return: "
- /* Save guest registers, load host registers, keep flags */
- "mov %0, %c[wordsize](%%"R"sp) \n\t"
- "pop %0 \n\t"
- "mov %%"R"ax, %c[rax](%0) \n\t"
- "mov %%"R"bx, %c[rbx](%0) \n\t"
- "pop"Q" %c[rcx](%0) \n\t"
- "mov %%"R"dx, %c[rdx](%0) \n\t"
- "mov %%"R"si, %c[rsi](%0) \n\t"
- "mov %%"R"di, %c[rdi](%0) \n\t"
- "mov %%"R"bp, %c[rbp](%0) \n\t"
-#ifdef CONFIG_X86_64
- "mov %%r8, %c[r8](%0) \n\t"
- "mov %%r9, %c[r9](%0) \n\t"
- "mov %%r10, %c[r10](%0) \n\t"
- "mov %%r11, %c[r11](%0) \n\t"
- "mov %%r12, %c[r12](%0) \n\t"
- "mov %%r13, %c[r13](%0) \n\t"
- "mov %%r14, %c[r14](%0) \n\t"
- "mov %%r15, %c[r15](%0) \n\t"
-#endif
- "mov %%cr2, %%"R"ax \n\t"
- "mov %%"R"ax, %c[cr2](%0) \n\t"
-
- "pop %%"R"bp; pop %%"R"dx \n\t"
- "setbe %c[fail](%0) \n\t"
- : : "c"(vmx), "d"((unsigned long)HOST_RSP),
- [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
- [fail]"i"(offsetof(struct vcpu_vmx, fail)),
- [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
- [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
- [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
- [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
- [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
- [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
- [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
- [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
-#ifdef CONFIG_X86_64
- [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
- [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
- [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
- [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
- [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
- [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
- [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
- [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
-#endif
- [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
- [wordsize]"i"(sizeof(ulong))
- : "cc", "memory"
- , R"ax", R"bx", R"di", R"si"
-#ifdef CONFIG_X86_64
- , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
-#endif
- );
-
- vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
- | (1 << VCPU_EXREG_RFLAGS)
- | (1 << VCPU_EXREG_CPL)
- | (1 << VCPU_EXREG_PDPTR)
- | (1 << VCPU_EXREG_SEGMENTS)
- | (1 << VCPU_EXREG_CR3));
- vcpu->arch.regs_dirty = 0;
-
- vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-
- if (is_guest_mode(vcpu)) {
- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
- if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
- vmcs12->idt_vectoring_error_code =
- vmcs_read32(IDT_VECTORING_ERROR_CODE);
- vmcs12->vm_exit_instruction_len =
- vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
- }
- }
-
- asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
- vmx->loaded_vmcs->launched = 1;
-
- vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
- trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
-
- vmx_complete_atomic_exit(vmx);
- vmx_recover_nmi_blocking(vmx);
- vmx_complete_interrupts(vmx);
-}
-
-#undef R
-#undef Q
-
-static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- free_vpid(vmx);
- free_nested(vmx);
- free_loaded_vmcs(vmx->loaded_vmcs);
- kfree(vmx->guest_msrs);
- kvm_vcpu_uninit(vcpu);
- kmem_cache_free(kvm_vcpu_cache, vmx);
-}
-
-static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
-{
- int err;
- struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
- int cpu;
-
- if (!vmx)
- return ERR_PTR(-ENOMEM);
-
- allocate_vpid(vmx);
-
- err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
- if (err)
- goto free_vcpu;
-
- vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
- err = -ENOMEM;
- if (!vmx->guest_msrs) {
- goto uninit_vcpu;
- }
-
- vmx->loaded_vmcs = &vmx->vmcs01;
- vmx->loaded_vmcs->vmcs = alloc_vmcs();
- if (!vmx->loaded_vmcs->vmcs)
- goto free_msrs;
- if (!vmm_exclusive)
- kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
- loaded_vmcs_init(vmx->loaded_vmcs);
- if (!vmm_exclusive)
- kvm_cpu_vmxoff();
-
- cpu = get_cpu();
- vmx_vcpu_load(&vmx->vcpu, cpu);
- vmx->vcpu.cpu = cpu;
- err = vmx_vcpu_setup(vmx);
- vmx_vcpu_put(&vmx->vcpu);
- put_cpu();
- if (err)
- goto free_vmcs;
- if (vm_need_virtualize_apic_accesses(kvm))
- err = alloc_apic_access_page(kvm);
- if (err)
- goto free_vmcs;
-
- if (enable_ept) {
- if (!kvm->arch.ept_identity_map_addr)
- kvm->arch.ept_identity_map_addr =
- VMX_EPT_IDENTITY_PAGETABLE_ADDR;
- err = -ENOMEM;
- if (alloc_identity_pagetable(kvm) != 0)
- goto free_vmcs;
- if (!init_rmode_identity_map(kvm))
- goto free_vmcs;
- }
-
- vmx->nested.current_vmptr = -1ull;
- vmx->nested.current_vmcs12 = NULL;
-
- return &vmx->vcpu;
-
-free_vmcs:
- free_vmcs(vmx->loaded_vmcs->vmcs);
-free_msrs:
- kfree(vmx->guest_msrs);
-uninit_vcpu:
- kvm_vcpu_uninit(&vmx->vcpu);
-free_vcpu:
- free_vpid(vmx);
- kmem_cache_free(kvm_vcpu_cache, vmx);
- return ERR_PTR(err);
-}
-
-static void __init vmx_check_processor_compat(void *rtn)
-{
- struct vmcs_config vmcs_conf;
-
- *(int *)rtn = 0;
- if (setup_vmcs_config(&vmcs_conf) < 0)
- *(int *)rtn = -EIO;
- if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
- printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
- smp_processor_id());
- *(int *)rtn = -EIO;
- }
-}
-
-static int get_ept_level(void)
-{
- return VMX_EPT_DEFAULT_GAW + 1;
-}
-
-static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
-{
- u64 ret;
-
- /* For VT-d and EPT combination
- * 1. MMIO: always map as UC
- * 2. EPT with VT-d:
- * a. VT-d without snooping control feature: can't guarantee the
- * result, try to trust guest.
- * b. VT-d with snooping control feature: snooping control feature of
- * VT-d engine can guarantee the cache correctness. Just set it
- * to WB to keep consistent with host. So the same as item 3.
- * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
- * consistent with host MTRR
- */
- if (is_mmio)
- ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
- else if (vcpu->kvm->arch.iommu_domain &&
- !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
- ret = kvm_get_guest_memory_type(vcpu, gfn) <<
- VMX_EPT_MT_EPTE_SHIFT;
- else
- ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
- | VMX_EPT_IPAT_BIT;
-
- return ret;
-}
-
-static int vmx_get_lpage_level(void)
-{
- if (enable_ept && !cpu_has_vmx_ept_1g_page())
- return PT_DIRECTORY_LEVEL;
- else
- /* For shadow and EPT supported 1GB page */
- return PT_PDPE_LEVEL;
-}
-
-static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exec_control;
-
- vmx->rdtscp_enabled = false;
- if (vmx_rdtscp_supported()) {
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- if (exec_control & SECONDARY_EXEC_RDTSCP) {
- best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
- vmx->rdtscp_enabled = true;
- else {
- exec_control &= ~SECONDARY_EXEC_RDTSCP;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
- exec_control);
- }
- }
- }
-}
-
-static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
-{
- if (func == 1 && nested)
- entry->ecx |= bit(X86_FEATURE_VMX);
-}
-
-/*
- * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
- * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
- * with L0's requirements for its guest (a.k.a. vmsc01), so we can run the L2
- * guest in a way that will both be appropriate to L1's requests, and our
- * needs. In addition to modifying the active vmcs (which is vmcs02), this
- * function also has additional necessary side-effects, like setting various
- * vcpu->arch fields.
- */
-static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exec_control;
-
- vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
- vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
- vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
- vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
- vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
- vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
- vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
- vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
- vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
- vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
- vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
- vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
- vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
- vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
- vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
- vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
- vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
- vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
- vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
- vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
- vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
- vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
- vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
- vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
- vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
- vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
- vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
- vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
- vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
- vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
- vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
- vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
- vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
- vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
- vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
- vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
-
- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- vmcs12->vm_entry_intr_info_field);
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
- vmcs12->vm_entry_exception_error_code);
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
- vmcs12->vm_entry_instruction_len);
- vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
- vmcs12->guest_interruptibility_info);
- vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state);
- vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
- vmcs_writel(GUEST_DR7, vmcs12->guest_dr7);
- vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
- vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
- vmcs12->guest_pending_dbg_exceptions);
- vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
- vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
-
- vmcs_write64(VMCS_LINK_POINTER, -1ull);
-
- vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
- (vmcs_config.pin_based_exec_ctrl |
- vmcs12->pin_based_vm_exec_control));
-
- /*
- * Whether page-faults are trapped is determined by a combination of
- * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
- * If enable_ept, L0 doesn't care about page faults and we should
- * set all of these to L1's desires. However, if !enable_ept, L0 does
- * care about (at least some) page faults, and because it is not easy
- * (if at all possible?) to merge L0 and L1's desires, we simply ask
- * to exit on each and every L2 page fault. This is done by setting
- * MASK=MATCH=0 and (see below) EB.PF=1.
- * Note that below we don't need special code to set EB.PF beyond the
- * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
- * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
- * !enable_ept, EB.PF is 1, so the "or" will always be 1.
- *
- * A problem with this approach (when !enable_ept) is that L1 may be
- * injected with more page faults than it asked for. This could have
- * caused problems, but in practice existing hypervisors don't care.
- * To fix this, we will need to emulate the PFEC checking (on the L1
- * page tables), using walk_addr(), when injecting PFs to L1.
- */
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
- enable_ept ? vmcs12->page_fault_error_code_mask : 0);
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
- enable_ept ? vmcs12->page_fault_error_code_match : 0);
-
- if (cpu_has_secondary_exec_ctrls()) {
- u32 exec_control = vmx_secondary_exec_control(vmx);
- if (!vmx->rdtscp_enabled)
- exec_control &= ~SECONDARY_EXEC_RDTSCP;
- /* Take the following fields only from vmcs12 */
- exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
- if (nested_cpu_has(vmcs12,
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
- exec_control |= vmcs12->secondary_vm_exec_control;
-
- if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
- /*
- * Translate L1 physical address to host physical
- * address for vmcs02. Keep the page pinned, so this
- * physical address remains valid. We keep a reference
- * to it so we can release it later.
- */
- if (vmx->nested.apic_access_page) /* shouldn't happen */
- nested_release_page(vmx->nested.apic_access_page);
- vmx->nested.apic_access_page =
- nested_get_page(vcpu, vmcs12->apic_access_addr);
- /*
- * If translation failed, no matter: This feature asks
- * to exit when accessing the given address, and if it
- * can never be accessed, this feature won't do
- * anything anyway.
- */
- if (!vmx->nested.apic_access_page)
- exec_control &=
- ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
- else
- vmcs_write64(APIC_ACCESS_ADDR,
- page_to_phys(vmx->nested.apic_access_page));
- }
-
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
- }
-
-
- /*
- * Set host-state according to L0's settings (vmcs12 is irrelevant here)
- * Some constant fields are set here by vmx_set_constant_host_state().
- * Other fields are different per CPU, and will be set later when
- * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
- */
- vmx_set_constant_host_state();
-
- /*
- * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
- * entry, but only if the current (host) sp changed from the value
- * we wrote last (vmx->host_rsp). This cache is no longer relevant
- * if we switch vmcs, and rather than hold a separate cache per vmcs,
- * here we just force the write to happen on entry.
- */
- vmx->host_rsp = 0;
-
- exec_control = vmx_exec_control(vmx); /* L0's desires */
- exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
- exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
- exec_control &= ~CPU_BASED_TPR_SHADOW;
- exec_control |= vmcs12->cpu_based_vm_exec_control;
- /*
- * Merging of IO and MSR bitmaps not currently supported.
- * Rather, exit every time.
- */
- exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
- exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
- exec_control |= CPU_BASED_UNCOND_IO_EXITING;
-
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
-
- /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
- * bitwise-or of what L1 wants to trap for L2, and what we want to
- * trap. Note that CR0.TS also needs updating - we do this later.
- */
- update_exception_bitmap(vcpu);
- vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
- vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
-
- /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
- vmcs_write32(VM_EXIT_CONTROLS,
- vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
- vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
- (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
-
- if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
- vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
- else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
- vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
-
-
- set_cr4_guest_host_mask(vmx);
-
- if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
- vmcs_write64(TSC_OFFSET,
- vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
- else
- vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
-
- if (enable_vpid) {
- /*
- * Trivially support vpid by letting L2s share their parent
- * L1's vpid. TODO: move to a more elaborate solution, giving
- * each L2 its own vpid and exposing the vpid feature to L1.
- */
- vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
- vmx_flush_tlb(vcpu);
- }
-
- if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
- vcpu->arch.efer = vmcs12->guest_ia32_efer;
- if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
- vcpu->arch.efer |= (EFER_LMA | EFER_LME);
- else
- vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
- /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
- vmx_set_efer(vcpu, vcpu->arch.efer);
-
- /*
- * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
- * TS bit (for lazy fpu) and bits which we consider mandatory enabled.
- * The CR0_READ_SHADOW is what L2 should have expected to read given
- * the specifications by L1; It's not enough to take
- * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
- * have more bits than L1 expected.
- */
- vmx_set_cr0(vcpu, vmcs12->guest_cr0);
- vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
-
- vmx_set_cr4(vcpu, vmcs12->guest_cr4);
- vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
-
- /* shadow page tables on either EPT or shadow page tables */
- kvm_set_cr3(vcpu, vmcs12->guest_cr3);
- kvm_mmu_reset_context(vcpu);
-
- kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
- kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
-}
-
-/*
- * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
- * for running an L2 nested guest.
- */
-static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
-{
- struct vmcs12 *vmcs12;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- int cpu;
- struct loaded_vmcs *vmcs02;
-
- if (!nested_vmx_check_permission(vcpu) ||
- !nested_vmx_check_vmcs12(vcpu))
- return 1;
-
- skip_emulated_instruction(vcpu);
- vmcs12 = get_vmcs12(vcpu);
-
- /*
- * The nested entry process starts with enforcing various prerequisites
- * on vmcs12 as required by the Intel SDM, and act appropriately when
- * they fail: As the SDM explains, some conditions should cause the
- * instruction to fail, while others will cause the instruction to seem
- * to succeed, but return an EXIT_REASON_INVALID_STATE.
- * To speed up the normal (success) code path, we should avoid checking
- * for misconfigurations which will anyway be caught by the processor
- * when using the merged vmcs02.
- */
- if (vmcs12->launch_state == launch) {
- nested_vmx_failValid(vcpu,
- launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
- : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
- return 1;
- }
-
- if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
- !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) {
- /*TODO: Also verify bits beyond physical address width are 0*/
- nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- return 1;
- }
-
- if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
- !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) {
- /*TODO: Also verify bits beyond physical address width are 0*/
- nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- return 1;
- }
-
- if (vmcs12->vm_entry_msr_load_count > 0 ||
- vmcs12->vm_exit_msr_load_count > 0 ||
- vmcs12->vm_exit_msr_store_count > 0) {
- pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n",
- __func__);
- nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- return 1;
- }
-
- if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
- nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) ||
- !vmx_control_verify(vmcs12->secondary_vm_exec_control,
- nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
- !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
- nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
- !vmx_control_verify(vmcs12->vm_exit_controls,
- nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) ||
- !vmx_control_verify(vmcs12->vm_entry_controls,
- nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high))
- {
- nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- return 1;
- }
-
- if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
- ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
- nested_vmx_failValid(vcpu,
- VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
- return 1;
- }
-
- if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
- ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
- nested_vmx_entry_failure(vcpu, vmcs12,
- EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
- return 1;
- }
- if (vmcs12->vmcs_link_pointer != -1ull) {
- nested_vmx_entry_failure(vcpu, vmcs12,
- EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
- return 1;
- }
-
- /*
- * We're finally done with prerequisite checking, and can start with
- * the nested entry.
- */
-
- vmcs02 = nested_get_current_vmcs02(vmx);
- if (!vmcs02)
- return -ENOMEM;
-
- enter_guest_mode(vcpu);
-
- vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
-
- cpu = get_cpu();
- vmx->loaded_vmcs = vmcs02;
- vmx_vcpu_put(vcpu);
- vmx_vcpu_load(vcpu, cpu);
- vcpu->cpu = cpu;
- put_cpu();
-
- vmcs12->launch_state = 1;
-
- prepare_vmcs02(vcpu, vmcs12);
-
- /*
- * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
- * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
- * returned as far as L1 is concerned. It will only return (and set
- * the success flag) when L2 exits (see nested_vmx_vmexit()).
- */
- return 1;
-}
-
-/*
- * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
- * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK).
- * This function returns the new value we should put in vmcs12.guest_cr0.
- * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
- * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
- * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
- * didn't trap the bit, because if L1 did, so would L0).
- * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
- * been modified by L2, and L1 knows it. So just leave the old value of
- * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
- * isn't relevant, because if L0 traps this bit it can set it to anything.
- * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
- * changed these bits, and therefore they need to be updated, but L0
- * didn't necessarily allow them to be changed in GUEST_CR0 - and rather
- * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
- */
-static inline unsigned long
-vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
-{
- return
- /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
- /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
- /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
- vcpu->arch.cr0_guest_owned_bits));
-}
-
-static inline unsigned long
-vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
-{
- return
- /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
- /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
- /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
- vcpu->arch.cr4_guest_owned_bits));
-}
-
-/*
- * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
- * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
- * and this function updates it to reflect the changes to the guest state while
- * L2 was running (and perhaps made some exits which were handled directly by L0
- * without going back to L1), and to reflect the exit reason.
- * Note that we do not have to copy here all VMCS fields, just those that
- * could have changed by the L2 guest or the exit - i.e., the guest-state and
- * exit-information fields only. Other fields are modified by L1 with VMWRITE,
- * which already writes to vmcs12 directly.
- */
-void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
-{
- /* update guest state fields: */
- vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
- vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
-
- kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
- vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
- vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
- vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
-
- vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
- vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
- vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
- vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
- vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
- vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
- vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
- vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
- vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
- vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
- vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
- vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
- vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
- vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
- vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
- vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
- vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
- vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
- vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
- vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
- vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
- vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
- vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
- vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
- vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
- vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
- vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
- vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
- vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
- vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
- vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
- vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
- vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
- vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
- vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
- vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
-
- vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
- vmcs12->guest_interruptibility_info =
- vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
- vmcs12->guest_pending_dbg_exceptions =
- vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
-
- /* TODO: These cannot have changed unless we have MSR bitmaps and
- * the relevant bit asks not to trap the change */
- vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
- if (vmcs12->vm_entry_controls & VM_EXIT_SAVE_IA32_PAT)
- vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
- vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
- vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
- vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
-
- /* update exit information fields: */
-
- vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON);
- vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-
- vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
- vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
- vmcs12->idt_vectoring_info_field =
- vmcs_read32(IDT_VECTORING_INFO_FIELD);
- vmcs12->idt_vectoring_error_code =
- vmcs_read32(IDT_VECTORING_ERROR_CODE);
- vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
- vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
-
- /* clear vm-entry fields which are to be cleared on exit */
- if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
- vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
-}
-
-/*
- * A part of what we need to when the nested L2 guest exits and we want to
- * run its L1 parent, is to reset L1's guest state to the host state specified
- * in vmcs12.
- * This function is to be called not only on normal nested exit, but also on
- * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
- * Failures During or After Loading Guest State").
- * This function should be called when the active VMCS is L1's (vmcs01).
- */
-void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
-{
- if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
- vcpu->arch.efer = vmcs12->host_ia32_efer;
- if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
- vcpu->arch.efer |= (EFER_LMA | EFER_LME);
- else
- vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
- vmx_set_efer(vcpu, vcpu->arch.efer);
-
- kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
- kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
- /*
- * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
- * actually changed, because it depends on the current state of
- * fpu_active (which may have changed).
- * Note that vmx_set_cr0 refers to efer set above.
- */
- kvm_set_cr0(vcpu, vmcs12->host_cr0);
- /*
- * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
- * to apply the same changes to L1's vmcs. We just set cr0 correctly,
- * but we also need to update cr0_guest_host_mask and exception_bitmap.
- */
- update_exception_bitmap(vcpu);
- vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
- vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
-
- /*
- * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01
- * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask();
- */
- vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
- kvm_set_cr4(vcpu, vmcs12->host_cr4);
-
- /* shadow page tables on either EPT or shadow page tables */
- kvm_set_cr3(vcpu, vmcs12->host_cr3);
- kvm_mmu_reset_context(vcpu);
-
- if (enable_vpid) {
- /*
- * Trivially support vpid by letting L2s share their parent
- * L1's vpid. TODO: move to a more elaborate solution, giving
- * each L2 its own vpid and exposing the vpid feature to L1.
- */
- vmx_flush_tlb(vcpu);
- }
-
-
- vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
- vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
- vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
- vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
- vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
- vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base);
- vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base);
- vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
- vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
- vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
- vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
- vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
- vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
- vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
- vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);
-
- if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
- vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
- if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
- vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
- vmcs12->host_ia32_perf_global_ctrl);
-}
-
-/*
- * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
- * and modify vmcs12 to make it see what it would expect to see there if
- * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
- */
-static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- int cpu;
- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
- leave_guest_mode(vcpu);
- prepare_vmcs12(vcpu, vmcs12);
-
- cpu = get_cpu();
- vmx->loaded_vmcs = &vmx->vmcs01;
- vmx_vcpu_put(vcpu);
- vmx_vcpu_load(vcpu, cpu);
- vcpu->cpu = cpu;
- put_cpu();
-
- /* if no vmcs02 cache requested, remove the one we used */
- if (VMCS02_POOL_SIZE == 0)
- nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
-
- load_vmcs12_host_state(vcpu, vmcs12);
-
- /* Update TSC_OFFSET if TSC was changed while L2 ran */
- vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
-
- /* This is needed for same reason as it was needed in prepare_vmcs02 */
- vmx->host_rsp = 0;
-
- /* Unpin physical memory we referred to in vmcs02 */
- if (vmx->nested.apic_access_page) {
- nested_release_page(vmx->nested.apic_access_page);
- vmx->nested.apic_access_page = 0;
- }
-
- /*
- * Exiting from L2 to L1, we're now back to L1 which thinks it just
- * finished a VMLAUNCH or VMRESUME instruction, so we need to set the
- * success or failure flag accordingly.
- */
- if (unlikely(vmx->fail)) {
- vmx->fail = 0;
- nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
- } else
- nested_vmx_succeed(vcpu);
-}
-
-/*
- * L1's failure to enter L2 is a subset of a normal exit, as explained in
- * 23.7 "VM-entry failures during or after loading guest state" (this also
- * lists the acceptable exit-reason and exit-qualification parameters).
- * It should only be called before L2 actually succeeded to run, and when
- * vmcs01 is current (it doesn't leave_guest_mode() or switch vmcss).
- */
-static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12,
- u32 reason, unsigned long qualification)
-{
- load_vmcs12_host_state(vcpu, vmcs12);
- vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
- vmcs12->exit_qualification = qualification;
- nested_vmx_succeed(vcpu);
-}
-
-static int vmx_check_intercept(struct kvm_vcpu *vcpu,
- struct x86_instruction_info *info,
- enum x86_intercept_stage stage)
-{
- return X86EMUL_CONTINUE;
-}
-
-static struct kvm_x86_ops vmx_x86_ops = {
- .cpu_has_kvm_support = cpu_has_kvm_support,
- .disabled_by_bios = vmx_disabled_by_bios,
- .hardware_setup = hardware_setup,
- .hardware_unsetup = hardware_unsetup,
- .check_processor_compatibility = vmx_check_processor_compat,
- .hardware_enable = hardware_enable,
- .hardware_disable = hardware_disable,
- .cpu_has_accelerated_tpr = report_flexpriority,
-
- .vcpu_create = vmx_create_vcpu,
- .vcpu_free = vmx_free_vcpu,
- .vcpu_reset = vmx_vcpu_reset,
-
- .prepare_guest_switch = vmx_save_host_state,
- .vcpu_load = vmx_vcpu_load,
- .vcpu_put = vmx_vcpu_put,
-
- .set_guest_debug = set_guest_debug,
- .get_msr = vmx_get_msr,
- .set_msr = vmx_set_msr,
- .get_segment_base = vmx_get_segment_base,
- .get_segment = vmx_get_segment,
- .set_segment = vmx_set_segment,
- .get_cpl = vmx_get_cpl,
- .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
- .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
- .decache_cr3 = vmx_decache_cr3,
- .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
- .set_cr0 = vmx_set_cr0,
- .set_cr3 = vmx_set_cr3,
- .set_cr4 = vmx_set_cr4,
- .set_efer = vmx_set_efer,
- .get_idt = vmx_get_idt,
- .set_idt = vmx_set_idt,
- .get_gdt = vmx_get_gdt,
- .set_gdt = vmx_set_gdt,
- .set_dr7 = vmx_set_dr7,
- .cache_reg = vmx_cache_reg,
- .get_rflags = vmx_get_rflags,
- .set_rflags = vmx_set_rflags,
- .fpu_activate = vmx_fpu_activate,
- .fpu_deactivate = vmx_fpu_deactivate,
-
- .tlb_flush = vmx_flush_tlb,
-
- .run = vmx_vcpu_run,
- .handle_exit = vmx_handle_exit,
- .skip_emulated_instruction = skip_emulated_instruction,
- .set_interrupt_shadow = vmx_set_interrupt_shadow,
- .get_interrupt_shadow = vmx_get_interrupt_shadow,
- .patch_hypercall = vmx_patch_hypercall,
- .set_irq = vmx_inject_irq,
- .set_nmi = vmx_inject_nmi,
- .queue_exception = vmx_queue_exception,
- .cancel_injection = vmx_cancel_injection,
- .interrupt_allowed = vmx_interrupt_allowed,
- .nmi_allowed = vmx_nmi_allowed,
- .get_nmi_mask = vmx_get_nmi_mask,
- .set_nmi_mask = vmx_set_nmi_mask,
- .enable_nmi_window = enable_nmi_window,
- .enable_irq_window = enable_irq_window,
- .update_cr8_intercept = update_cr8_intercept,
-
- .set_tss_addr = vmx_set_tss_addr,
- .get_tdp_level = get_ept_level,
- .get_mt_mask = vmx_get_mt_mask,
-
- .get_exit_info = vmx_get_exit_info,
-
- .get_lpage_level = vmx_get_lpage_level,
-
- .cpuid_update = vmx_cpuid_update,
-
- .rdtscp_supported = vmx_rdtscp_supported,
-
- .set_supported_cpuid = vmx_set_supported_cpuid,
-
- .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
-
- .set_tsc_khz = vmx_set_tsc_khz,
- .write_tsc_offset = vmx_write_tsc_offset,
- .adjust_tsc_offset = vmx_adjust_tsc_offset,
- .compute_tsc_offset = vmx_compute_tsc_offset,
- .read_l1_tsc = vmx_read_l1_tsc,
-
- .set_tdp_cr3 = vmx_set_cr3,
-
- .check_intercept = vmx_check_intercept,
-};
-
-static int __init vmx_init(void)
-{
- int r, i;
-
- rdmsrl_safe(MSR_EFER, &host_efer);
-
- for (i = 0; i < NR_VMX_MSR; ++i)
- kvm_define_shared_msr(i, vmx_msr_index[i]);
-
- vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_io_bitmap_a)
- return -ENOMEM;
-
- vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_io_bitmap_b) {
- r = -ENOMEM;
- goto out;
- }
-
- vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_legacy) {
- r = -ENOMEM;
- goto out1;
- }
-
- vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_longmode) {
- r = -ENOMEM;
- goto out2;
- }
-
- /*
- * Allow direct access to the PC debug port (it is often used for I/O
- * delays, but the vmexits simply slow things down).
- */
- memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
- clear_bit(0x80, vmx_io_bitmap_a);
-
- memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
-
- memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
- memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
-
- set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
-
- r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
- if (r)
- goto out3;
-
- vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
- vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
- vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
-
- if (enable_ept) {
- kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
- VMX_EPT_EXECUTABLE_MASK);
- ept_set_mmio_spte_mask();
- kvm_enable_tdp();
- } else
- kvm_disable_tdp();
-
- return 0;
-
-out3:
- free_page((unsigned long)vmx_msr_bitmap_longmode);
-out2:
- free_page((unsigned long)vmx_msr_bitmap_legacy);
-out1:
- free_page((unsigned long)vmx_io_bitmap_b);
-out:
- free_page((unsigned long)vmx_io_bitmap_a);
- return r;
-}
-
-static void __exit vmx_exit(void)
-{
- free_page((unsigned long)vmx_msr_bitmap_legacy);
- free_page((unsigned long)vmx_msr_bitmap_longmode);
- free_page((unsigned long)vmx_io_bitmap_b);
- free_page((unsigned long)vmx_io_bitmap_a);
-
- kvm_exit();
-}
-
-module_init(vmx_init)
-module_exit(vmx_exit)
diff --git a/ANDROID_3.4.5/arch/x86/kvm/x86.c b/ANDROID_3.4.5/arch/x86/kvm/x86.c
deleted file mode 100644
index 185a2b82..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/x86.c
+++ /dev/null
@@ -1,6607 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * derived from drivers/kvm/kvm_main.c
- *
- * Copyright (C) 2006 Qumranet, Inc.
- * Copyright (C) 2008 Qumranet, Inc.
- * Copyright IBM Corporation, 2008
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * Authors:
- * Avi Kivity <avi@qumranet.com>
- * Yaniv Kamay <yaniv@qumranet.com>
- * Amit Shah <amit.shah@qumranet.com>
- * Ben-Ami Yassour <benami@il.ibm.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include <linux/kvm_host.h>
-#include "irq.h"
-#include "mmu.h"
-#include "i8254.h"
-#include "tss.h"
-#include "kvm_cache_regs.h"
-#include "x86.h"
-#include "cpuid.h"
-
-#include <linux/clocksource.h>
-#include <linux/interrupt.h>
-#include <linux/kvm.h>
-#include <linux/fs.h>
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/mman.h>
-#include <linux/highmem.h>
-#include <linux/iommu.h>
-#include <linux/intel-iommu.h>
-#include <linux/cpufreq.h>
-#include <linux/user-return-notifier.h>
-#include <linux/srcu.h>
-#include <linux/slab.h>
-#include <linux/perf_event.h>
-#include <linux/uaccess.h>
-#include <linux/hash.h>
-#include <linux/pci.h>
-#include <trace/events/kvm.h>
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
-#include <asm/debugreg.h>
-#include <asm/msr.h>
-#include <asm/desc.h>
-#include <asm/mtrr.h>
-#include <asm/mce.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h> /* Ugh! */
-#include <asm/xcr.h>
-#include <asm/pvclock.h>
-#include <asm/div64.h>
-
-#define MAX_IO_MSRS 256
-#define KVM_MAX_MCE_BANKS 32
-#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
-
-#define emul_to_vcpu(ctxt) \
- container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
-
-/* EFER defaults:
- * - enable syscall per default because its emulated by KVM
- * - enable LME and LMA per default on 64 bit KVM
- */
-#ifdef CONFIG_X86_64
-static
-u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
-#else
-static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
-#endif
-
-#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
-
-static void update_cr8_intercept(struct kvm_vcpu *vcpu);
-static void process_nmi(struct kvm_vcpu *vcpu);
-
-struct kvm_x86_ops *kvm_x86_ops;
-EXPORT_SYMBOL_GPL(kvm_x86_ops);
-
-static bool ignore_msrs = 0;
-module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
-
-bool kvm_has_tsc_control;
-EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
-u32 kvm_max_guest_tsc_khz;
-EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
-
-/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
-static u32 tsc_tolerance_ppm = 250;
-module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
-
-#define KVM_NR_SHARED_MSRS 16
-
-struct kvm_shared_msrs_global {
- int nr;
- u32 msrs[KVM_NR_SHARED_MSRS];
-};
-
-struct kvm_shared_msrs {
- struct user_return_notifier urn;
- bool registered;
- struct kvm_shared_msr_values {
- u64 host;
- u64 curr;
- } values[KVM_NR_SHARED_MSRS];
-};
-
-static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
-static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
-
-struct kvm_stats_debugfs_item debugfs_entries[] = {
- { "pf_fixed", VCPU_STAT(pf_fixed) },
- { "pf_guest", VCPU_STAT(pf_guest) },
- { "tlb_flush", VCPU_STAT(tlb_flush) },
- { "invlpg", VCPU_STAT(invlpg) },
- { "exits", VCPU_STAT(exits) },
- { "io_exits", VCPU_STAT(io_exits) },
- { "mmio_exits", VCPU_STAT(mmio_exits) },
- { "signal_exits", VCPU_STAT(signal_exits) },
- { "irq_window", VCPU_STAT(irq_window_exits) },
- { "nmi_window", VCPU_STAT(nmi_window_exits) },
- { "halt_exits", VCPU_STAT(halt_exits) },
- { "halt_wakeup", VCPU_STAT(halt_wakeup) },
- { "hypercalls", VCPU_STAT(hypercalls) },
- { "request_irq", VCPU_STAT(request_irq_exits) },
- { "irq_exits", VCPU_STAT(irq_exits) },
- { "host_state_reload", VCPU_STAT(host_state_reload) },
- { "efer_reload", VCPU_STAT(efer_reload) },
- { "fpu_reload", VCPU_STAT(fpu_reload) },
- { "insn_emulation", VCPU_STAT(insn_emulation) },
- { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
- { "irq_injections", VCPU_STAT(irq_injections) },
- { "nmi_injections", VCPU_STAT(nmi_injections) },
- { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
- { "mmu_pte_write", VM_STAT(mmu_pte_write) },
- { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
- { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
- { "mmu_flooded", VM_STAT(mmu_flooded) },
- { "mmu_recycled", VM_STAT(mmu_recycled) },
- { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
- { "mmu_unsync", VM_STAT(mmu_unsync) },
- { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
- { "largepages", VM_STAT(lpages) },
- { NULL }
-};
-
-u64 __read_mostly host_xcr0;
-
-int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
-
-static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
-{
- int i;
- for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
- vcpu->arch.apf.gfns[i] = ~0;
-}
-
-static void kvm_on_user_return(struct user_return_notifier *urn)
-{
- unsigned slot;
- struct kvm_shared_msrs *locals
- = container_of(urn, struct kvm_shared_msrs, urn);
- struct kvm_shared_msr_values *values;
-
- for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
- values = &locals->values[slot];
- if (values->host != values->curr) {
- wrmsrl(shared_msrs_global.msrs[slot], values->host);
- values->curr = values->host;
- }
- }
- locals->registered = false;
- user_return_notifier_unregister(urn);
-}
-
-static void shared_msr_update(unsigned slot, u32 msr)
-{
- struct kvm_shared_msrs *smsr;
- u64 value;
-
- smsr = &__get_cpu_var(shared_msrs);
- /* only read, and nobody should modify it at this time,
- * so don't need lock */
- if (slot >= shared_msrs_global.nr) {
- printk(KERN_ERR "kvm: invalid MSR slot!");
- return;
- }
- rdmsrl_safe(msr, &value);
- smsr->values[slot].host = value;
- smsr->values[slot].curr = value;
-}
-
-void kvm_define_shared_msr(unsigned slot, u32 msr)
-{
- if (slot >= shared_msrs_global.nr)
- shared_msrs_global.nr = slot + 1;
- shared_msrs_global.msrs[slot] = msr;
- /* we need ensured the shared_msr_global have been updated */
- smp_wmb();
-}
-EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
-
-static void kvm_shared_msr_cpu_online(void)
-{
- unsigned i;
-
- for (i = 0; i < shared_msrs_global.nr; ++i)
- shared_msr_update(i, shared_msrs_global.msrs[i]);
-}
-
-void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
-{
- struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
-
- if (((value ^ smsr->values[slot].curr) & mask) == 0)
- return;
- smsr->values[slot].curr = value;
- wrmsrl(shared_msrs_global.msrs[slot], value);
- if (!smsr->registered) {
- smsr->urn.on_user_return = kvm_on_user_return;
- user_return_notifier_register(&smsr->urn);
- smsr->registered = true;
- }
-}
-EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
-
-static void drop_user_return_notifiers(void *ignore)
-{
- struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
-
- if (smsr->registered)
- kvm_on_user_return(&smsr->urn);
-}
-
-u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
-{
- if (irqchip_in_kernel(vcpu->kvm))
- return vcpu->arch.apic_base;
- else
- return vcpu->arch.apic_base;
-}
-EXPORT_SYMBOL_GPL(kvm_get_apic_base);
-
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
-{
- /* TODO: reserve bits check */
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_lapic_set_base(vcpu, data);
- else
- vcpu->arch.apic_base = data;
-}
-EXPORT_SYMBOL_GPL(kvm_set_apic_base);
-
-#define EXCPT_BENIGN 0
-#define EXCPT_CONTRIBUTORY 1
-#define EXCPT_PF 2
-
-static int exception_class(int vector)
-{
- switch (vector) {
- case PF_VECTOR:
- return EXCPT_PF;
- case DE_VECTOR:
- case TS_VECTOR:
- case NP_VECTOR:
- case SS_VECTOR:
- case GP_VECTOR:
- return EXCPT_CONTRIBUTORY;
- default:
- break;
- }
- return EXCPT_BENIGN;
-}
-
-static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
- unsigned nr, bool has_error, u32 error_code,
- bool reinject)
-{
- u32 prev_nr;
- int class1, class2;
-
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-
- if (!vcpu->arch.exception.pending) {
- queue:
- vcpu->arch.exception.pending = true;
- vcpu->arch.exception.has_error_code = has_error;
- vcpu->arch.exception.nr = nr;
- vcpu->arch.exception.error_code = error_code;
- vcpu->arch.exception.reinject = reinject;
- return;
- }
-
- /* to check exception */
- prev_nr = vcpu->arch.exception.nr;
- if (prev_nr == DF_VECTOR) {
- /* triple fault -> shutdown */
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
- return;
- }
- class1 = exception_class(prev_nr);
- class2 = exception_class(nr);
- if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
- || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
- /* generate double fault per SDM Table 5-5 */
- vcpu->arch.exception.pending = true;
- vcpu->arch.exception.has_error_code = true;
- vcpu->arch.exception.nr = DF_VECTOR;
- vcpu->arch.exception.error_code = 0;
- } else
- /* replace previous exception with a new one in a hope
- that instruction re-execution will regenerate lost
- exception */
- goto queue;
-}
-
-void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
-{
- kvm_multiple_exception(vcpu, nr, false, 0, false);
-}
-EXPORT_SYMBOL_GPL(kvm_queue_exception);
-
-void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
-{
- kvm_multiple_exception(vcpu, nr, false, 0, true);
-}
-EXPORT_SYMBOL_GPL(kvm_requeue_exception);
-
-void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
-{
- if (err)
- kvm_inject_gp(vcpu, 0);
- else
- kvm_x86_ops->skip_emulated_instruction(vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
-
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
-{
- ++vcpu->stat.pf_guest;
- vcpu->arch.cr2 = fault->address;
- kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
-}
-EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
-
-void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
-{
- if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
- vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
- else
- vcpu->arch.mmu.inject_page_fault(vcpu, fault);
-}
-
-void kvm_inject_nmi(struct kvm_vcpu *vcpu)
-{
- atomic_inc(&vcpu->arch.nmi_queued);
- kvm_make_request(KVM_REQ_NMI, vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_inject_nmi);
-
-void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
-{
- kvm_multiple_exception(vcpu, nr, true, error_code, false);
-}
-EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
-
-void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
-{
- kvm_multiple_exception(vcpu, nr, true, error_code, true);
-}
-EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
-
-/*
- * Checks if cpl <= required_cpl; if true, return true. Otherwise queue
- * a #GP and return false.
- */
-bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
-{
- if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
- return true;
- kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
- return false;
-}
-EXPORT_SYMBOL_GPL(kvm_require_cpl);
-
-/*
- * This function will be used to read from the physical memory of the currently
- * running guest. The difference to kvm_read_guest_page is that this function
- * can read from guest physical or from the guest's guest physical memory.
- */
-int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gfn_t ngfn, void *data, int offset, int len,
- u32 access)
-{
- gfn_t real_gfn;
- gpa_t ngpa;
-
- ngpa = gfn_to_gpa(ngfn);
- real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
- if (real_gfn == UNMAPPED_GVA)
- return -EFAULT;
-
- real_gfn = gpa_to_gfn(real_gfn);
-
- return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
-}
-EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
-
-int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
- void *data, int offset, int len, u32 access)
-{
- return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
- data, offset, len, access);
-}
-
-/*
- * Load the pae pdptrs. Return true is they are all valid.
- */
-int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
-{
- gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
- unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
- int i;
- int ret;
- u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
-
- ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
- offset * sizeof(u64), sizeof(pdpte),
- PFERR_USER_MASK|PFERR_WRITE_MASK);
- if (ret < 0) {
- ret = 0;
- goto out;
- }
- for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
- if (is_present_gpte(pdpte[i]) &&
- (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
- ret = 0;
- goto out;
- }
- }
- ret = 1;
-
- memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_avail);
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_dirty);
-out:
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(load_pdptrs);
-
-static bool pdptrs_changed(struct kvm_vcpu *vcpu)
-{
- u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
- bool changed = true;
- int offset;
- gfn_t gfn;
- int r;
-
- if (is_long_mode(vcpu) || !is_pae(vcpu))
- return false;
-
- if (!test_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_avail))
- return true;
-
- gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
- offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
- r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
- PFERR_USER_MASK | PFERR_WRITE_MASK);
- if (r < 0)
- goto out;
- changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
-out:
-
- return changed;
-}
-
-int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
- unsigned long old_cr0 = kvm_read_cr0(vcpu);
- unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
- X86_CR0_CD | X86_CR0_NW;
-
- cr0 |= X86_CR0_ET;
-
-#ifdef CONFIG_X86_64
- if (cr0 & 0xffffffff00000000UL)
- return 1;
-#endif
-
- cr0 &= ~CR0_RESERVED_BITS;
-
- if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
- return 1;
-
- if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
- return 1;
-
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
-#ifdef CONFIG_X86_64
- if ((vcpu->arch.efer & EFER_LME)) {
- int cs_db, cs_l;
-
- if (!is_pae(vcpu))
- return 1;
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- if (cs_l)
- return 1;
- } else
-#endif
- if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
- kvm_read_cr3(vcpu)))
- return 1;
- }
-
- kvm_x86_ops->set_cr0(vcpu, cr0);
-
- if ((cr0 ^ old_cr0) & X86_CR0_PG) {
- kvm_clear_async_pf_completion_queue(vcpu);
- kvm_async_pf_hash_reset(vcpu);
- }
-
- if ((cr0 ^ old_cr0) & update_bits)
- kvm_mmu_reset_context(vcpu);
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_set_cr0);
-
-void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
-{
- (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
-}
-EXPORT_SYMBOL_GPL(kvm_lmsw);
-
-int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
-{
- u64 xcr0;
-
- /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
- if (index != XCR_XFEATURE_ENABLED_MASK)
- return 1;
- xcr0 = xcr;
- if (kvm_x86_ops->get_cpl(vcpu) != 0)
- return 1;
- if (!(xcr0 & XSTATE_FP))
- return 1;
- if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
- return 1;
- if (xcr0 & ~host_xcr0)
- return 1;
- vcpu->arch.xcr0 = xcr0;
- vcpu->guest_xcr0_loaded = 0;
- return 0;
-}
-
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
-{
- if (__kvm_set_xcr(vcpu, index, xcr)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_set_xcr);
-
-int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
- unsigned long old_cr4 = kvm_read_cr4(vcpu);
- unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
- X86_CR4_PAE | X86_CR4_SMEP;
- if (cr4 & CR4_RESERVED_BITS)
- return 1;
-
- if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
- return 1;
-
- if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
- return 1;
-
- if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS))
- return 1;
-
- if (is_long_mode(vcpu)) {
- if (!(cr4 & X86_CR4_PAE))
- return 1;
- } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
- && ((cr4 ^ old_cr4) & pdptr_bits)
- && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
- kvm_read_cr3(vcpu)))
- return 1;
-
- if (kvm_x86_ops->set_cr4(vcpu, cr4))
- return 1;
-
- if ((cr4 ^ old_cr4) & pdptr_bits)
- kvm_mmu_reset_context(vcpu);
-
- if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
- kvm_update_cpuid(vcpu);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_set_cr4);
-
-int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
-{
- if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
- kvm_mmu_sync_roots(vcpu);
- kvm_mmu_flush_tlb(vcpu);
- return 0;
- }
-
- if (is_long_mode(vcpu)) {
- if (cr3 & CR3_L_MODE_RESERVED_BITS)
- return 1;
- } else {
- if (is_pae(vcpu)) {
- if (cr3 & CR3_PAE_RESERVED_BITS)
- return 1;
- if (is_paging(vcpu) &&
- !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
- return 1;
- }
- /*
- * We don't check reserved bits in nonpae mode, because
- * this isn't enforced, and VMware depends on this.
- */
- }
-
- /*
- * Does the new cr3 value map to physical memory? (Note, we
- * catch an invalid cr3 even in real-mode, because it would
- * cause trouble later on when we turn on paging anyway.)
- *
- * A real CPU would silently accept an invalid cr3 and would
- * attempt to use it - with largely undefined (and often hard
- * to debug) behavior on the guest side.
- */
- if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
- return 1;
- vcpu->arch.cr3 = cr3;
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
- vcpu->arch.mmu.new_cr3(vcpu);
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_set_cr3);
-
-int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
-{
- if (cr8 & CR8_RESERVED_BITS)
- return 1;
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_lapic_set_tpr(vcpu, cr8);
- else
- vcpu->arch.cr8 = cr8;
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_set_cr8);
-
-unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
-{
- if (irqchip_in_kernel(vcpu->kvm))
- return kvm_lapic_get_cr8(vcpu);
- else
- return vcpu->arch.cr8;
-}
-EXPORT_SYMBOL_GPL(kvm_get_cr8);
-
-static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
-{
- switch (dr) {
- case 0 ... 3:
- vcpu->arch.db[dr] = val;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
- vcpu->arch.eff_db[dr] = val;
- break;
- case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1; /* #UD */
- /* fall through */
- case 6:
- if (val & 0xffffffff00000000ULL)
- return -1; /* #GP */
- vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
- break;
- case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1; /* #UD */
- /* fall through */
- default: /* 7 */
- if (val & 0xffffffff00000000ULL)
- return -1; /* #GP */
- vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
- kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
- vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
- }
- break;
- }
-
- return 0;
-}
-
-int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
-{
- int res;
-
- res = __kvm_set_dr(vcpu, dr, val);
- if (res > 0)
- kvm_queue_exception(vcpu, UD_VECTOR);
- else if (res < 0)
- kvm_inject_gp(vcpu, 0);
-
- return res;
-}
-EXPORT_SYMBOL_GPL(kvm_set_dr);
-
-static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
-{
- switch (dr) {
- case 0 ... 3:
- *val = vcpu->arch.db[dr];
- break;
- case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1;
- /* fall through */
- case 6:
- *val = vcpu->arch.dr6;
- break;
- case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1;
- /* fall through */
- default: /* 7 */
- *val = vcpu->arch.dr7;
- break;
- }
-
- return 0;
-}
-
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
-{
- if (_kvm_get_dr(vcpu, dr, val)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_get_dr);
-
-bool kvm_rdpmc(struct kvm_vcpu *vcpu)
-{
- u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
- u64 data;
- int err;
-
- err = kvm_pmu_read_pmc(vcpu, ecx, &data);
- if (err)
- return err;
- kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
- kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
- return err;
-}
-EXPORT_SYMBOL_GPL(kvm_rdpmc);
-
-/*
- * List of msr numbers which we expose to userspace through KVM_GET_MSRS
- * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
- *
- * This list is modified at module load time to reflect the
- * capabilities of the host cpu. This capabilities test skips MSRs that are
- * kvm-specific. Those are put in the beginning of the list.
- */
-
-#define KVM_SAVE_MSRS_BEGIN 9
-static u32 msrs_to_save[] = {
- MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
- MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
- HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
- HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
- MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
- MSR_STAR,
-#ifdef CONFIG_X86_64
- MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
-#endif
- MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
-};
-
-static unsigned num_msrs_to_save;
-
-static u32 emulated_msrs[] = {
- MSR_IA32_TSCDEADLINE,
- MSR_IA32_MISC_ENABLE,
- MSR_IA32_MCG_STATUS,
- MSR_IA32_MCG_CTL,
-};
-
-static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
- u64 old_efer = vcpu->arch.efer;
-
- if (efer & efer_reserved_bits)
- return 1;
-
- if (is_paging(vcpu)
- && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
- return 1;
-
- if (efer & EFER_FFXSR) {
- struct kvm_cpuid_entry2 *feat;
-
- feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
- return 1;
- }
-
- if (efer & EFER_SVME) {
- struct kvm_cpuid_entry2 *feat;
-
- feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
- return 1;
- }
-
- efer &= ~EFER_LMA;
- efer |= vcpu->arch.efer & EFER_LMA;
-
- kvm_x86_ops->set_efer(vcpu, efer);
-
- vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
-
- /* Update reserved bits */
- if ((efer ^ old_efer) & EFER_NX)
- kvm_mmu_reset_context(vcpu);
-
- return 0;
-}
-
-void kvm_enable_efer_bits(u64 mask)
-{
- efer_reserved_bits &= ~mask;
-}
-EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
-
-
-/*
- * Writes msr value into into the appropriate "register".
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
-{
- return kvm_x86_ops->set_msr(vcpu, msr_index, data);
-}
-
-/*
- * Adapt set_msr() to msr_io()'s calling convention
- */
-static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
-{
- return kvm_set_msr(vcpu, index, *data);
-}
-
-static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
-{
- int version;
- int r;
- struct pvclock_wall_clock wc;
- struct timespec boot;
-
- if (!wall_clock)
- return;
-
- r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
- if (r)
- return;
-
- if (version & 1)
- ++version; /* first time write, random junk */
-
- ++version;
-
- kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
-
- /*
- * The guest calculates current wall clock time by adding
- * system time (updated by kvm_guest_time_update below) to the
- * wall clock specified here. guest system time equals host
- * system time for us, thus we must fill in host boot time here.
- */
- getboottime(&boot);
-
- wc.sec = boot.tv_sec;
- wc.nsec = boot.tv_nsec;
- wc.version = version;
-
- kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
-
- version++;
- kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
-}
-
-static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
-{
- uint32_t quotient, remainder;
-
- /* Don't try to replace with do_div(), this one calculates
- * "(dividend << 32) / divisor" */
- __asm__ ( "divl %4"
- : "=a" (quotient), "=d" (remainder)
- : "0" (0), "1" (dividend), "r" (divisor) );
- return quotient;
-}
-
-static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
- s8 *pshift, u32 *pmultiplier)
-{
- uint64_t scaled64;
- int32_t shift = 0;
- uint64_t tps64;
- uint32_t tps32;
-
- tps64 = base_khz * 1000LL;
- scaled64 = scaled_khz * 1000LL;
- while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
- tps64 >>= 1;
- shift--;
- }
-
- tps32 = (uint32_t)tps64;
- while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
- if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
- scaled64 >>= 1;
- else
- tps32 <<= 1;
- shift++;
- }
-
- *pshift = shift;
- *pmultiplier = div_frac(scaled64, tps32);
-
- pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
- __func__, base_khz, scaled_khz, shift, *pmultiplier);
-}
-
-static inline u64 get_kernel_ns(void)
-{
- struct timespec ts;
-
- WARN_ON(preemptible());
- ktime_get_ts(&ts);
- monotonic_to_bootbased(&ts);
- return timespec_to_ns(&ts);
-}
-
-static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
-unsigned long max_tsc_khz;
-
-static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
-{
- return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
- vcpu->arch.virtual_tsc_shift);
-}
-
-static u32 adjust_tsc_khz(u32 khz, s32 ppm)
-{
- u64 v = (u64)khz * (1000000 + ppm);
- do_div(v, 1000000);
- return v;
-}
-
-static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
-{
- u32 thresh_lo, thresh_hi;
- int use_scaling = 0;
-
- /* Compute a scale to convert nanoseconds in TSC cycles */
- kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
- &vcpu->arch.virtual_tsc_shift,
- &vcpu->arch.virtual_tsc_mult);
- vcpu->arch.virtual_tsc_khz = this_tsc_khz;
-
- /*
- * Compute the variation in TSC rate which is acceptable
- * within the range of tolerance and decide if the
- * rate being applied is within that bounds of the hardware
- * rate. If so, no scaling or compensation need be done.
- */
- thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
- thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
- if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
- pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
- use_scaling = 1;
- }
- kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
-}
-
-static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
-{
- u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
- vcpu->arch.virtual_tsc_mult,
- vcpu->arch.virtual_tsc_shift);
- tsc += vcpu->arch.this_tsc_write;
- return tsc;
-}
-
-void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
-{
- struct kvm *kvm = vcpu->kvm;
- u64 offset, ns, elapsed;
- unsigned long flags;
- s64 usdiff;
-
- raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
- offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
- ns = get_kernel_ns();
- elapsed = ns - kvm->arch.last_tsc_nsec;
-
- /* n.b - signed multiplication and division required */
- usdiff = data - kvm->arch.last_tsc_write;
-#ifdef CONFIG_X86_64
- usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
-#else
- /* do_div() only does unsigned */
- asm("idivl %2; xor %%edx, %%edx"
- : "=A"(usdiff)
- : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
-#endif
- do_div(elapsed, 1000);
- usdiff -= elapsed;
- if (usdiff < 0)
- usdiff = -usdiff;
-
- /*
- * Special case: TSC write with a small delta (1 second) of virtual
- * cycle time against real time is interpreted as an attempt to
- * synchronize the CPU.
- *
- * For a reliable TSC, we can match TSC offsets, and for an unstable
- * TSC, we add elapsed time in this computation. We could let the
- * compensation code attempt to catch up if we fall behind, but
- * it's better to try to match offsets from the beginning.
- */
- if (usdiff < USEC_PER_SEC &&
- vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
- if (!check_tsc_unstable()) {
- offset = kvm->arch.cur_tsc_offset;
- pr_debug("kvm: matched tsc offset for %llu\n", data);
- } else {
- u64 delta = nsec_to_cycles(vcpu, elapsed);
- data += delta;
- offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
- pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
- }
- } else {
- /*
- * We split periods of matched TSC writes into generations.
- * For each generation, we track the original measured
- * nanosecond time, offset, and write, so if TSCs are in
- * sync, we can match exact offset, and if not, we can match
- * exact software computaion in compute_guest_tsc()
- *
- * These values are tracked in kvm->arch.cur_xxx variables.
- */
- kvm->arch.cur_tsc_generation++;
- kvm->arch.cur_tsc_nsec = ns;
- kvm->arch.cur_tsc_write = data;
- kvm->arch.cur_tsc_offset = offset;
- pr_debug("kvm: new tsc generation %u, clock %llu\n",
- kvm->arch.cur_tsc_generation, data);
- }
-
- /*
- * We also track th most recent recorded KHZ, write and time to
- * allow the matching interval to be extended at each write.
- */
- kvm->arch.last_tsc_nsec = ns;
- kvm->arch.last_tsc_write = data;
- kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
-
- /* Reset of TSC must disable overshoot protection below */
- vcpu->arch.hv_clock.tsc_timestamp = 0;
- vcpu->arch.last_guest_tsc = data;
-
- /* Keep track of which generation this VCPU has synchronized to */
- vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
- vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
- vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
-
- kvm_x86_ops->write_tsc_offset(vcpu, offset);
- raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
-}
-
-EXPORT_SYMBOL_GPL(kvm_write_tsc);
-
-static int kvm_guest_time_update(struct kvm_vcpu *v)
-{
- unsigned long flags;
- struct kvm_vcpu_arch *vcpu = &v->arch;
- void *shared_kaddr;
- unsigned long this_tsc_khz;
- s64 kernel_ns, max_kernel_ns;
- u64 tsc_timestamp;
-
- /* Keep irq disabled to prevent changes to the clock */
- local_irq_save(flags);
- tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
- kernel_ns = get_kernel_ns();
- this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
- if (unlikely(this_tsc_khz == 0)) {
- local_irq_restore(flags);
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
- return 1;
- }
-
- /*
- * We may have to catch up the TSC to match elapsed wall clock
- * time for two reasons, even if kvmclock is used.
- * 1) CPU could have been running below the maximum TSC rate
- * 2) Broken TSC compensation resets the base at each VCPU
- * entry to avoid unknown leaps of TSC even when running
- * again on the same CPU. This may cause apparent elapsed
- * time to disappear, and the guest to stand still or run
- * very slowly.
- */
- if (vcpu->tsc_catchup) {
- u64 tsc = compute_guest_tsc(v, kernel_ns);
- if (tsc > tsc_timestamp) {
- adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
- tsc_timestamp = tsc;
- }
- }
-
- local_irq_restore(flags);
-
- if (!vcpu->time_page)
- return 0;
-
- /*
- * Time as measured by the TSC may go backwards when resetting the base
- * tsc_timestamp. The reason for this is that the TSC resolution is
- * higher than the resolution of the other clock scales. Thus, many
- * possible measurments of the TSC correspond to one measurement of any
- * other clock, and so a spread of values is possible. This is not a
- * problem for the computation of the nanosecond clock; with TSC rates
- * around 1GHZ, there can only be a few cycles which correspond to one
- * nanosecond value, and any path through this code will inevitably
- * take longer than that. However, with the kernel_ns value itself,
- * the precision may be much lower, down to HZ granularity. If the
- * first sampling of TSC against kernel_ns ends in the low part of the
- * range, and the second in the high end of the range, we can get:
- *
- * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
- *
- * As the sampling errors potentially range in the thousands of cycles,
- * it is possible such a time value has already been observed by the
- * guest. To protect against this, we must compute the system time as
- * observed by the guest and ensure the new system time is greater.
- */
- max_kernel_ns = 0;
- if (vcpu->hv_clock.tsc_timestamp) {
- max_kernel_ns = vcpu->last_guest_tsc -
- vcpu->hv_clock.tsc_timestamp;
- max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
- vcpu->hv_clock.tsc_to_system_mul,
- vcpu->hv_clock.tsc_shift);
- max_kernel_ns += vcpu->last_kernel_ns;
- }
-
- if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
- kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
- &vcpu->hv_clock.tsc_shift,
- &vcpu->hv_clock.tsc_to_system_mul);
- vcpu->hw_tsc_khz = this_tsc_khz;
- }
-
- if (max_kernel_ns > kernel_ns)
- kernel_ns = max_kernel_ns;
-
- /* With all the info we got, fill in the values */
- vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
- vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
- vcpu->last_kernel_ns = kernel_ns;
- vcpu->last_guest_tsc = tsc_timestamp;
- vcpu->hv_clock.flags = 0;
-
- /*
- * The interface expects us to write an even number signaling that the
- * update is finished. Since the guest won't see the intermediate
- * state, we just increase by 2 at the end.
- */
- vcpu->hv_clock.version += 2;
-
- shared_kaddr = kmap_atomic(vcpu->time_page);
-
- memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
- sizeof(vcpu->hv_clock));
-
- kunmap_atomic(shared_kaddr);
-
- mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
- return 0;
-}
-
-static bool msr_mtrr_valid(unsigned msr)
-{
- switch (msr) {
- case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
- case MSR_MTRRfix64K_00000:
- case MSR_MTRRfix16K_80000:
- case MSR_MTRRfix16K_A0000:
- case MSR_MTRRfix4K_C0000:
- case MSR_MTRRfix4K_C8000:
- case MSR_MTRRfix4K_D0000:
- case MSR_MTRRfix4K_D8000:
- case MSR_MTRRfix4K_E0000:
- case MSR_MTRRfix4K_E8000:
- case MSR_MTRRfix4K_F0000:
- case MSR_MTRRfix4K_F8000:
- case MSR_MTRRdefType:
- case MSR_IA32_CR_PAT:
- return true;
- case 0x2f8:
- return true;
- }
- return false;
-}
-
-static bool valid_pat_type(unsigned t)
-{
- return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
-}
-
-static bool valid_mtrr_type(unsigned t)
-{
- return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
-}
-
-static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- int i;
-
- if (!msr_mtrr_valid(msr))
- return false;
-
- if (msr == MSR_IA32_CR_PAT) {
- for (i = 0; i < 8; i++)
- if (!valid_pat_type((data >> (i * 8)) & 0xff))
- return false;
- return true;
- } else if (msr == MSR_MTRRdefType) {
- if (data & ~0xcff)
- return false;
- return valid_mtrr_type(data & 0xff);
- } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
- for (i = 0; i < 8 ; i++)
- if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
- return false;
- return true;
- }
-
- /* variable MTRRs */
- return valid_mtrr_type(data & 0xff);
-}
-
-static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
-
- if (!mtrr_valid(vcpu, msr, data))
- return 1;
-
- if (msr == MSR_MTRRdefType) {
- vcpu->arch.mtrr_state.def_type = data;
- vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
- } else if (msr == MSR_MTRRfix64K_00000)
- p[0] = data;
- else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
- p[1 + msr - MSR_MTRRfix16K_80000] = data;
- else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
- p[3 + msr - MSR_MTRRfix4K_C0000] = data;
- else if (msr == MSR_IA32_CR_PAT)
- vcpu->arch.pat = data;
- else { /* Variable MTRRs */
- int idx, is_mtrr_mask;
- u64 *pt;
-
- idx = (msr - 0x200) / 2;
- is_mtrr_mask = msr - 0x200 - 2 * idx;
- if (!is_mtrr_mask)
- pt =
- (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
- else
- pt =
- (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
- *pt = data;
- }
-
- kvm_mmu_reset_context(vcpu);
- return 0;
-}
-
-static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- u64 mcg_cap = vcpu->arch.mcg_cap;
- unsigned bank_num = mcg_cap & 0xff;
-
- switch (msr) {
- case MSR_IA32_MCG_STATUS:
- vcpu->arch.mcg_status = data;
- break;
- case MSR_IA32_MCG_CTL:
- if (!(mcg_cap & MCG_CTL_P))
- return 1;
- if (data != 0 && data != ~(u64)0)
- return -1;
- vcpu->arch.mcg_ctl = data;
- break;
- default:
- if (msr >= MSR_IA32_MC0_CTL &&
- msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
- u32 offset = msr - MSR_IA32_MC0_CTL;
- /* only 0 or all 1s can be written to IA32_MCi_CTL
- * some Linux kernels though clear bit 10 in bank 4 to
- * workaround a BIOS/GART TBL issue on AMD K8s, ignore
- * this to avoid an uncatched #GP in the guest
- */
- if ((offset & 0x3) == 0 &&
- data != 0 && (data | (1 << 10)) != ~(u64)0)
- return -1;
- vcpu->arch.mce_banks[offset] = data;
- break;
- }
- return 1;
- }
- return 0;
-}
-
-static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
-{
- struct kvm *kvm = vcpu->kvm;
- int lm = is_long_mode(vcpu);
- u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
- : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
- u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
- : kvm->arch.xen_hvm_config.blob_size_32;
- u32 page_num = data & ~PAGE_MASK;
- u64 page_addr = data & PAGE_MASK;
- u8 *page;
- int r;
-
- r = -E2BIG;
- if (page_num >= blob_size)
- goto out;
- r = -ENOMEM;
- page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
- if (IS_ERR(page)) {
- r = PTR_ERR(page);
- goto out;
- }
- if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
- goto out_free;
- r = 0;
-out_free:
- kfree(page);
-out:
- return r;
-}
-
-static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
-{
- return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
-}
-
-static bool kvm_hv_msr_partition_wide(u32 msr)
-{
- bool r = false;
- switch (msr) {
- case HV_X64_MSR_GUEST_OS_ID:
- case HV_X64_MSR_HYPERCALL:
- r = true;
- break;
- }
-
- return r;
-}
-
-static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- struct kvm *kvm = vcpu->kvm;
-
- switch (msr) {
- case HV_X64_MSR_GUEST_OS_ID:
- kvm->arch.hv_guest_os_id = data;
- /* setting guest os id to zero disables hypercall page */
- if (!kvm->arch.hv_guest_os_id)
- kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
- break;
- case HV_X64_MSR_HYPERCALL: {
- u64 gfn;
- unsigned long addr;
- u8 instructions[4];
-
- /* if guest os id is not set hypercall should remain disabled */
- if (!kvm->arch.hv_guest_os_id)
- break;
- if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
- kvm->arch.hv_hypercall = data;
- break;
- }
- gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
- addr = gfn_to_hva(kvm, gfn);
- if (kvm_is_error_hva(addr))
- return 1;
- kvm_x86_ops->patch_hypercall(vcpu, instructions);
- ((unsigned char *)instructions)[3] = 0xc3; /* ret */
- if (__copy_to_user((void __user *)addr, instructions, 4))
- return 1;
- kvm->arch.hv_hypercall = data;
- break;
- }
- default:
- pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
- "data 0x%llx\n", msr, data);
- return 1;
- }
- return 0;
-}
-
-static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- switch (msr) {
- case HV_X64_MSR_APIC_ASSIST_PAGE: {
- unsigned long addr;
-
- if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
- vcpu->arch.hv_vapic = data;
- break;
- }
- addr = gfn_to_hva(vcpu->kvm, data >>
- HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
- if (kvm_is_error_hva(addr))
- return 1;
- if (__clear_user((void __user *)addr, PAGE_SIZE))
- return 1;
- vcpu->arch.hv_vapic = data;
- break;
- }
- case HV_X64_MSR_EOI:
- return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
- case HV_X64_MSR_ICR:
- return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
- case HV_X64_MSR_TPR:
- return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
- default:
- pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
- "data 0x%llx\n", msr, data);
- return 1;
- }
-
- return 0;
-}
-
-static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
-{
- gpa_t gpa = data & ~0x3f;
-
- /* Bits 2:5 are resrved, Should be zero */
- if (data & 0x3c)
- return 1;
-
- vcpu->arch.apf.msr_val = data;
-
- if (!(data & KVM_ASYNC_PF_ENABLED)) {
- kvm_clear_async_pf_completion_queue(vcpu);
- kvm_async_pf_hash_reset(vcpu);
- return 0;
- }
-
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
- return 1;
-
- vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
- kvm_async_pf_wakeup_all(vcpu);
- return 0;
-}
-
-static void kvmclock_reset(struct kvm_vcpu *vcpu)
-{
- if (vcpu->arch.time_page) {
- kvm_release_page_dirty(vcpu->arch.time_page);
- vcpu->arch.time_page = NULL;
- }
-}
-
-static void accumulate_steal_time(struct kvm_vcpu *vcpu)
-{
- u64 delta;
-
- if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
- return;
-
- delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
- vcpu->arch.st.last_steal = current->sched_info.run_delay;
- vcpu->arch.st.accum_steal = delta;
-}
-
-static void record_steal_time(struct kvm_vcpu *vcpu)
-{
- if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
- return;
-
- if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
- return;
-
- vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
- vcpu->arch.st.steal.version += 2;
- vcpu->arch.st.accum_steal = 0;
-
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
-}
-
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- bool pr = false;
-
- switch (msr) {
- case MSR_EFER:
- return set_efer(vcpu, data);
- case MSR_K7_HWCR:
- data &= ~(u64)0x40; /* ignore flush filter disable */
- data &= ~(u64)0x100; /* ignore ignne emulation enable */
- data &= ~(u64)0x8; /* ignore TLB cache disable */
- if (data != 0) {
- pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
- data);
- return 1;
- }
- break;
- case MSR_FAM10H_MMIO_CONF_BASE:
- if (data != 0) {
- pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
- "0x%llx\n", data);
- return 1;
- }
- break;
- case MSR_AMD64_NB_CFG:
- break;
- case MSR_IA32_DEBUGCTLMSR:
- if (!data) {
- /* We support the non-activated case already */
- break;
- } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
- /* Values other than LBR and BTF are vendor-specific,
- thus reserved and should throw a #GP */
- return 1;
- }
- pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
- __func__, data);
- break;
- case MSR_IA32_UCODE_REV:
- case MSR_IA32_UCODE_WRITE:
- case MSR_VM_HSAVE_PA:
- case MSR_AMD64_PATCH_LOADER:
- break;
- case 0x200 ... 0x2ff:
- return set_msr_mtrr(vcpu, msr, data);
- case MSR_IA32_APICBASE:
- kvm_set_apic_base(vcpu, data);
- break;
- case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
- return kvm_x2apic_msr_write(vcpu, msr, data);
- case MSR_IA32_TSCDEADLINE:
- kvm_set_lapic_tscdeadline_msr(vcpu, data);
- break;
- case MSR_IA32_MISC_ENABLE:
- vcpu->arch.ia32_misc_enable_msr = data;
- break;
- case MSR_KVM_WALL_CLOCK_NEW:
- case MSR_KVM_WALL_CLOCK:
- vcpu->kvm->arch.wall_clock = data;
- kvm_write_wall_clock(vcpu->kvm, data);
- break;
- case MSR_KVM_SYSTEM_TIME_NEW:
- case MSR_KVM_SYSTEM_TIME: {
- kvmclock_reset(vcpu);
-
- vcpu->arch.time = data;
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-
- /* we verify if the enable bit is set... */
- if (!(data & 1))
- break;
-
- /* ...but clean it before doing the actual write */
- vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
-
- vcpu->arch.time_page =
- gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
-
- if (is_error_page(vcpu->arch.time_page)) {
- kvm_release_page_clean(vcpu->arch.time_page);
- vcpu->arch.time_page = NULL;
- }
- break;
- }
- case MSR_KVM_ASYNC_PF_EN:
- if (kvm_pv_enable_async_pf(vcpu, data))
- return 1;
- break;
- case MSR_KVM_STEAL_TIME:
-
- if (unlikely(!sched_info_on()))
- return 1;
-
- if (data & KVM_STEAL_RESERVED_MASK)
- return 1;
-
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
- data & KVM_STEAL_VALID_BITS))
- return 1;
-
- vcpu->arch.st.msr_val = data;
-
- if (!(data & KVM_MSR_ENABLED))
- break;
-
- vcpu->arch.st.last_steal = current->sched_info.run_delay;
-
- preempt_disable();
- accumulate_steal_time(vcpu);
- preempt_enable();
-
- kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
-
- break;
-
- case MSR_IA32_MCG_CTL:
- case MSR_IA32_MCG_STATUS:
- case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
- return set_msr_mce(vcpu, msr, data);
-
- /* Performance counters are not protected by a CPUID bit,
- * so we should check all of them in the generic path for the sake of
- * cross vendor migration.
- * Writing a zero into the event select MSRs disables them,
- * which we perfectly emulate ;-). Any other value should be at least
- * reported, some guests depend on them.
- */
- case MSR_K7_EVNTSEL0:
- case MSR_K7_EVNTSEL1:
- case MSR_K7_EVNTSEL2:
- case MSR_K7_EVNTSEL3:
- if (data != 0)
- pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
- "0x%x data 0x%llx\n", msr, data);
- break;
- /* at least RHEL 4 unconditionally writes to the perfctr registers,
- * so we ignore writes to make it happy.
- */
- case MSR_K7_PERFCTR0:
- case MSR_K7_PERFCTR1:
- case MSR_K7_PERFCTR2:
- case MSR_K7_PERFCTR3:
- pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
- "0x%x data 0x%llx\n", msr, data);
- break;
- case MSR_P6_PERFCTR0:
- case MSR_P6_PERFCTR1:
- pr = true;
- case MSR_P6_EVNTSEL0:
- case MSR_P6_EVNTSEL1:
- if (kvm_pmu_msr(vcpu, msr))
- return kvm_pmu_set_msr(vcpu, msr, data);
-
- if (pr || data != 0)
- pr_unimpl(vcpu, "disabled perfctr wrmsr: "
- "0x%x data 0x%llx\n", msr, data);
- break;
- case MSR_K7_CLK_CTL:
- /*
- * Ignore all writes to this no longer documented MSR.
- * Writes are only relevant for old K7 processors,
- * all pre-dating SVM, but a recommended workaround from
- * AMD for these chips. It is possible to speicify the
- * affected processor models on the command line, hence
- * the need to ignore the workaround.
- */
- break;
- case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
- if (kvm_hv_msr_partition_wide(msr)) {
- int r;
- mutex_lock(&vcpu->kvm->lock);
- r = set_msr_hyperv_pw(vcpu, msr, data);
- mutex_unlock(&vcpu->kvm->lock);
- return r;
- } else
- return set_msr_hyperv(vcpu, msr, data);
- break;
- case MSR_IA32_BBL_CR_CTL3:
- /* Drop writes to this legacy MSR -- see rdmsr
- * counterpart for further detail.
- */
- pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
- break;
- case MSR_AMD64_OSVW_ID_LENGTH:
- if (!guest_cpuid_has_osvw(vcpu))
- return 1;
- vcpu->arch.osvw.length = data;
- break;
- case MSR_AMD64_OSVW_STATUS:
- if (!guest_cpuid_has_osvw(vcpu))
- return 1;
- vcpu->arch.osvw.status = data;
- break;
- default:
- if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
- return xen_hvm_config(vcpu, data);
- if (kvm_pmu_msr(vcpu, msr))
- return kvm_pmu_set_msr(vcpu, msr, data);
- if (!ignore_msrs) {
- pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
- msr, data);
- return 1;
- } else {
- pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
- msr, data);
- break;
- }
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_set_msr_common);
-
-
-/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
-{
- return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
-}
-
-static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
-{
- u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
-
- if (!msr_mtrr_valid(msr))
- return 1;
-
- if (msr == MSR_MTRRdefType)
- *pdata = vcpu->arch.mtrr_state.def_type +
- (vcpu->arch.mtrr_state.enabled << 10);
- else if (msr == MSR_MTRRfix64K_00000)
- *pdata = p[0];
- else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
- *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
- else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
- *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
- else if (msr == MSR_IA32_CR_PAT)
- *pdata = vcpu->arch.pat;
- else { /* Variable MTRRs */
- int idx, is_mtrr_mask;
- u64 *pt;
-
- idx = (msr - 0x200) / 2;
- is_mtrr_mask = msr - 0x200 - 2 * idx;
- if (!is_mtrr_mask)
- pt =
- (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
- else
- pt =
- (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
- *pdata = *pt;
- }
-
- return 0;
-}
-
-static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
-{
- u64 data;
- u64 mcg_cap = vcpu->arch.mcg_cap;
- unsigned bank_num = mcg_cap & 0xff;
-
- switch (msr) {
- case MSR_IA32_P5_MC_ADDR:
- case MSR_IA32_P5_MC_TYPE:
- data = 0;
- break;
- case MSR_IA32_MCG_CAP:
- data = vcpu->arch.mcg_cap;
- break;
- case MSR_IA32_MCG_CTL:
- if (!(mcg_cap & MCG_CTL_P))
- return 1;
- data = vcpu->arch.mcg_ctl;
- break;
- case MSR_IA32_MCG_STATUS:
- data = vcpu->arch.mcg_status;
- break;
- default:
- if (msr >= MSR_IA32_MC0_CTL &&
- msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
- u32 offset = msr - MSR_IA32_MC0_CTL;
- data = vcpu->arch.mce_banks[offset];
- break;
- }
- return 1;
- }
- *pdata = data;
- return 0;
-}
-
-static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
-{
- u64 data = 0;
- struct kvm *kvm = vcpu->kvm;
-
- switch (msr) {
- case HV_X64_MSR_GUEST_OS_ID:
- data = kvm->arch.hv_guest_os_id;
- break;
- case HV_X64_MSR_HYPERCALL:
- data = kvm->arch.hv_hypercall;
- break;
- default:
- pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
- return 1;
- }
-
- *pdata = data;
- return 0;
-}
-
-static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
-{
- u64 data = 0;
-
- switch (msr) {
- case HV_X64_MSR_VP_INDEX: {
- int r;
- struct kvm_vcpu *v;
- kvm_for_each_vcpu(r, v, vcpu->kvm)
- if (v == vcpu)
- data = r;
- break;
- }
- case HV_X64_MSR_EOI:
- return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
- case HV_X64_MSR_ICR:
- return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
- case HV_X64_MSR_TPR:
- return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
- case HV_X64_MSR_APIC_ASSIST_PAGE:
- data = vcpu->arch.hv_vapic;
- break;
- default:
- pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
- return 1;
- }
- *pdata = data;
- return 0;
-}
-
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
-{
- u64 data;
-
- switch (msr) {
- case MSR_IA32_PLATFORM_ID:
- case MSR_IA32_EBL_CR_POWERON:
- case MSR_IA32_DEBUGCTLMSR:
- case MSR_IA32_LASTBRANCHFROMIP:
- case MSR_IA32_LASTBRANCHTOIP:
- case MSR_IA32_LASTINTFROMIP:
- case MSR_IA32_LASTINTTOIP:
- case MSR_K8_SYSCFG:
- case MSR_K7_HWCR:
- case MSR_VM_HSAVE_PA:
- case MSR_K7_EVNTSEL0:
- case MSR_K7_PERFCTR0:
- case MSR_K8_INT_PENDING_MSG:
- case MSR_AMD64_NB_CFG:
- case MSR_FAM10H_MMIO_CONF_BASE:
- data = 0;
- break;
- case MSR_P6_PERFCTR0:
- case MSR_P6_PERFCTR1:
- case MSR_P6_EVNTSEL0:
- case MSR_P6_EVNTSEL1:
- if (kvm_pmu_msr(vcpu, msr))
- return kvm_pmu_get_msr(vcpu, msr, pdata);
- data = 0;
- break;
- case MSR_IA32_UCODE_REV:
- data = 0x100000000ULL;
- break;
- case MSR_MTRRcap:
- data = 0x500 | KVM_NR_VAR_MTRR;
- break;
- case 0x200 ... 0x2ff:
- return get_msr_mtrr(vcpu, msr, pdata);
- case 0xcd: /* fsb frequency */
- data = 3;
- break;
- /*
- * MSR_EBC_FREQUENCY_ID
- * Conservative value valid for even the basic CPU models.
- * Models 0,1: 000 in bits 23:21 indicating a bus speed of
- * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
- * and 266MHz for model 3, or 4. Set Core Clock
- * Frequency to System Bus Frequency Ratio to 1 (bits
- * 31:24) even though these are only valid for CPU
- * models > 2, however guests may end up dividing or
- * multiplying by zero otherwise.
- */
- case MSR_EBC_FREQUENCY_ID:
- data = 1 << 24;
- break;
- case MSR_IA32_APICBASE:
- data = kvm_get_apic_base(vcpu);
- break;
- case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
- return kvm_x2apic_msr_read(vcpu, msr, pdata);
- break;
- case MSR_IA32_TSCDEADLINE:
- data = kvm_get_lapic_tscdeadline_msr(vcpu);
- break;
- case MSR_IA32_MISC_ENABLE:
- data = vcpu->arch.ia32_misc_enable_msr;
- break;
- case MSR_IA32_PERF_STATUS:
- /* TSC increment by tick */
- data = 1000ULL;
- /* CPU multiplier */
- data |= (((uint64_t)4ULL) << 40);
- break;
- case MSR_EFER:
- data = vcpu->arch.efer;
- break;
- case MSR_KVM_WALL_CLOCK:
- case MSR_KVM_WALL_CLOCK_NEW:
- data = vcpu->kvm->arch.wall_clock;
- break;
- case MSR_KVM_SYSTEM_TIME:
- case MSR_KVM_SYSTEM_TIME_NEW:
- data = vcpu->arch.time;
- break;
- case MSR_KVM_ASYNC_PF_EN:
- data = vcpu->arch.apf.msr_val;
- break;
- case MSR_KVM_STEAL_TIME:
- data = vcpu->arch.st.msr_val;
- break;
- case MSR_IA32_P5_MC_ADDR:
- case MSR_IA32_P5_MC_TYPE:
- case MSR_IA32_MCG_CAP:
- case MSR_IA32_MCG_CTL:
- case MSR_IA32_MCG_STATUS:
- case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
- return get_msr_mce(vcpu, msr, pdata);
- case MSR_K7_CLK_CTL:
- /*
- * Provide expected ramp-up count for K7. All other
- * are set to zero, indicating minimum divisors for
- * every field.
- *
- * This prevents guest kernels on AMD host with CPU
- * type 6, model 8 and higher from exploding due to
- * the rdmsr failing.
- */
- data = 0x20000000;
- break;
- case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
- if (kvm_hv_msr_partition_wide(msr)) {
- int r;
- mutex_lock(&vcpu->kvm->lock);
- r = get_msr_hyperv_pw(vcpu, msr, pdata);
- mutex_unlock(&vcpu->kvm->lock);
- return r;
- } else
- return get_msr_hyperv(vcpu, msr, pdata);
- break;
- case MSR_IA32_BBL_CR_CTL3:
- /* This legacy MSR exists but isn't fully documented in current
- * silicon. It is however accessed by winxp in very narrow
- * scenarios where it sets bit #19, itself documented as
- * a "reserved" bit. Best effort attempt to source coherent
- * read data here should the balance of the register be
- * interpreted by the guest:
- *
- * L2 cache control register 3: 64GB range, 256KB size,
- * enabled, latency 0x1, configured
- */
- data = 0xbe702111;
- break;
- case MSR_AMD64_OSVW_ID_LENGTH:
- if (!guest_cpuid_has_osvw(vcpu))
- return 1;
- data = vcpu->arch.osvw.length;
- break;
- case MSR_AMD64_OSVW_STATUS:
- if (!guest_cpuid_has_osvw(vcpu))
- return 1;
- data = vcpu->arch.osvw.status;
- break;
- default:
- if (kvm_pmu_msr(vcpu, msr))
- return kvm_pmu_get_msr(vcpu, msr, pdata);
- if (!ignore_msrs) {
- pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
- return 1;
- } else {
- pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
- data = 0;
- }
- break;
- }
- *pdata = data;
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_get_msr_common);
-
-/*
- * Read or write a bunch of msrs. All parameters are kernel addresses.
- *
- * @return number of msrs set successfully.
- */
-static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
- struct kvm_msr_entry *entries,
- int (*do_msr)(struct kvm_vcpu *vcpu,
- unsigned index, u64 *data))
-{
- int i, idx;
-
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- for (i = 0; i < msrs->nmsrs; ++i)
- if (do_msr(vcpu, entries[i].index, &entries[i].data))
- break;
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
-
- return i;
-}
-
-/*
- * Read or write a bunch of msrs. Parameters are user addresses.
- *
- * @return number of msrs set successfully.
- */
-static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
- int (*do_msr)(struct kvm_vcpu *vcpu,
- unsigned index, u64 *data),
- int writeback)
-{
- struct kvm_msrs msrs;
- struct kvm_msr_entry *entries;
- int r, n;
- unsigned size;
-
- r = -EFAULT;
- if (copy_from_user(&msrs, user_msrs, sizeof msrs))
- goto out;
-
- r = -E2BIG;
- if (msrs.nmsrs >= MAX_IO_MSRS)
- goto out;
-
- size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
- entries = memdup_user(user_msrs->entries, size);
- if (IS_ERR(entries)) {
- r = PTR_ERR(entries);
- goto out;
- }
-
- r = n = __msr_io(vcpu, &msrs, entries, do_msr);
- if (r < 0)
- goto out_free;
-
- r = -EFAULT;
- if (writeback && copy_to_user(user_msrs->entries, entries, size))
- goto out_free;
-
- r = n;
-
-out_free:
- kfree(entries);
-out:
- return r;
-}
-
-int kvm_dev_ioctl_check_extension(long ext)
-{
- int r;
-
- switch (ext) {
- case KVM_CAP_IRQCHIP:
- case KVM_CAP_HLT:
- case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
- case KVM_CAP_SET_TSS_ADDR:
- case KVM_CAP_EXT_CPUID:
- case KVM_CAP_CLOCKSOURCE:
- case KVM_CAP_PIT:
- case KVM_CAP_NOP_IO_DELAY:
- case KVM_CAP_MP_STATE:
- case KVM_CAP_SYNC_MMU:
- case KVM_CAP_USER_NMI:
- case KVM_CAP_REINJECT_CONTROL:
- case KVM_CAP_IRQ_INJECT_STATUS:
- case KVM_CAP_ASSIGN_DEV_IRQ:
- case KVM_CAP_IRQFD:
- case KVM_CAP_IOEVENTFD:
- case KVM_CAP_PIT2:
- case KVM_CAP_PIT_STATE2:
- case KVM_CAP_SET_IDENTITY_MAP_ADDR:
- case KVM_CAP_XEN_HVM:
- case KVM_CAP_ADJUST_CLOCK:
- case KVM_CAP_VCPU_EVENTS:
- case KVM_CAP_HYPERV:
- case KVM_CAP_HYPERV_VAPIC:
- case KVM_CAP_HYPERV_SPIN:
- case KVM_CAP_PCI_SEGMENT:
- case KVM_CAP_DEBUGREGS:
- case KVM_CAP_X86_ROBUST_SINGLESTEP:
- case KVM_CAP_XSAVE:
- case KVM_CAP_ASYNC_PF:
- case KVM_CAP_GET_TSC_KHZ:
- case KVM_CAP_PCI_2_3:
- r = 1;
- break;
- case KVM_CAP_COALESCED_MMIO:
- r = KVM_COALESCED_MMIO_PAGE_OFFSET;
- break;
- case KVM_CAP_VAPIC:
- r = !kvm_x86_ops->cpu_has_accelerated_tpr();
- break;
- case KVM_CAP_NR_VCPUS:
- r = KVM_SOFT_MAX_VCPUS;
- break;
- case KVM_CAP_MAX_VCPUS:
- r = KVM_MAX_VCPUS;
- break;
- case KVM_CAP_NR_MEMSLOTS:
- r = KVM_MEMORY_SLOTS;
- break;
- case KVM_CAP_PV_MMU: /* obsolete */
- r = 0;
- break;
- case KVM_CAP_IOMMU:
- r = iommu_present(&pci_bus_type);
- break;
- case KVM_CAP_MCE:
- r = KVM_MAX_MCE_BANKS;
- break;
- case KVM_CAP_XCRS:
- r = cpu_has_xsave;
- break;
- case KVM_CAP_TSC_CONTROL:
- r = kvm_has_tsc_control;
- break;
- case KVM_CAP_TSC_DEADLINE_TIMER:
- r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
- break;
- default:
- r = 0;
- break;
- }
- return r;
-
-}
-
-long kvm_arch_dev_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
-{
- void __user *argp = (void __user *)arg;
- long r;
-
- switch (ioctl) {
- case KVM_GET_MSR_INDEX_LIST: {
- struct kvm_msr_list __user *user_msr_list = argp;
- struct kvm_msr_list msr_list;
- unsigned n;
-
- r = -EFAULT;
- if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
- goto out;
- n = msr_list.nmsrs;
- msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
- if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
- goto out;
- r = -E2BIG;
- if (n < msr_list.nmsrs)
- goto out;
- r = -EFAULT;
- if (copy_to_user(user_msr_list->indices, &msrs_to_save,
- num_msrs_to_save * sizeof(u32)))
- goto out;
- if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
- &emulated_msrs,
- ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
- goto out;
- r = 0;
- break;
- }
- case KVM_GET_SUPPORTED_CPUID: {
- struct kvm_cpuid2 __user *cpuid_arg = argp;
- struct kvm_cpuid2 cpuid;
-
- r = -EFAULT;
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
- goto out;
- r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
- cpuid_arg->entries);
- if (r)
- goto out;
-
- r = -EFAULT;
- if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
- goto out;
- r = 0;
- break;
- }
- case KVM_X86_GET_MCE_CAP_SUPPORTED: {
- u64 mce_cap;
-
- mce_cap = KVM_MCE_CAP_SUPPORTED;
- r = -EFAULT;
- if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
- goto out;
- r = 0;
- break;
- }
- default:
- r = -EINVAL;
- }
-out:
- return r;
-}
-
-static void wbinvd_ipi(void *garbage)
-{
- wbinvd();
-}
-
-static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
-{
- return vcpu->kvm->arch.iommu_domain &&
- !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
-}
-
-void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
- /* Address WBINVD may be executed by guest */
- if (need_emulate_wbinvd(vcpu)) {
- if (kvm_x86_ops->has_wbinvd_exit())
- cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
- else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
- smp_call_function_single(vcpu->cpu,
- wbinvd_ipi, NULL, 1);
- }
-
- kvm_x86_ops->vcpu_load(vcpu, cpu);
-
- /* Apply any externally detected TSC adjustments (due to suspend) */
- if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
- adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
- vcpu->arch.tsc_offset_adjustment = 0;
- set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
- }
-
- if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
- s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
- native_read_tsc() - vcpu->arch.last_host_tsc;
- if (tsc_delta < 0)
- mark_tsc_unstable("KVM discovered backwards TSC");
- if (check_tsc_unstable()) {
- u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
- vcpu->arch.last_guest_tsc);
- kvm_x86_ops->write_tsc_offset(vcpu, offset);
- vcpu->arch.tsc_catchup = 1;
- }
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
- if (vcpu->cpu != cpu)
- kvm_migrate_timers(vcpu);
- vcpu->cpu = cpu;
- }
-
- accumulate_steal_time(vcpu);
- kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
-}
-
-void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
-{
- kvm_x86_ops->vcpu_put(vcpu);
- kvm_put_guest_fpu(vcpu);
- vcpu->arch.last_host_tsc = native_read_tsc();
-}
-
-static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
- struct kvm_lapic_state *s)
-{
- memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
-
- return 0;
-}
-
-static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
- struct kvm_lapic_state *s)
-{
- memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
- kvm_apic_post_state_restore(vcpu);
- update_cr8_intercept(vcpu);
-
- return 0;
-}
-
-static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
- struct kvm_interrupt *irq)
-{
- if (irq->irq < 0 || irq->irq >= 256)
- return -EINVAL;
- if (irqchip_in_kernel(vcpu->kvm))
- return -ENXIO;
-
- kvm_queue_interrupt(vcpu, irq->irq, false);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-
- return 0;
-}
-
-static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
-{
- kvm_inject_nmi(vcpu);
-
- return 0;
-}
-
-static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
- struct kvm_tpr_access_ctl *tac)
-{
- if (tac->flags)
- return -EINVAL;
- vcpu->arch.tpr_access_reporting = !!tac->enabled;
- return 0;
-}
-
-static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
- u64 mcg_cap)
-{
- int r;
- unsigned bank_num = mcg_cap & 0xff, bank;
-
- r = -EINVAL;
- if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
- goto out;
- if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
- goto out;
- r = 0;
- vcpu->arch.mcg_cap = mcg_cap;
- /* Init IA32_MCG_CTL to all 1s */
- if (mcg_cap & MCG_CTL_P)
- vcpu->arch.mcg_ctl = ~(u64)0;
- /* Init IA32_MCi_CTL to all 1s */
- for (bank = 0; bank < bank_num; bank++)
- vcpu->arch.mce_banks[bank*4] = ~(u64)0;
-out:
- return r;
-}
-
-static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
- struct kvm_x86_mce *mce)
-{
- u64 mcg_cap = vcpu->arch.mcg_cap;
- unsigned bank_num = mcg_cap & 0xff;
- u64 *banks = vcpu->arch.mce_banks;
-
- if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
- return -EINVAL;
- /*
- * if IA32_MCG_CTL is not all 1s, the uncorrected error
- * reporting is disabled
- */
- if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
- vcpu->arch.mcg_ctl != ~(u64)0)
- return 0;
- banks += 4 * mce->bank;
- /*
- * if IA32_MCi_CTL is not all 1s, the uncorrected error
- * reporting is disabled for the bank
- */
- if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
- return 0;
- if (mce->status & MCI_STATUS_UC) {
- if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
- !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
- return 0;
- }
- if (banks[1] & MCI_STATUS_VAL)
- mce->status |= MCI_STATUS_OVER;
- banks[2] = mce->addr;
- banks[3] = mce->misc;
- vcpu->arch.mcg_status = mce->mcg_status;
- banks[1] = mce->status;
- kvm_queue_exception(vcpu, MC_VECTOR);
- } else if (!(banks[1] & MCI_STATUS_VAL)
- || !(banks[1] & MCI_STATUS_UC)) {
- if (banks[1] & MCI_STATUS_VAL)
- mce->status |= MCI_STATUS_OVER;
- banks[2] = mce->addr;
- banks[3] = mce->misc;
- banks[1] = mce->status;
- } else
- banks[1] |= MCI_STATUS_OVER;
- return 0;
-}
-
-static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
- struct kvm_vcpu_events *events)
-{
- process_nmi(vcpu);
- events->exception.injected =
- vcpu->arch.exception.pending &&
- !kvm_exception_is_soft(vcpu->arch.exception.nr);
- events->exception.nr = vcpu->arch.exception.nr;
- events->exception.has_error_code = vcpu->arch.exception.has_error_code;
- events->exception.pad = 0;
- events->exception.error_code = vcpu->arch.exception.error_code;
-
- events->interrupt.injected =
- vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
- events->interrupt.nr = vcpu->arch.interrupt.nr;
- events->interrupt.soft = 0;
- events->interrupt.shadow =
- kvm_x86_ops->get_interrupt_shadow(vcpu,
- KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
-
- events->nmi.injected = vcpu->arch.nmi_injected;
- events->nmi.pending = vcpu->arch.nmi_pending != 0;
- events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
- events->nmi.pad = 0;
-
- events->sipi_vector = vcpu->arch.sipi_vector;
-
- events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
- | KVM_VCPUEVENT_VALID_SIPI_VECTOR
- | KVM_VCPUEVENT_VALID_SHADOW);
- memset(&events->reserved, 0, sizeof(events->reserved));
-}
-
-static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
- struct kvm_vcpu_events *events)
-{
- if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
- | KVM_VCPUEVENT_VALID_SIPI_VECTOR
- | KVM_VCPUEVENT_VALID_SHADOW))
- return -EINVAL;
-
- process_nmi(vcpu);
- vcpu->arch.exception.pending = events->exception.injected;
- vcpu->arch.exception.nr = events->exception.nr;
- vcpu->arch.exception.has_error_code = events->exception.has_error_code;
- vcpu->arch.exception.error_code = events->exception.error_code;
-
- vcpu->arch.interrupt.pending = events->interrupt.injected;
- vcpu->arch.interrupt.nr = events->interrupt.nr;
- vcpu->arch.interrupt.soft = events->interrupt.soft;
- if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
- kvm_x86_ops->set_interrupt_shadow(vcpu,
- events->interrupt.shadow);
-
- vcpu->arch.nmi_injected = events->nmi.injected;
- if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
- vcpu->arch.nmi_pending = events->nmi.pending;
- kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
-
- if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
- vcpu->arch.sipi_vector = events->sipi_vector;
-
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-
- return 0;
-}
-
-static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
- struct kvm_debugregs *dbgregs)
-{
- memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
- dbgregs->dr6 = vcpu->arch.dr6;
- dbgregs->dr7 = vcpu->arch.dr7;
- dbgregs->flags = 0;
- memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
-}
-
-static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
- struct kvm_debugregs *dbgregs)
-{
- if (dbgregs->flags)
- return -EINVAL;
-
- memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
- vcpu->arch.dr6 = dbgregs->dr6;
- vcpu->arch.dr7 = dbgregs->dr7;
-
- return 0;
-}
-
-static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
- struct kvm_xsave *guest_xsave)
-{
- if (cpu_has_xsave)
- memcpy(guest_xsave->region,
- &vcpu->arch.guest_fpu.state->xsave,
- xstate_size);
- else {
- memcpy(guest_xsave->region,
- &vcpu->arch.guest_fpu.state->fxsave,
- sizeof(struct i387_fxsave_struct));
- *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
- XSTATE_FPSSE;
- }
-}
-
-static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
- struct kvm_xsave *guest_xsave)
-{
- u64 xstate_bv =
- *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
-
- if (cpu_has_xsave)
- memcpy(&vcpu->arch.guest_fpu.state->xsave,
- guest_xsave->region, xstate_size);
- else {
- if (xstate_bv & ~XSTATE_FPSSE)
- return -EINVAL;
- memcpy(&vcpu->arch.guest_fpu.state->fxsave,
- guest_xsave->region, sizeof(struct i387_fxsave_struct));
- }
- return 0;
-}
-
-static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
- struct kvm_xcrs *guest_xcrs)
-{
- if (!cpu_has_xsave) {
- guest_xcrs->nr_xcrs = 0;
- return;
- }
-
- guest_xcrs->nr_xcrs = 1;
- guest_xcrs->flags = 0;
- guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
- guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
-}
-
-static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
- struct kvm_xcrs *guest_xcrs)
-{
- int i, r = 0;
-
- if (!cpu_has_xsave)
- return -EINVAL;
-
- if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
- return -EINVAL;
-
- for (i = 0; i < guest_xcrs->nr_xcrs; i++)
- /* Only support XCR0 currently */
- if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
- r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
- guest_xcrs->xcrs[0].value);
- break;
- }
- if (r)
- r = -EINVAL;
- return r;
-}
-
-long kvm_arch_vcpu_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
-{
- struct kvm_vcpu *vcpu = filp->private_data;
- void __user *argp = (void __user *)arg;
- int r;
- union {
- struct kvm_lapic_state *lapic;
- struct kvm_xsave *xsave;
- struct kvm_xcrs *xcrs;
- void *buffer;
- } u;
-
- u.buffer = NULL;
- switch (ioctl) {
- case KVM_GET_LAPIC: {
- r = -EINVAL;
- if (!vcpu->arch.apic)
- goto out;
- u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
-
- r = -ENOMEM;
- if (!u.lapic)
- goto out;
- r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_LAPIC: {
- r = -EINVAL;
- if (!vcpu->arch.apic)
- goto out;
- u.lapic = memdup_user(argp, sizeof(*u.lapic));
- if (IS_ERR(u.lapic)) {
- r = PTR_ERR(u.lapic);
- goto out;
- }
-
- r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_INTERRUPT: {
- struct kvm_interrupt irq;
-
- r = -EFAULT;
- if (copy_from_user(&irq, argp, sizeof irq))
- goto out;
- r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_NMI: {
- r = kvm_vcpu_ioctl_nmi(vcpu);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_CPUID: {
- struct kvm_cpuid __user *cpuid_arg = argp;
- struct kvm_cpuid cpuid;
-
- r = -EFAULT;
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
- goto out;
- r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
- if (r)
- goto out;
- break;
- }
- case KVM_SET_CPUID2: {
- struct kvm_cpuid2 __user *cpuid_arg = argp;
- struct kvm_cpuid2 cpuid;
-
- r = -EFAULT;
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
- goto out;
- r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
- cpuid_arg->entries);
- if (r)
- goto out;
- break;
- }
- case KVM_GET_CPUID2: {
- struct kvm_cpuid2 __user *cpuid_arg = argp;
- struct kvm_cpuid2 cpuid;
-
- r = -EFAULT;
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
- goto out;
- r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
- cpuid_arg->entries);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
- goto out;
- r = 0;
- break;
- }
- case KVM_GET_MSRS:
- r = msr_io(vcpu, argp, kvm_get_msr, 1);
- break;
- case KVM_SET_MSRS:
- r = msr_io(vcpu, argp, do_set_msr, 0);
- break;
- case KVM_TPR_ACCESS_REPORTING: {
- struct kvm_tpr_access_ctl tac;
-
- r = -EFAULT;
- if (copy_from_user(&tac, argp, sizeof tac))
- goto out;
- r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &tac, sizeof tac))
- goto out;
- r = 0;
- break;
- };
- case KVM_SET_VAPIC_ADDR: {
- struct kvm_vapic_addr va;
-
- r = -EINVAL;
- if (!irqchip_in_kernel(vcpu->kvm))
- goto out;
- r = -EFAULT;
- if (copy_from_user(&va, argp, sizeof va))
- goto out;
- r = 0;
- kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
- break;
- }
- case KVM_X86_SETUP_MCE: {
- u64 mcg_cap;
-
- r = -EFAULT;
- if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
- goto out;
- r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
- break;
- }
- case KVM_X86_SET_MCE: {
- struct kvm_x86_mce mce;
-
- r = -EFAULT;
- if (copy_from_user(&mce, argp, sizeof mce))
- goto out;
- r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
- break;
- }
- case KVM_GET_VCPU_EVENTS: {
- struct kvm_vcpu_events events;
-
- kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
-
- r = -EFAULT;
- if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
- break;
- r = 0;
- break;
- }
- case KVM_SET_VCPU_EVENTS: {
- struct kvm_vcpu_events events;
-
- r = -EFAULT;
- if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
- break;
-
- r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
- break;
- }
- case KVM_GET_DEBUGREGS: {
- struct kvm_debugregs dbgregs;
-
- kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
-
- r = -EFAULT;
- if (copy_to_user(argp, &dbgregs,
- sizeof(struct kvm_debugregs)))
- break;
- r = 0;
- break;
- }
- case KVM_SET_DEBUGREGS: {
- struct kvm_debugregs dbgregs;
-
- r = -EFAULT;
- if (copy_from_user(&dbgregs, argp,
- sizeof(struct kvm_debugregs)))
- break;
-
- r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
- break;
- }
- case KVM_GET_XSAVE: {
- u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
- r = -ENOMEM;
- if (!u.xsave)
- break;
-
- kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
-
- r = -EFAULT;
- if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
- break;
- r = 0;
- break;
- }
- case KVM_SET_XSAVE: {
- u.xsave = memdup_user(argp, sizeof(*u.xsave));
- if (IS_ERR(u.xsave)) {
- r = PTR_ERR(u.xsave);
- goto out;
- }
-
- r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
- break;
- }
- case KVM_GET_XCRS: {
- u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
- r = -ENOMEM;
- if (!u.xcrs)
- break;
-
- kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
-
- r = -EFAULT;
- if (copy_to_user(argp, u.xcrs,
- sizeof(struct kvm_xcrs)))
- break;
- r = 0;
- break;
- }
- case KVM_SET_XCRS: {
- u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
- if (IS_ERR(u.xcrs)) {
- r = PTR_ERR(u.xcrs);
- goto out;
- }
-
- r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
- break;
- }
- case KVM_SET_TSC_KHZ: {
- u32 user_tsc_khz;
-
- r = -EINVAL;
- user_tsc_khz = (u32)arg;
-
- if (user_tsc_khz >= kvm_max_guest_tsc_khz)
- goto out;
-
- if (user_tsc_khz == 0)
- user_tsc_khz = tsc_khz;
-
- kvm_set_tsc_khz(vcpu, user_tsc_khz);
-
- r = 0;
- goto out;
- }
- case KVM_GET_TSC_KHZ: {
- r = vcpu->arch.virtual_tsc_khz;
- goto out;
- }
- default:
- r = -EINVAL;
- }
-out:
- kfree(u.buffer);
- return r;
-}
-
-int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
-{
- return VM_FAULT_SIGBUS;
-}
-
-static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
-{
- int ret;
-
- if (addr > (unsigned int)(-3 * PAGE_SIZE))
- return -1;
- ret = kvm_x86_ops->set_tss_addr(kvm, addr);
- return ret;
-}
-
-static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
- u64 ident_addr)
-{
- kvm->arch.ept_identity_map_addr = ident_addr;
- return 0;
-}
-
-static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
- u32 kvm_nr_mmu_pages)
-{
- if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
- return -EINVAL;
-
- mutex_lock(&kvm->slots_lock);
- spin_lock(&kvm->mmu_lock);
-
- kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
- kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
-
- spin_unlock(&kvm->mmu_lock);
- mutex_unlock(&kvm->slots_lock);
- return 0;
-}
-
-static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
-{
- return kvm->arch.n_max_mmu_pages;
-}
-
-static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
- int r;
-
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- memcpy(&chip->chip.pic,
- &pic_irqchip(kvm)->pics[0],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- memcpy(&chip->chip.pic,
- &pic_irqchip(kvm)->pics[1],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_IOAPIC:
- r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
- break;
- default:
- r = -EINVAL;
- break;
- }
- return r;
-}
-
-static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
- int r;
-
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- spin_lock(&pic_irqchip(kvm)->lock);
- memcpy(&pic_irqchip(kvm)->pics[0],
- &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- spin_unlock(&pic_irqchip(kvm)->lock);
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- spin_lock(&pic_irqchip(kvm)->lock);
- memcpy(&pic_irqchip(kvm)->pics[1],
- &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- spin_unlock(&pic_irqchip(kvm)->lock);
- break;
- case KVM_IRQCHIP_IOAPIC:
- r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
- break;
- default:
- r = -EINVAL;
- break;
- }
- kvm_pic_update_irq(pic_irqchip(kvm));
- return r;
-}
-
-static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
-{
- int r = 0;
-
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- return r;
-}
-
-static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
-{
- int r = 0;
-
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
- kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- return r;
-}
-
-static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
-{
- int r = 0;
-
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
- sizeof(ps->channels));
- ps->flags = kvm->arch.vpit->pit_state.flags;
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- memset(&ps->reserved, 0, sizeof(ps->reserved));
- return r;
-}
-
-static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
-{
- int r = 0, start = 0;
- u32 prev_legacy, cur_legacy;
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
- cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
- if (!prev_legacy && cur_legacy)
- start = 1;
- memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
- sizeof(kvm->arch.vpit->pit_state.channels));
- kvm->arch.vpit->pit_state.flags = ps->flags;
- kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- return r;
-}
-
-static int kvm_vm_ioctl_reinject(struct kvm *kvm,
- struct kvm_reinject_control *control)
-{
- if (!kvm->arch.vpit)
- return -ENXIO;
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- return 0;
-}
-
-/**
- * write_protect_slot - write protect a slot for dirty logging
- * @kvm: the kvm instance
- * @memslot: the slot we protect
- * @dirty_bitmap: the bitmap indicating which pages are dirty
- * @nr_dirty_pages: the number of dirty pages
- *
- * We have two ways to find all sptes to protect:
- * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and
- * checks ones that have a spte mapping a page in the slot.
- * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap.
- *
- * Generally speaking, if there are not so many dirty pages compared to the
- * number of shadow pages, we should use the latter.
- *
- * Note that letting others write into a page marked dirty in the old bitmap
- * by using the remaining tlb entry is not a problem. That page will become
- * write protected again when we flush the tlb and then be reported dirty to
- * the user space by copying the old bitmap.
- */
-static void write_protect_slot(struct kvm *kvm,
- struct kvm_memory_slot *memslot,
- unsigned long *dirty_bitmap,
- unsigned long nr_dirty_pages)
-{
- spin_lock(&kvm->mmu_lock);
-
- /* Not many dirty pages compared to # of shadow pages. */
- if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
- unsigned long gfn_offset;
-
- for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
- unsigned long gfn = memslot->base_gfn + gfn_offset;
-
- kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
- }
- kvm_flush_remote_tlbs(kvm);
- } else
- kvm_mmu_slot_remove_write_access(kvm, memslot->id);
-
- spin_unlock(&kvm->mmu_lock);
-}
-
-/*
- * Get (and clear) the dirty memory log for a memory slot.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
- struct kvm_dirty_log *log)
-{
- int r;
- struct kvm_memory_slot *memslot;
- unsigned long n, nr_dirty_pages;
-
- mutex_lock(&kvm->slots_lock);
-
- r = -EINVAL;
- if (log->slot >= KVM_MEMORY_SLOTS)
- goto out;
-
- memslot = id_to_memslot(kvm->memslots, log->slot);
- r = -ENOENT;
- if (!memslot->dirty_bitmap)
- goto out;
-
- n = kvm_dirty_bitmap_bytes(memslot);
- nr_dirty_pages = memslot->nr_dirty_pages;
-
- /* If nothing is dirty, don't bother messing with page tables. */
- if (nr_dirty_pages) {
- struct kvm_memslots *slots, *old_slots;
- unsigned long *dirty_bitmap, *dirty_bitmap_head;
-
- dirty_bitmap = memslot->dirty_bitmap;
- dirty_bitmap_head = memslot->dirty_bitmap_head;
- if (dirty_bitmap == dirty_bitmap_head)
- dirty_bitmap_head += n / sizeof(long);
- memset(dirty_bitmap_head, 0, n);
-
- r = -ENOMEM;
- slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL);
- if (!slots)
- goto out;
-
- memslot = id_to_memslot(slots, log->slot);
- memslot->nr_dirty_pages = 0;
- memslot->dirty_bitmap = dirty_bitmap_head;
- update_memslots(slots, NULL);
-
- old_slots = kvm->memslots;
- rcu_assign_pointer(kvm->memslots, slots);
- synchronize_srcu_expedited(&kvm->srcu);
- kfree(old_slots);
-
- write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages);
-
- r = -EFAULT;
- if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
- goto out;
- } else {
- r = -EFAULT;
- if (clear_user(log->dirty_bitmap, n))
- goto out;
- }
-
- r = 0;
-out:
- mutex_unlock(&kvm->slots_lock);
- return r;
-}
-
-long kvm_arch_vm_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
-{
- struct kvm *kvm = filp->private_data;
- void __user *argp = (void __user *)arg;
- int r = -ENOTTY;
- /*
- * This union makes it completely explicit to gcc-3.x
- * that these two variables' stack usage should be
- * combined, not added together.
- */
- union {
- struct kvm_pit_state ps;
- struct kvm_pit_state2 ps2;
- struct kvm_pit_config pit_config;
- } u;
-
- switch (ioctl) {
- case KVM_SET_TSS_ADDR:
- r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
- if (r < 0)
- goto out;
- break;
- case KVM_SET_IDENTITY_MAP_ADDR: {
- u64 ident_addr;
-
- r = -EFAULT;
- if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
- goto out;
- r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
- if (r < 0)
- goto out;
- break;
- }
- case KVM_SET_NR_MMU_PAGES:
- r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
- if (r)
- goto out;
- break;
- case KVM_GET_NR_MMU_PAGES:
- r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
- break;
- case KVM_CREATE_IRQCHIP: {
- struct kvm_pic *vpic;
-
- mutex_lock(&kvm->lock);
- r = -EEXIST;
- if (kvm->arch.vpic)
- goto create_irqchip_unlock;
- r = -EINVAL;
- if (atomic_read(&kvm->online_vcpus))
- goto create_irqchip_unlock;
- r = -ENOMEM;
- vpic = kvm_create_pic(kvm);
- if (vpic) {
- r = kvm_ioapic_init(kvm);
- if (r) {
- mutex_lock(&kvm->slots_lock);
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
- &vpic->dev_master);
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
- &vpic->dev_slave);
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
- &vpic->dev_eclr);
- mutex_unlock(&kvm->slots_lock);
- kfree(vpic);
- goto create_irqchip_unlock;
- }
- } else
- goto create_irqchip_unlock;
- smp_wmb();
- kvm->arch.vpic = vpic;
- smp_wmb();
- r = kvm_setup_default_irq_routing(kvm);
- if (r) {
- mutex_lock(&kvm->slots_lock);
- mutex_lock(&kvm->irq_lock);
- kvm_ioapic_destroy(kvm);
- kvm_destroy_pic(kvm);
- mutex_unlock(&kvm->irq_lock);
- mutex_unlock(&kvm->slots_lock);
- }
- create_irqchip_unlock:
- mutex_unlock(&kvm->lock);
- break;
- }
- case KVM_CREATE_PIT:
- u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
- goto create_pit;
- case KVM_CREATE_PIT2:
- r = -EFAULT;
- if (copy_from_user(&u.pit_config, argp,
- sizeof(struct kvm_pit_config)))
- goto out;
- create_pit:
- mutex_lock(&kvm->slots_lock);
- r = -EEXIST;
- if (kvm->arch.vpit)
- goto create_pit_unlock;
- r = -ENOMEM;
- kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
- if (kvm->arch.vpit)
- r = 0;
- create_pit_unlock:
- mutex_unlock(&kvm->slots_lock);
- break;
- case KVM_IRQ_LINE_STATUS:
- case KVM_IRQ_LINE: {
- struct kvm_irq_level irq_event;
-
- r = -EFAULT;
- if (copy_from_user(&irq_event, argp, sizeof irq_event))
- goto out;
- r = -ENXIO;
- if (irqchip_in_kernel(kvm)) {
- __s32 status;
- status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
- irq_event.irq, irq_event.level);
- if (ioctl == KVM_IRQ_LINE_STATUS) {
- r = -EFAULT;
- irq_event.status = status;
- if (copy_to_user(argp, &irq_event,
- sizeof irq_event))
- goto out;
- }
- r = 0;
- }
- break;
- }
- case KVM_GET_IRQCHIP: {
- /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip *chip;
-
- chip = memdup_user(argp, sizeof(*chip));
- if (IS_ERR(chip)) {
- r = PTR_ERR(chip);
- goto out;
- }
-
- r = -ENXIO;
- if (!irqchip_in_kernel(kvm))
- goto get_irqchip_out;
- r = kvm_vm_ioctl_get_irqchip(kvm, chip);
- if (r)
- goto get_irqchip_out;
- r = -EFAULT;
- if (copy_to_user(argp, chip, sizeof *chip))
- goto get_irqchip_out;
- r = 0;
- get_irqchip_out:
- kfree(chip);
- if (r)
- goto out;
- break;
- }
- case KVM_SET_IRQCHIP: {
- /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip *chip;
-
- chip = memdup_user(argp, sizeof(*chip));
- if (IS_ERR(chip)) {
- r = PTR_ERR(chip);
- goto out;
- }
-
- r = -ENXIO;
- if (!irqchip_in_kernel(kvm))
- goto set_irqchip_out;
- r = kvm_vm_ioctl_set_irqchip(kvm, chip);
- if (r)
- goto set_irqchip_out;
- r = 0;
- set_irqchip_out:
- kfree(chip);
- if (r)
- goto out;
- break;
- }
- case KVM_GET_PIT: {
- r = -EFAULT;
- if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
- goto out;
- r = -ENXIO;
- if (!kvm->arch.vpit)
- goto out;
- r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_PIT: {
- r = -EFAULT;
- if (copy_from_user(&u.ps, argp, sizeof u.ps))
- goto out;
- r = -ENXIO;
- if (!kvm->arch.vpit)
- goto out;
- r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_GET_PIT2: {
- r = -ENXIO;
- if (!kvm->arch.vpit)
- goto out;
- r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_PIT2: {
- r = -EFAULT;
- if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
- goto out;
- r = -ENXIO;
- if (!kvm->arch.vpit)
- goto out;
- r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_REINJECT_CONTROL: {
- struct kvm_reinject_control control;
- r = -EFAULT;
- if (copy_from_user(&control, argp, sizeof(control)))
- goto out;
- r = kvm_vm_ioctl_reinject(kvm, &control);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_XEN_HVM_CONFIG: {
- r = -EFAULT;
- if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
- sizeof(struct kvm_xen_hvm_config)))
- goto out;
- r = -EINVAL;
- if (kvm->arch.xen_hvm_config.flags)
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_CLOCK: {
- struct kvm_clock_data user_ns;
- u64 now_ns;
- s64 delta;
-
- r = -EFAULT;
- if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
- goto out;
-
- r = -EINVAL;
- if (user_ns.flags)
- goto out;
-
- r = 0;
- local_irq_disable();
- now_ns = get_kernel_ns();
- delta = user_ns.clock - now_ns;
- local_irq_enable();
- kvm->arch.kvmclock_offset = delta;
- break;
- }
- case KVM_GET_CLOCK: {
- struct kvm_clock_data user_ns;
- u64 now_ns;
-
- local_irq_disable();
- now_ns = get_kernel_ns();
- user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
- local_irq_enable();
- user_ns.flags = 0;
- memset(&user_ns.pad, 0, sizeof(user_ns.pad));
-
- r = -EFAULT;
- if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
- goto out;
- r = 0;
- break;
- }
-
- default:
- ;
- }
-out:
- return r;
-}
-
-static void kvm_init_msr_list(void)
-{
- u32 dummy[2];
- unsigned i, j;
-
- /* skip the first msrs in the list. KVM-specific */
- for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
- if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
- continue;
- if (j < i)
- msrs_to_save[j] = msrs_to_save[i];
- j++;
- }
- num_msrs_to_save = j;
-}
-
-static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
- const void *v)
-{
- int handled = 0;
- int n;
-
- do {
- n = min(len, 8);
- if (!(vcpu->arch.apic &&
- !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v))
- && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
- break;
- handled += n;
- addr += n;
- len -= n;
- v += n;
- } while (len);
-
- return handled;
-}
-
-static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
-{
- int handled = 0;
- int n;
-
- do {
- n = min(len, 8);
- if (!(vcpu->arch.apic &&
- !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v))
- && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
- break;
- trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
- handled += n;
- addr += n;
- len -= n;
- v += n;
- } while (len);
-
- return handled;
-}
-
-static void kvm_set_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- kvm_x86_ops->set_segment(vcpu, var, seg);
-}
-
-void kvm_get_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- kvm_x86_ops->get_segment(vcpu, var, seg);
-}
-
-gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
-{
- gpa_t t_gpa;
- struct x86_exception exception;
-
- BUG_ON(!mmu_is_nested(vcpu));
-
- /* NPT walks are always user-walks */
- access |= PFERR_USER_MASK;
- t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
-
- return t_gpa;
-}
-
-gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception)
-{
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
-}
-
- gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception)
-{
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
- access |= PFERR_FETCH_MASK;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
-}
-
-gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception)
-{
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
- access |= PFERR_WRITE_MASK;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
-}
-
-/* uses this to access any guest's mapped memory without checking CPL */
-gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception)
-{
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
-}
-
-static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu, u32 access,
- struct x86_exception *exception)
-{
- void *data = val;
- int r = X86EMUL_CONTINUE;
-
- while (bytes) {
- gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
- exception);
- unsigned offset = addr & (PAGE_SIZE-1);
- unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
- int ret;
-
- if (gpa == UNMAPPED_GVA)
- return X86EMUL_PROPAGATE_FAULT;
- ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
- if (ret < 0) {
- r = X86EMUL_IO_NEEDED;
- goto out;
- }
-
- bytes -= toread;
- data += toread;
- addr += toread;
- }
-out:
- return r;
-}
-
-/* used for instruction fetching */
-static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
- gva_t addr, void *val, unsigned int bytes,
- struct x86_exception *exception)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
-
- return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
- access | PFERR_FETCH_MASK,
- exception);
-}
-
-int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
- gva_t addr, void *val, unsigned int bytes,
- struct x86_exception *exception)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
-
- return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
- exception);
-}
-EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
-
-static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
- gva_t addr, void *val, unsigned int bytes,
- struct x86_exception *exception)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
-}
-
-int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
- gva_t addr, void *val,
- unsigned int bytes,
- struct x86_exception *exception)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- void *data = val;
- int r = X86EMUL_CONTINUE;
-
- while (bytes) {
- gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
- PFERR_WRITE_MASK,
- exception);
- unsigned offset = addr & (PAGE_SIZE-1);
- unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
- int ret;
-
- if (gpa == UNMAPPED_GVA)
- return X86EMUL_PROPAGATE_FAULT;
- ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
- if (ret < 0) {
- r = X86EMUL_IO_NEEDED;
- goto out;
- }
-
- bytes -= towrite;
- data += towrite;
- addr += towrite;
- }
-out:
- return r;
-}
-EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
-
-static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
- gpa_t *gpa, struct x86_exception *exception,
- bool write)
-{
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
-
- if (vcpu_match_mmio_gva(vcpu, gva) &&
- check_write_user_access(vcpu, write, access,
- vcpu->arch.access)) {
- *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
- (gva & (PAGE_SIZE - 1));
- trace_vcpu_match_mmio(gva, *gpa, write, false);
- return 1;
- }
-
- if (write)
- access |= PFERR_WRITE_MASK;
-
- *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
-
- if (*gpa == UNMAPPED_GVA)
- return -1;
-
- /* For APIC access vmexit */
- if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
- return 1;
-
- if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
- trace_vcpu_match_mmio(gva, *gpa, write, true);
- return 1;
- }
-
- return 0;
-}
-
-int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
- const void *val, int bytes)
-{
- int ret;
-
- ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
- if (ret < 0)
- return 0;
- kvm_mmu_pte_write(vcpu, gpa, val, bytes);
- return 1;
-}
-
-struct read_write_emulator_ops {
- int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
- int bytes);
- int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
- void *val, int bytes);
- int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
- int bytes, void *val);
- int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
- void *val, int bytes);
- bool write;
-};
-
-static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
-{
- if (vcpu->mmio_read_completed) {
- memcpy(val, vcpu->mmio_data, bytes);
- trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
- vcpu->mmio_phys_addr, *(u64 *)val);
- vcpu->mmio_read_completed = 0;
- return 1;
- }
-
- return 0;
-}
-
-static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
- void *val, int bytes)
-{
- return !kvm_read_guest(vcpu->kvm, gpa, val, bytes);
-}
-
-static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
- void *val, int bytes)
-{
- return emulator_write_phys(vcpu, gpa, val, bytes);
-}
-
-static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
-{
- trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
- return vcpu_mmio_write(vcpu, gpa, bytes, val);
-}
-
-static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
- void *val, int bytes)
-{
- trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
- return X86EMUL_IO_NEEDED;
-}
-
-static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
- void *val, int bytes)
-{
- memcpy(vcpu->mmio_data, val, bytes);
- memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
- return X86EMUL_CONTINUE;
-}
-
-static struct read_write_emulator_ops read_emultor = {
- .read_write_prepare = read_prepare,
- .read_write_emulate = read_emulate,
- .read_write_mmio = vcpu_mmio_read,
- .read_write_exit_mmio = read_exit_mmio,
-};
-
-static struct read_write_emulator_ops write_emultor = {
- .read_write_emulate = write_emulate,
- .read_write_mmio = write_mmio,
- .read_write_exit_mmio = write_exit_mmio,
- .write = true,
-};
-
-static int emulator_read_write_onepage(unsigned long addr, void *val,
- unsigned int bytes,
- struct x86_exception *exception,
- struct kvm_vcpu *vcpu,
- struct read_write_emulator_ops *ops)
-{
- gpa_t gpa;
- int handled, ret;
- bool write = ops->write;
-
- if (ops->read_write_prepare &&
- ops->read_write_prepare(vcpu, val, bytes))
- return X86EMUL_CONTINUE;
-
- ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
-
- if (ret < 0)
- return X86EMUL_PROPAGATE_FAULT;
-
- /* For APIC access vmexit */
- if (ret)
- goto mmio;
-
- if (ops->read_write_emulate(vcpu, gpa, val, bytes))
- return X86EMUL_CONTINUE;
-
-mmio:
- /*
- * Is this MMIO handled locally?
- */
- handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
- if (handled == bytes)
- return X86EMUL_CONTINUE;
-
- gpa += handled;
- bytes -= handled;
- val += handled;
-
- vcpu->mmio_needed = 1;
- vcpu->run->exit_reason = KVM_EXIT_MMIO;
- vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
- vcpu->mmio_size = bytes;
- vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
- vcpu->run->mmio.is_write = vcpu->mmio_is_write = write;
- vcpu->mmio_index = 0;
-
- return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
-}
-
-int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
- void *val, unsigned int bytes,
- struct x86_exception *exception,
- struct read_write_emulator_ops *ops)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-
- /* Crossing a page boundary? */
- if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
- int rc, now;
-
- now = -addr & ~PAGE_MASK;
- rc = emulator_read_write_onepage(addr, val, now, exception,
- vcpu, ops);
-
- if (rc != X86EMUL_CONTINUE)
- return rc;
- addr += now;
- val += now;
- bytes -= now;
- }
-
- return emulator_read_write_onepage(addr, val, bytes, exception,
- vcpu, ops);
-}
-
-static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
- unsigned long addr,
- void *val,
- unsigned int bytes,
- struct x86_exception *exception)
-{
- return emulator_read_write(ctxt, addr, val, bytes,
- exception, &read_emultor);
-}
-
-int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
- unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct x86_exception *exception)
-{
- return emulator_read_write(ctxt, addr, (void *)val, bytes,
- exception, &write_emultor);
-}
-
-#define CMPXCHG_TYPE(t, ptr, old, new) \
- (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
-
-#ifdef CONFIG_X86_64
-# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
-#else
-# define CMPXCHG64(ptr, old, new) \
- (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
-#endif
-
-static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
- unsigned long addr,
- const void *old,
- const void *new,
- unsigned int bytes,
- struct x86_exception *exception)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- gpa_t gpa;
- struct page *page;
- char *kaddr;
- bool exchanged;
-
- /* guests cmpxchg8b have to be emulated atomically */
- if (bytes > 8 || (bytes & (bytes - 1)))
- goto emul_write;
-
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
-
- if (gpa == UNMAPPED_GVA ||
- (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
- goto emul_write;
-
- if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
- goto emul_write;
-
- page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- if (is_error_page(page)) {
- kvm_release_page_clean(page);
- goto emul_write;
- }
-
- kaddr = kmap_atomic(page);
- kaddr += offset_in_page(gpa);
- switch (bytes) {
- case 1:
- exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
- break;
- case 2:
- exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
- break;
- case 4:
- exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
- break;
- case 8:
- exchanged = CMPXCHG64(kaddr, old, new);
- break;
- default:
- BUG();
- }
- kunmap_atomic(kaddr);
- kvm_release_page_dirty(page);
-
- if (!exchanged)
- return X86EMUL_CMPXCHG_FAILED;
-
- kvm_mmu_pte_write(vcpu, gpa, new, bytes);
-
- return X86EMUL_CONTINUE;
-
-emul_write:
- printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
-
- return emulator_write_emulated(ctxt, addr, new, bytes, exception);
-}
-
-static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
-{
- /* TODO: String I/O for in kernel device */
- int r;
-
- if (vcpu->arch.pio.in)
- r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
- vcpu->arch.pio.size, pd);
- else
- r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
- vcpu->arch.pio.port, vcpu->arch.pio.size,
- pd);
- return r;
-}
-
-static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
- unsigned short port, void *val,
- unsigned int count, bool in)
-{
- trace_kvm_pio(!in, port, size, count);
-
- vcpu->arch.pio.port = port;
- vcpu->arch.pio.in = in;
- vcpu->arch.pio.count = count;
- vcpu->arch.pio.size = size;
-
- if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
- vcpu->arch.pio.count = 0;
- return 1;
- }
-
- vcpu->run->exit_reason = KVM_EXIT_IO;
- vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
- vcpu->run->io.size = size;
- vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
- vcpu->run->io.count = count;
- vcpu->run->io.port = port;
-
- return 0;
-}
-
-static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
- int size, unsigned short port, void *val,
- unsigned int count)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- int ret;
-
- if (vcpu->arch.pio.count)
- goto data_avail;
-
- ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
- if (ret) {
-data_avail:
- memcpy(val, vcpu->arch.pio_data, size * count);
- vcpu->arch.pio.count = 0;
- return 1;
- }
-
- return 0;
-}
-
-static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
- int size, unsigned short port,
- const void *val, unsigned int count)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-
- memcpy(vcpu->arch.pio_data, val, size * count);
- return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
-}
-
-static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
- return kvm_x86_ops->get_segment_base(vcpu, seg);
-}
-
-static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
-{
- kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
-}
-
-int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
-{
- if (!need_emulate_wbinvd(vcpu))
- return X86EMUL_CONTINUE;
-
- if (kvm_x86_ops->has_wbinvd_exit()) {
- int cpu = get_cpu();
-
- cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
- smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
- wbinvd_ipi, NULL, 1);
- put_cpu();
- cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
- } else
- wbinvd();
- return X86EMUL_CONTINUE;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
-
-static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
-{
- kvm_emulate_wbinvd(emul_to_vcpu(ctxt));
-}
-
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
-{
- return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
-}
-
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
-{
-
- return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
-}
-
-static u64 mk_cr_64(u64 curr_cr, u32 new_val)
-{
- return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
-}
-
-static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- unsigned long value;
-
- switch (cr) {
- case 0:
- value = kvm_read_cr0(vcpu);
- break;
- case 2:
- value = vcpu->arch.cr2;
- break;
- case 3:
- value = kvm_read_cr3(vcpu);
- break;
- case 4:
- value = kvm_read_cr4(vcpu);
- break;
- case 8:
- value = kvm_get_cr8(vcpu);
- break;
- default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
- return 0;
- }
-
- return value;
-}
-
-static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- int res = 0;
-
- switch (cr) {
- case 0:
- res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
- break;
- case 2:
- vcpu->arch.cr2 = val;
- break;
- case 3:
- res = kvm_set_cr3(vcpu, val);
- break;
- case 4:
- res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
- break;
- case 8:
- res = kvm_set_cr8(vcpu, val);
- break;
- default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
- res = -1;
- }
-
- return res;
-}
-
-static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
-{
- kvm_set_rflags(emul_to_vcpu(ctxt), val);
-}
-
-static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
-{
- return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
-}
-
-static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
-{
- kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
-}
-
-static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
-{
- kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
-}
-
-static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
-{
- kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
-}
-
-static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
-{
- kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
-}
-
-static unsigned long emulator_get_cached_segment_base(
- struct x86_emulate_ctxt *ctxt, int seg)
-{
- return get_segment_base(emul_to_vcpu(ctxt), seg);
-}
-
-static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
- struct desc_struct *desc, u32 *base3,
- int seg)
-{
- struct kvm_segment var;
-
- kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
- *selector = var.selector;
-
- if (var.unusable)
- return false;
-
- if (var.g)
- var.limit >>= 12;
- set_desc_limit(desc, var.limit);
- set_desc_base(desc, (unsigned long)var.base);
-#ifdef CONFIG_X86_64
- if (base3)
- *base3 = var.base >> 32;
-#endif
- desc->type = var.type;
- desc->s = var.s;
- desc->dpl = var.dpl;
- desc->p = var.present;
- desc->avl = var.avl;
- desc->l = var.l;
- desc->d = var.db;
- desc->g = var.g;
-
- return true;
-}
-
-static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
- struct desc_struct *desc, u32 base3,
- int seg)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- struct kvm_segment var;
-
- var.selector = selector;
- var.base = get_desc_base(desc);
-#ifdef CONFIG_X86_64
- var.base |= ((u64)base3) << 32;
-#endif
- var.limit = get_desc_limit(desc);
- if (desc->g)
- var.limit = (var.limit << 12) | 0xfff;
- var.type = desc->type;
- var.present = desc->p;
- var.dpl = desc->dpl;
- var.db = desc->d;
- var.s = desc->s;
- var.l = desc->l;
- var.g = desc->g;
- var.avl = desc->avl;
- var.present = desc->p;
- var.unusable = !var.present;
- var.padding = 0;
-
- kvm_set_segment(vcpu, &var, seg);
- return;
-}
-
-static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
- u32 msr_index, u64 *pdata)
-{
- return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
-}
-
-static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
- u32 msr_index, u64 data)
-{
- return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
-}
-
-static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
- u32 pmc, u64 *pdata)
-{
- return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata);
-}
-
-static void emulator_halt(struct x86_emulate_ctxt *ctxt)
-{
- emul_to_vcpu(ctxt)->arch.halt_request = 1;
-}
-
-static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
-{
- preempt_disable();
- kvm_load_guest_fpu(emul_to_vcpu(ctxt));
- /*
- * CR0.TS may reference the host fpu state, not the guest fpu state,
- * so it may be clear at this point.
- */
- clts();
-}
-
-static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
-{
- preempt_enable();
-}
-
-static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
- struct x86_instruction_info *info,
- enum x86_intercept_stage stage)
-{
- return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
-}
-
-static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
- u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
-{
- struct kvm_cpuid_entry2 *cpuid = NULL;
-
- if (eax && ecx)
- cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt),
- *eax, *ecx);
-
- if (cpuid) {
- *eax = cpuid->eax;
- *ecx = cpuid->ecx;
- if (ebx)
- *ebx = cpuid->ebx;
- if (edx)
- *edx = cpuid->edx;
- return true;
- }
-
- return false;
-}
-
-static struct x86_emulate_ops emulate_ops = {
- .read_std = kvm_read_guest_virt_system,
- .write_std = kvm_write_guest_virt_system,
- .fetch = kvm_fetch_guest_virt,
- .read_emulated = emulator_read_emulated,
- .write_emulated = emulator_write_emulated,
- .cmpxchg_emulated = emulator_cmpxchg_emulated,
- .invlpg = emulator_invlpg,
- .pio_in_emulated = emulator_pio_in_emulated,
- .pio_out_emulated = emulator_pio_out_emulated,
- .get_segment = emulator_get_segment,
- .set_segment = emulator_set_segment,
- .get_cached_segment_base = emulator_get_cached_segment_base,
- .get_gdt = emulator_get_gdt,
- .get_idt = emulator_get_idt,
- .set_gdt = emulator_set_gdt,
- .set_idt = emulator_set_idt,
- .get_cr = emulator_get_cr,
- .set_cr = emulator_set_cr,
- .set_rflags = emulator_set_rflags,
- .cpl = emulator_get_cpl,
- .get_dr = emulator_get_dr,
- .set_dr = emulator_set_dr,
- .set_msr = emulator_set_msr,
- .get_msr = emulator_get_msr,
- .read_pmc = emulator_read_pmc,
- .halt = emulator_halt,
- .wbinvd = emulator_wbinvd,
- .fix_hypercall = emulator_fix_hypercall,
- .get_fpu = emulator_get_fpu,
- .put_fpu = emulator_put_fpu,
- .intercept = emulator_intercept,
- .get_cpuid = emulator_get_cpuid,
-};
-
-static void cache_all_regs(struct kvm_vcpu *vcpu)
-{
- kvm_register_read(vcpu, VCPU_REGS_RAX);
- kvm_register_read(vcpu, VCPU_REGS_RSP);
- kvm_register_read(vcpu, VCPU_REGS_RIP);
- vcpu->arch.regs_dirty = ~0;
-}
-
-static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
-{
- u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
- /*
- * an sti; sti; sequence only disable interrupts for the first
- * instruction. So, if the last instruction, be it emulated or
- * not, left the system with the INT_STI flag enabled, it
- * means that the last instruction is an sti. We should not
- * leave the flag on in this case. The same goes for mov ss
- */
- if (!(int_shadow & mask))
- kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
-}
-
-static void inject_emulated_exception(struct kvm_vcpu *vcpu)
-{
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
- if (ctxt->exception.vector == PF_VECTOR)
- kvm_propagate_fault(vcpu, &ctxt->exception);
- else if (ctxt->exception.error_code_valid)
- kvm_queue_exception_e(vcpu, ctxt->exception.vector,
- ctxt->exception.error_code);
- else
- kvm_queue_exception(vcpu, ctxt->exception.vector);
-}
-
-static void init_decode_cache(struct x86_emulate_ctxt *ctxt,
- const unsigned long *regs)
-{
- memset(&ctxt->twobyte, 0,
- (void *)&ctxt->regs - (void *)&ctxt->twobyte);
- memcpy(ctxt->regs, regs, sizeof(ctxt->regs));
-
- ctxt->fetch.start = 0;
- ctxt->fetch.end = 0;
- ctxt->io_read.pos = 0;
- ctxt->io_read.end = 0;
- ctxt->mem_read.pos = 0;
- ctxt->mem_read.end = 0;
-}
-
-static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
-{
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
- int cs_db, cs_l;
-
- /*
- * TODO: fix emulate.c to use guest_read/write_register
- * instead of direct ->regs accesses, can save hundred cycles
- * on Intel for instructions that don't read/change RSP, for
- * for example.
- */
- cache_all_regs(vcpu);
-
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-
- ctxt->eflags = kvm_get_rflags(vcpu);
- ctxt->eip = kvm_rip_read(vcpu);
- ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
- (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
- cs_l ? X86EMUL_MODE_PROT64 :
- cs_db ? X86EMUL_MODE_PROT32 :
- X86EMUL_MODE_PROT16;
- ctxt->guest_mode = is_guest_mode(vcpu);
-
- init_decode_cache(ctxt, vcpu->arch.regs);
- vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
-}
-
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
-{
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
- int ret;
-
- init_emulate_ctxt(vcpu);
-
- ctxt->op_bytes = 2;
- ctxt->ad_bytes = 2;
- ctxt->_eip = ctxt->eip + inc_eip;
- ret = emulate_int_real(ctxt, irq);
-
- if (ret != X86EMUL_CONTINUE)
- return EMULATE_FAIL;
-
- ctxt->eip = ctxt->_eip;
- memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
- kvm_rip_write(vcpu, ctxt->eip);
- kvm_set_rflags(vcpu, ctxt->eflags);
-
- if (irq == NMI_VECTOR)
- vcpu->arch.nmi_pending = 0;
- else
- vcpu->arch.interrupt.pending = false;
-
- return EMULATE_DONE;
-}
-EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
-
-static int handle_emulation_failure(struct kvm_vcpu *vcpu)
-{
- int r = EMULATE_DONE;
-
- ++vcpu->stat.insn_emulation_fail;
- trace_kvm_emulate_insn_failed(vcpu);
- if (!is_guest_mode(vcpu)) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
- r = EMULATE_FAIL;
- }
- kvm_queue_exception(vcpu, UD_VECTOR);
-
- return r;
-}
-
-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
-{
- gpa_t gpa;
-
- if (tdp_enabled)
- return false;
-
- /*
- * if emulation was due to access to shadowed page table
- * and it failed try to unshadow page and re-entetr the
- * guest to let CPU execute the instruction.
- */
- if (kvm_mmu_unprotect_page_virt(vcpu, gva))
- return true;
-
- gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
-
- if (gpa == UNMAPPED_GVA)
- return true; /* let cpu generate fault */
-
- if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
- return true;
-
- return false;
-}
-
-static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
- unsigned long cr2, int emulation_type)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
-
- last_retry_eip = vcpu->arch.last_retry_eip;
- last_retry_addr = vcpu->arch.last_retry_addr;
-
- /*
- * If the emulation is caused by #PF and it is non-page_table
- * writing instruction, it means the VM-EXIT is caused by shadow
- * page protected, we can zap the shadow page and retry this
- * instruction directly.
- *
- * Note: if the guest uses a non-page-table modifying instruction
- * on the PDE that points to the instruction, then we will unmap
- * the instruction and go to an infinite loop. So, we cache the
- * last retried eip and the last fault address, if we meet the eip
- * and the address again, we can break out of the potential infinite
- * loop.
- */
- vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
-
- if (!(emulation_type & EMULTYPE_RETRY))
- return false;
-
- if (x86_page_table_writing_insn(ctxt))
- return false;
-
- if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
- return false;
-
- vcpu->arch.last_retry_eip = ctxt->eip;
- vcpu->arch.last_retry_addr = cr2;
-
- if (!vcpu->arch.mmu.direct_map)
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
-
- kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-
- return true;
-}
-
-int x86_emulate_instruction(struct kvm_vcpu *vcpu,
- unsigned long cr2,
- int emulation_type,
- void *insn,
- int insn_len)
-{
- int r;
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
- bool writeback = true;
-
- kvm_clear_exception_queue(vcpu);
-
- if (!(emulation_type & EMULTYPE_NO_DECODE)) {
- init_emulate_ctxt(vcpu);
- ctxt->interruptibility = 0;
- ctxt->have_exception = false;
- ctxt->perm_ok = false;
-
- ctxt->only_vendor_specific_insn
- = emulation_type & EMULTYPE_TRAP_UD;
-
- r = x86_decode_insn(ctxt, insn, insn_len);
-
- trace_kvm_emulate_insn_start(vcpu);
- ++vcpu->stat.insn_emulation;
- if (r != EMULATION_OK) {
- if (emulation_type & EMULTYPE_TRAP_UD)
- return EMULATE_FAIL;
- if (reexecute_instruction(vcpu, cr2))
- return EMULATE_DONE;
- if (emulation_type & EMULTYPE_SKIP)
- return EMULATE_FAIL;
- return handle_emulation_failure(vcpu);
- }
- }
-
- if (emulation_type & EMULTYPE_SKIP) {
- kvm_rip_write(vcpu, ctxt->_eip);
- return EMULATE_DONE;
- }
-
- if (retry_instruction(ctxt, cr2, emulation_type))
- return EMULATE_DONE;
-
- /* this is needed for vmware backdoor interface to work since it
- changes registers values during IO operation */
- if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
- vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
- memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs);
- }
-
-restart:
- r = x86_emulate_insn(ctxt);
-
- if (r == EMULATION_INTERCEPTED)
- return EMULATE_DONE;
-
- if (r == EMULATION_FAILED) {
- if (reexecute_instruction(vcpu, cr2))
- return EMULATE_DONE;
-
- return handle_emulation_failure(vcpu);
- }
-
- if (ctxt->have_exception) {
- inject_emulated_exception(vcpu);
- r = EMULATE_DONE;
- } else if (vcpu->arch.pio.count) {
- if (!vcpu->arch.pio.in)
- vcpu->arch.pio.count = 0;
- else
- writeback = false;
- r = EMULATE_DO_MMIO;
- } else if (vcpu->mmio_needed) {
- if (!vcpu->mmio_is_write)
- writeback = false;
- r = EMULATE_DO_MMIO;
- } else if (r == EMULATION_RESTART)
- goto restart;
- else
- r = EMULATE_DONE;
-
- if (writeback) {
- toggle_interruptibility(vcpu, ctxt->interruptibility);
- kvm_set_rflags(vcpu, ctxt->eflags);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
- vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
- kvm_rip_write(vcpu, ctxt->eip);
- } else
- vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
-
- return r;
-}
-EXPORT_SYMBOL_GPL(x86_emulate_instruction);
-
-int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
-{
- unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
- int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
- size, port, &val, 1);
- /* do not return to emulator after return from userspace */
- vcpu->arch.pio.count = 0;
- return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
-
-static void tsc_bad(void *info)
-{
- __this_cpu_write(cpu_tsc_khz, 0);
-}
-
-static void tsc_khz_changed(void *data)
-{
- struct cpufreq_freqs *freq = data;
- unsigned long khz = 0;
-
- if (data)
- khz = freq->new;
- else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
- khz = cpufreq_quick_get(raw_smp_processor_id());
- if (!khz)
- khz = tsc_khz;
- __this_cpu_write(cpu_tsc_khz, khz);
-}
-
-static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
- void *data)
-{
- struct cpufreq_freqs *freq = data;
- struct kvm *kvm;
- struct kvm_vcpu *vcpu;
- int i, send_ipi = 0;
-
- /*
- * We allow guests to temporarily run on slowing clocks,
- * provided we notify them after, or to run on accelerating
- * clocks, provided we notify them before. Thus time never
- * goes backwards.
- *
- * However, we have a problem. We can't atomically update
- * the frequency of a given CPU from this function; it is
- * merely a notifier, which can be called from any CPU.
- * Changing the TSC frequency at arbitrary points in time
- * requires a recomputation of local variables related to
- * the TSC for each VCPU. We must flag these local variables
- * to be updated and be sure the update takes place with the
- * new frequency before any guests proceed.
- *
- * Unfortunately, the combination of hotplug CPU and frequency
- * change creates an intractable locking scenario; the order
- * of when these callouts happen is undefined with respect to
- * CPU hotplug, and they can race with each other. As such,
- * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
- * undefined; you can actually have a CPU frequency change take
- * place in between the computation of X and the setting of the
- * variable. To protect against this problem, all updates of
- * the per_cpu tsc_khz variable are done in an interrupt
- * protected IPI, and all callers wishing to update the value
- * must wait for a synchronous IPI to complete (which is trivial
- * if the caller is on the CPU already). This establishes the
- * necessary total order on variable updates.
- *
- * Note that because a guest time update may take place
- * anytime after the setting of the VCPU's request bit, the
- * correct TSC value must be set before the request. However,
- * to ensure the update actually makes it to any guest which
- * starts running in hardware virtualization between the set
- * and the acquisition of the spinlock, we must also ping the
- * CPU after setting the request bit.
- *
- */
-
- if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
- return 0;
- if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
- return 0;
-
- smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
-
- raw_spin_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list) {
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (vcpu->cpu != freq->cpu)
- continue;
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
- if (vcpu->cpu != smp_processor_id())
- send_ipi = 1;
- }
- }
- raw_spin_unlock(&kvm_lock);
-
- if (freq->old < freq->new && send_ipi) {
- /*
- * We upscale the frequency. Must make the guest
- * doesn't see old kvmclock values while running with
- * the new frequency, otherwise we risk the guest sees
- * time go backwards.
- *
- * In case we update the frequency for another cpu
- * (which might be in guest context) send an interrupt
- * to kick the cpu out of guest context. Next time
- * guest context is entered kvmclock will be updated,
- * so the guest will not see stale values.
- */
- smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
- }
- return 0;
-}
-
-static struct notifier_block kvmclock_cpufreq_notifier_block = {
- .notifier_call = kvmclock_cpufreq_notifier
-};
-
-static int kvmclock_cpu_notifier(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
-
- switch (action) {
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
- break;
- case CPU_DOWN_PREPARE:
- smp_call_function_single(cpu, tsc_bad, NULL, 1);
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block kvmclock_cpu_notifier_block = {
- .notifier_call = kvmclock_cpu_notifier,
- .priority = -INT_MAX
-};
-
-static void kvm_timer_init(void)
-{
- int cpu;
-
- max_tsc_khz = tsc_khz;
- register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
-#ifdef CONFIG_CPU_FREQ
- struct cpufreq_policy policy;
- memset(&policy, 0, sizeof(policy));
- cpu = get_cpu();
- cpufreq_get_policy(&policy, cpu);
- if (policy.cpuinfo.max_freq)
- max_tsc_khz = policy.cpuinfo.max_freq;
- put_cpu();
-#endif
- cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
- CPUFREQ_TRANSITION_NOTIFIER);
- }
- pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
- for_each_online_cpu(cpu)
- smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
-}
-
-static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
-
-int kvm_is_in_guest(void)
-{
- return __this_cpu_read(current_vcpu) != NULL;
-}
-
-static int kvm_is_user_mode(void)
-{
- int user_mode = 3;
-
- if (__this_cpu_read(current_vcpu))
- user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
-
- return user_mode != 0;
-}
-
-static unsigned long kvm_get_guest_ip(void)
-{
- unsigned long ip = 0;
-
- if (__this_cpu_read(current_vcpu))
- ip = kvm_rip_read(__this_cpu_read(current_vcpu));
-
- return ip;
-}
-
-static struct perf_guest_info_callbacks kvm_guest_cbs = {
- .is_in_guest = kvm_is_in_guest,
- .is_user_mode = kvm_is_user_mode,
- .get_guest_ip = kvm_get_guest_ip,
-};
-
-void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
-{
- __this_cpu_write(current_vcpu, vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
-
-void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
-{
- __this_cpu_write(current_vcpu, NULL);
-}
-EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
-
-static void kvm_set_mmio_spte_mask(void)
-{
- u64 mask;
- int maxphyaddr = boot_cpu_data.x86_phys_bits;
-
- /*
- * Set the reserved bits and the present bit of an paging-structure
- * entry to generate page fault with PFER.RSV = 1.
- */
- mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
- mask |= 1ull;
-
-#ifdef CONFIG_X86_64
- /*
- * If reserved bit is not supported, clear the present bit to disable
- * mmio page fault.
- */
- if (maxphyaddr == 52)
- mask &= ~1ull;
-#endif
-
- kvm_mmu_set_mmio_spte_mask(mask);
-}
-
-int kvm_arch_init(void *opaque)
-{
- int r;
- struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
-
- if (kvm_x86_ops) {
- printk(KERN_ERR "kvm: already loaded the other module\n");
- r = -EEXIST;
- goto out;
- }
-
- if (!ops->cpu_has_kvm_support()) {
- printk(KERN_ERR "kvm: no hardware support\n");
- r = -EOPNOTSUPP;
- goto out;
- }
- if (ops->disabled_by_bios()) {
- printk(KERN_ERR "kvm: disabled by bios\n");
- r = -EOPNOTSUPP;
- goto out;
- }
-
- r = kvm_mmu_module_init();
- if (r)
- goto out;
-
- kvm_set_mmio_spte_mask();
- kvm_init_msr_list();
-
- kvm_x86_ops = ops;
- kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
- PT_DIRTY_MASK, PT64_NX_MASK, 0);
-
- kvm_timer_init();
-
- perf_register_guest_info_callbacks(&kvm_guest_cbs);
-
- if (cpu_has_xsave)
- host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-
- return 0;
-
-out:
- return r;
-}
-
-void kvm_arch_exit(void)
-{
- perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
-
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
- cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
- CPUFREQ_TRANSITION_NOTIFIER);
- unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
- kvm_x86_ops = NULL;
- kvm_mmu_module_exit();
-}
-
-int kvm_emulate_halt(struct kvm_vcpu *vcpu)
-{
- ++vcpu->stat.halt_exits;
- if (irqchip_in_kernel(vcpu->kvm)) {
- vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
- return 1;
- } else {
- vcpu->run->exit_reason = KVM_EXIT_HLT;
- return 0;
- }
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_halt);
-
-int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
-{
- u64 param, ingpa, outgpa, ret;
- uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
- bool fast, longmode;
- int cs_db, cs_l;
-
- /*
- * hypercall generates UD from non zero cpl and real mode
- * per HYPER-V spec
- */
- if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 0;
- }
-
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- longmode = is_long_mode(vcpu) && cs_l == 1;
-
- if (!longmode) {
- param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
- (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
- ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
- (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
- outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
- (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
- }
-#ifdef CONFIG_X86_64
- else {
- param = kvm_register_read(vcpu, VCPU_REGS_RCX);
- ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
- outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
- }
-#endif
-
- code = param & 0xffff;
- fast = (param >> 16) & 0x1;
- rep_cnt = (param >> 32) & 0xfff;
- rep_idx = (param >> 48) & 0xfff;
-
- trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
-
- switch (code) {
- case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
- kvm_vcpu_on_spin(vcpu);
- break;
- default:
- res = HV_STATUS_INVALID_HYPERCALL_CODE;
- break;
- }
-
- ret = res | (((u64)rep_done & 0xfff) << 32);
- if (longmode) {
- kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
- } else {
- kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
- kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
- }
-
- return 1;
-}
-
-int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
-{
- unsigned long nr, a0, a1, a2, a3, ret;
- int r = 1;
-
- if (kvm_hv_hypercall_enabled(vcpu->kvm))
- return kvm_hv_hypercall(vcpu);
-
- nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
- a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
- a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
- a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
- a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
-
- trace_kvm_hypercall(nr, a0, a1, a2, a3);
-
- if (!is_long_mode(vcpu)) {
- nr &= 0xFFFFFFFF;
- a0 &= 0xFFFFFFFF;
- a1 &= 0xFFFFFFFF;
- a2 &= 0xFFFFFFFF;
- a3 &= 0xFFFFFFFF;
- }
-
- if (kvm_x86_ops->get_cpl(vcpu) != 0) {
- ret = -KVM_EPERM;
- goto out;
- }
-
- switch (nr) {
- case KVM_HC_VAPIC_POLL_IRQ:
- ret = 0;
- break;
- default:
- ret = -KVM_ENOSYS;
- break;
- }
-out:
- kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
- ++vcpu->stat.hypercalls;
- return r;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
-
-int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- char instruction[3];
- unsigned long rip = kvm_rip_read(vcpu);
-
- /*
- * Blow out the MMU to ensure that no other VCPU has an active mapping
- * to ensure that the updated hypercall appears atomically across all
- * VCPUs.
- */
- kvm_mmu_zap_all(vcpu->kvm);
-
- kvm_x86_ops->patch_hypercall(vcpu, instruction);
-
- return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
-}
-
-/*
- * Check if userspace requested an interrupt window, and that the
- * interrupt window is open.
- *
- * No need to exit to userspace if we already have an interrupt queued.
- */
-static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
-{
- return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
- vcpu->run->request_interrupt_window &&
- kvm_arch_interrupt_allowed(vcpu));
-}
-
-static void post_kvm_run_save(struct kvm_vcpu *vcpu)
-{
- struct kvm_run *kvm_run = vcpu->run;
-
- kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
- kvm_run->cr8 = kvm_get_cr8(vcpu);
- kvm_run->apic_base = kvm_get_apic_base(vcpu);
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_run->ready_for_interrupt_injection = 1;
- else
- kvm_run->ready_for_interrupt_injection =
- kvm_arch_interrupt_allowed(vcpu) &&
- !kvm_cpu_has_interrupt(vcpu) &&
- !kvm_event_needs_reinjection(vcpu);
-}
-
-static void vapic_enter(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
- struct page *page;
-
- if (!apic || !apic->vapic_addr)
- return;
-
- page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
-
- vcpu->arch.apic->vapic_page = page;
-}
-
-static void vapic_exit(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
- int idx;
-
- if (!apic || !apic->vapic_addr)
- return;
-
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- kvm_release_page_dirty(apic->vapic_page);
- mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
-}
-
-static void update_cr8_intercept(struct kvm_vcpu *vcpu)
-{
- int max_irr, tpr;
-
- if (!kvm_x86_ops->update_cr8_intercept)
- return;
-
- if (!vcpu->arch.apic)
- return;
-
- if (!vcpu->arch.apic->vapic_addr)
- max_irr = kvm_lapic_find_highest_irr(vcpu);
- else
- max_irr = -1;
-
- if (max_irr != -1)
- max_irr >>= 4;
-
- tpr = kvm_lapic_get_cr8(vcpu);
-
- kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
-}
-
-static void inject_pending_event(struct kvm_vcpu *vcpu)
-{
- /* try to reinject previous events if any */
- if (vcpu->arch.exception.pending) {
- trace_kvm_inj_exception(vcpu->arch.exception.nr,
- vcpu->arch.exception.has_error_code,
- vcpu->arch.exception.error_code);
- kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
- vcpu->arch.exception.has_error_code,
- vcpu->arch.exception.error_code,
- vcpu->arch.exception.reinject);
- return;
- }
-
- if (vcpu->arch.nmi_injected) {
- kvm_x86_ops->set_nmi(vcpu);
- return;
- }
-
- if (vcpu->arch.interrupt.pending) {
- kvm_x86_ops->set_irq(vcpu);
- return;
- }
-
- /* try to inject new event if pending */
- if (vcpu->arch.nmi_pending) {
- if (kvm_x86_ops->nmi_allowed(vcpu)) {
- --vcpu->arch.nmi_pending;
- vcpu->arch.nmi_injected = true;
- kvm_x86_ops->set_nmi(vcpu);
- }
- } else if (kvm_cpu_has_interrupt(vcpu)) {
- if (kvm_x86_ops->interrupt_allowed(vcpu)) {
- kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
- false);
- kvm_x86_ops->set_irq(vcpu);
- }
- }
-}
-
-static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
-{
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
- !vcpu->guest_xcr0_loaded) {
- /* kvm_set_xcr() also depends on this */
- xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
- vcpu->guest_xcr0_loaded = 1;
- }
-}
-
-static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
-{
- if (vcpu->guest_xcr0_loaded) {
- if (vcpu->arch.xcr0 != host_xcr0)
- xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
- vcpu->guest_xcr0_loaded = 0;
- }
-}
-
-static void process_nmi(struct kvm_vcpu *vcpu)
-{
- unsigned limit = 2;
-
- /*
- * x86 is limited to one NMI running, and one NMI pending after it.
- * If an NMI is already in progress, limit further NMIs to just one.
- * Otherwise, allow two (and we'll inject the first one immediately).
- */
- if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
- limit = 1;
-
- vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
- vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-}
-
-static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
-{
- int r;
- bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
- vcpu->run->request_interrupt_window;
- bool req_immediate_exit = 0;
-
- if (vcpu->requests) {
- if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
- kvm_mmu_unload(vcpu);
- if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
- __kvm_migrate_timers(vcpu);
- if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
- r = kvm_guest_time_update(vcpu);
- if (unlikely(r))
- goto out;
- }
- if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
- kvm_mmu_sync_roots(vcpu);
- if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
- kvm_x86_ops->tlb_flush(vcpu);
- if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
- vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
- r = 0;
- goto out;
- }
- if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
- vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
- r = 0;
- goto out;
- }
- if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
- vcpu->fpu_active = 0;
- kvm_x86_ops->fpu_deactivate(vcpu);
- }
- if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
- /* Page is swapped out. Do synthetic halt */
- vcpu->arch.apf.halted = true;
- r = 1;
- goto out;
- }
- if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
- record_steal_time(vcpu);
- if (kvm_check_request(KVM_REQ_NMI, vcpu))
- process_nmi(vcpu);
- req_immediate_exit =
- kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
- if (kvm_check_request(KVM_REQ_PMU, vcpu))
- kvm_handle_pmu_event(vcpu);
- if (kvm_check_request(KVM_REQ_PMI, vcpu))
- kvm_deliver_pmi(vcpu);
- }
-
- r = kvm_mmu_reload(vcpu);
- if (unlikely(r))
- goto out;
-
- if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
- inject_pending_event(vcpu);
-
- /* enable NMI/IRQ window open exits if needed */
- if (vcpu->arch.nmi_pending)
- kvm_x86_ops->enable_nmi_window(vcpu);
- else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
- kvm_x86_ops->enable_irq_window(vcpu);
-
- if (kvm_lapic_enabled(vcpu)) {
- update_cr8_intercept(vcpu);
- kvm_lapic_sync_to_vapic(vcpu);
- }
- }
-
- preempt_disable();
-
- kvm_x86_ops->prepare_guest_switch(vcpu);
- if (vcpu->fpu_active)
- kvm_load_guest_fpu(vcpu);
- kvm_load_guest_xcr0(vcpu);
-
- vcpu->mode = IN_GUEST_MODE;
-
- /* We should set ->mode before check ->requests,
- * see the comment in make_all_cpus_request.
- */
- smp_mb();
-
- local_irq_disable();
-
- if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
- || need_resched() || signal_pending(current)) {
- vcpu->mode = OUTSIDE_GUEST_MODE;
- smp_wmb();
- local_irq_enable();
- preempt_enable();
- kvm_x86_ops->cancel_injection(vcpu);
- r = 1;
- goto out;
- }
-
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-
- if (req_immediate_exit)
- smp_send_reschedule(vcpu->cpu);
-
- kvm_guest_enter();
-
- if (unlikely(vcpu->arch.switch_db_regs)) {
- set_debugreg(0, 7);
- set_debugreg(vcpu->arch.eff_db[0], 0);
- set_debugreg(vcpu->arch.eff_db[1], 1);
- set_debugreg(vcpu->arch.eff_db[2], 2);
- set_debugreg(vcpu->arch.eff_db[3], 3);
- }
-
- trace_kvm_entry(vcpu->vcpu_id);
- kvm_x86_ops->run(vcpu);
-
- /*
- * If the guest has used debug registers, at least dr7
- * will be disabled while returning to the host.
- * If we don't have active breakpoints in the host, we don't
- * care about the messed up debug address registers. But if
- * we have some of them active, restore the old state.
- */
- if (hw_breakpoint_active())
- hw_breakpoint_restore();
-
- vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
-
- vcpu->mode = OUTSIDE_GUEST_MODE;
- smp_wmb();
- local_irq_enable();
-
- ++vcpu->stat.exits;
-
- /*
- * We must have an instruction between local_irq_enable() and
- * kvm_guest_exit(), so the timer interrupt isn't delayed by
- * the interrupt shadow. The stat.exits increment will do nicely.
- * But we need to prevent reordering, hence this barrier():
- */
- barrier();
-
- kvm_guest_exit();
-
- preempt_enable();
-
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-
- /*
- * Profile KVM exit RIPs:
- */
- if (unlikely(prof_on == KVM_PROFILING)) {
- unsigned long rip = kvm_rip_read(vcpu);
- profile_hit(KVM_PROFILING, (void *)rip);
- }
-
- if (unlikely(vcpu->arch.tsc_always_catchup))
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-
- kvm_lapic_sync_from_vapic(vcpu);
-
- r = kvm_x86_ops->handle_exit(vcpu);
-out:
- return r;
-}
-
-
-static int __vcpu_run(struct kvm_vcpu *vcpu)
-{
- int r;
- struct kvm *kvm = vcpu->kvm;
-
- if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
- pr_debug("vcpu %d received sipi with vector # %x\n",
- vcpu->vcpu_id, vcpu->arch.sipi_vector);
- kvm_lapic_reset(vcpu);
- r = kvm_arch_vcpu_reset(vcpu);
- if (r)
- return r;
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- }
-
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
- vapic_enter(vcpu);
-
- r = 1;
- while (r > 0) {
- if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
- !vcpu->arch.apf.halted)
- r = vcpu_enter_guest(vcpu);
- else {
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
- kvm_vcpu_block(vcpu);
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
- if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
- {
- switch(vcpu->arch.mp_state) {
- case KVM_MP_STATE_HALTED:
- vcpu->arch.mp_state =
- KVM_MP_STATE_RUNNABLE;
- case KVM_MP_STATE_RUNNABLE:
- vcpu->arch.apf.halted = false;
- break;
- case KVM_MP_STATE_SIPI_RECEIVED:
- default:
- r = -EINTR;
- break;
- }
- }
- }
-
- if (r <= 0)
- break;
-
- clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
- if (kvm_cpu_has_pending_timer(vcpu))
- kvm_inject_pending_timer_irqs(vcpu);
-
- if (dm_request_for_irq_injection(vcpu)) {
- r = -EINTR;
- vcpu->run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.request_irq_exits;
- }
-
- kvm_check_async_pf_completion(vcpu);
-
- if (signal_pending(current)) {
- r = -EINTR;
- vcpu->run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.signal_exits;
- }
- if (need_resched()) {
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
- kvm_resched(vcpu);
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
- }
- }
-
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
-
- vapic_exit(vcpu);
-
- return r;
-}
-
-static int complete_mmio(struct kvm_vcpu *vcpu)
-{
- struct kvm_run *run = vcpu->run;
- int r;
-
- if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
- return 1;
-
- if (vcpu->mmio_needed) {
- vcpu->mmio_needed = 0;
- if (!vcpu->mmio_is_write)
- memcpy(vcpu->mmio_data + vcpu->mmio_index,
- run->mmio.data, 8);
- vcpu->mmio_index += 8;
- if (vcpu->mmio_index < vcpu->mmio_size) {
- run->exit_reason = KVM_EXIT_MMIO;
- run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index;
- memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8);
- run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8);
- run->mmio.is_write = vcpu->mmio_is_write;
- vcpu->mmio_needed = 1;
- return 0;
- }
- if (vcpu->mmio_is_write)
- return 1;
- vcpu->mmio_read_completed = 1;
- }
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- if (r != EMULATE_DONE)
- return 0;
- return 1;
-}
-
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- int r;
- sigset_t sigsaved;
-
- if (!tsk_used_math(current) && init_fpu(current))
- return -ENOMEM;
-
- if (vcpu->sigset_active)
- sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
-
- if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
- kvm_vcpu_block(vcpu);
- clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
- r = -EAGAIN;
- goto out;
- }
-
- /* re-sync apic's tpr */
- if (!irqchip_in_kernel(vcpu->kvm)) {
- if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
- r = -EINVAL;
- goto out;
- }
- }
-
- r = complete_mmio(vcpu);
- if (r <= 0)
- goto out;
-
- r = __vcpu_run(vcpu);
-
-out:
- post_kvm_run_save(vcpu);
- if (vcpu->sigset_active)
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-
- return r;
-}
-
-int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
-{
- if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
- /*
- * We are here if userspace calls get_regs() in the middle of
- * instruction emulation. Registers state needs to be copied
- * back from emulation context to vcpu. Usrapace shouldn't do
- * that usually, but some bad designed PV devices (vmware
- * backdoor interface) need this to work
- */
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
- memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
- vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
- }
- regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
- regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
- regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
- regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
- regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
- regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
- regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
- regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
-#ifdef CONFIG_X86_64
- regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
- regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
- regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
- regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
- regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
- regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
- regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
- regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
-#endif
-
- regs->rip = kvm_rip_read(vcpu);
- regs->rflags = kvm_get_rflags(vcpu);
-
- return 0;
-}
-
-int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
-{
- vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
- vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
-
- kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
- kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
- kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
- kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
- kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
- kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
- kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
- kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
-#ifdef CONFIG_X86_64
- kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
- kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
- kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
- kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
- kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
- kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
- kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
- kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
-#endif
-
- kvm_rip_write(vcpu, regs->rip);
- kvm_set_rflags(vcpu, regs->rflags);
-
- vcpu->arch.exception.pending = false;
-
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-
- return 0;
-}
-
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
-{
- struct kvm_segment cs;
-
- kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
- *db = cs.db;
- *l = cs.l;
-}
-EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
-
-int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
- struct kvm_sregs *sregs)
-{
- struct desc_ptr dt;
-
- kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
- kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
- kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
- kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
- kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
- kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
-
- kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
- kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
-
- kvm_x86_ops->get_idt(vcpu, &dt);
- sregs->idt.limit = dt.size;
- sregs->idt.base = dt.address;
- kvm_x86_ops->get_gdt(vcpu, &dt);
- sregs->gdt.limit = dt.size;
- sregs->gdt.base = dt.address;
-
- sregs->cr0 = kvm_read_cr0(vcpu);
- sregs->cr2 = vcpu->arch.cr2;
- sregs->cr3 = kvm_read_cr3(vcpu);
- sregs->cr4 = kvm_read_cr4(vcpu);
- sregs->cr8 = kvm_get_cr8(vcpu);
- sregs->efer = vcpu->arch.efer;
- sregs->apic_base = kvm_get_apic_base(vcpu);
-
- memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
-
- if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
- set_bit(vcpu->arch.interrupt.nr,
- (unsigned long *)sregs->interrupt_bitmap);
-
- return 0;
-}
-
-int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
- struct kvm_mp_state *mp_state)
-{
- mp_state->mp_state = vcpu->arch.mp_state;
- return 0;
-}
-
-int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
- struct kvm_mp_state *mp_state)
-{
- vcpu->arch.mp_state = mp_state->mp_state;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- return 0;
-}
-
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
- int reason, bool has_error_code, u32 error_code)
-{
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
- int ret;
-
- init_emulate_ctxt(vcpu);
-
- ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
- has_error_code, error_code);
-
- if (ret)
- return EMULATE_FAIL;
-
- memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
- kvm_rip_write(vcpu, ctxt->eip);
- kvm_set_rflags(vcpu, ctxt->eflags);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- return EMULATE_DONE;
-}
-EXPORT_SYMBOL_GPL(kvm_task_switch);
-
-int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
- struct kvm_sregs *sregs)
-{
- int mmu_reset_needed = 0;
- int pending_vec, max_bits, idx;
- struct desc_ptr dt;
-
- dt.size = sregs->idt.limit;
- dt.address = sregs->idt.base;
- kvm_x86_ops->set_idt(vcpu, &dt);
- dt.size = sregs->gdt.limit;
- dt.address = sregs->gdt.base;
- kvm_x86_ops->set_gdt(vcpu, &dt);
-
- vcpu->arch.cr2 = sregs->cr2;
- mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
- vcpu->arch.cr3 = sregs->cr3;
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
-
- kvm_set_cr8(vcpu, sregs->cr8);
-
- mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
- kvm_x86_ops->set_efer(vcpu, sregs->efer);
- kvm_set_apic_base(vcpu, sregs->apic_base);
-
- mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
- kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
- vcpu->arch.cr0 = sregs->cr0;
-
- mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
- kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
- if (sregs->cr4 & X86_CR4_OSXSAVE)
- kvm_update_cpuid(vcpu);
-
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- if (!is_long_mode(vcpu) && is_pae(vcpu)) {
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
- mmu_reset_needed = 1;
- }
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
-
- if (mmu_reset_needed)
- kvm_mmu_reset_context(vcpu);
-
- max_bits = (sizeof sregs->interrupt_bitmap) << 3;
- pending_vec = find_first_bit(
- (const unsigned long *)sregs->interrupt_bitmap, max_bits);
- if (pending_vec < max_bits) {
- kvm_queue_interrupt(vcpu, pending_vec, false);
- pr_debug("Set back pending irq %d\n", pending_vec);
- }
-
- kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
- kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
- kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
- kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
- kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
- kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
-
- kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
- kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
-
- update_cr8_intercept(vcpu);
-
- /* Older userspace won't unhalt the vcpu on reset. */
- if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
- sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
- !is_protmode(vcpu))
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-
- return 0;
-}
-
-int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
- struct kvm_guest_debug *dbg)
-{
- unsigned long rflags;
- int i, r;
-
- if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
- r = -EBUSY;
- if (vcpu->arch.exception.pending)
- goto out;
- if (dbg->control & KVM_GUESTDBG_INJECT_DB)
- kvm_queue_exception(vcpu, DB_VECTOR);
- else
- kvm_queue_exception(vcpu, BP_VECTOR);
- }
-
- /*
- * Read rflags as long as potentially injected trace flags are still
- * filtered out.
- */
- rflags = kvm_get_rflags(vcpu);
-
- vcpu->guest_debug = dbg->control;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
- vcpu->guest_debug = 0;
-
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
- for (i = 0; i < KVM_NR_DB_REGS; ++i)
- vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
- vcpu->arch.switch_db_regs =
- (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
- } else {
- for (i = 0; i < KVM_NR_DB_REGS; i++)
- vcpu->arch.eff_db[i] = vcpu->arch.db[i];
- vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
- }
-
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
- get_segment_base(vcpu, VCPU_SREG_CS);
-
- /*
- * Trigger an rflags update that will inject or remove the trace
- * flags.
- */
- kvm_set_rflags(vcpu, rflags);
-
- kvm_x86_ops->set_guest_debug(vcpu, dbg);
-
- r = 0;
-
-out:
-
- return r;
-}
-
-/*
- * Translate a guest virtual address to a guest physical address.
- */
-int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
- struct kvm_translation *tr)
-{
- unsigned long vaddr = tr->linear_address;
- gpa_t gpa;
- int idx;
-
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
- tr->physical_address = gpa;
- tr->valid = gpa != UNMAPPED_GVA;
- tr->writeable = 1;
- tr->usermode = 0;
-
- return 0;
-}
-
-int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
-{
- struct i387_fxsave_struct *fxsave =
- &vcpu->arch.guest_fpu.state->fxsave;
-
- memcpy(fpu->fpr, fxsave->st_space, 128);
- fpu->fcw = fxsave->cwd;
- fpu->fsw = fxsave->swd;
- fpu->ftwx = fxsave->twd;
- fpu->last_opcode = fxsave->fop;
- fpu->last_ip = fxsave->rip;
- fpu->last_dp = fxsave->rdp;
- memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
-
- return 0;
-}
-
-int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
-{
- struct i387_fxsave_struct *fxsave =
- &vcpu->arch.guest_fpu.state->fxsave;
-
- memcpy(fxsave->st_space, fpu->fpr, 128);
- fxsave->cwd = fpu->fcw;
- fxsave->swd = fpu->fsw;
- fxsave->twd = fpu->ftwx;
- fxsave->fop = fpu->last_opcode;
- fxsave->rip = fpu->last_ip;
- fxsave->rdp = fpu->last_dp;
- memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
-
- return 0;
-}
-
-int fx_init(struct kvm_vcpu *vcpu)
-{
- int err;
-
- err = fpu_alloc(&vcpu->arch.guest_fpu);
- if (err)
- return err;
-
- fpu_finit(&vcpu->arch.guest_fpu);
-
- /*
- * Ensure guest xcr0 is valid for loading
- */
- vcpu->arch.xcr0 = XSTATE_FP;
-
- vcpu->arch.cr0 |= X86_CR0_ET;
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(fx_init);
-
-static void fx_free(struct kvm_vcpu *vcpu)
-{
- fpu_free(&vcpu->arch.guest_fpu);
-}
-
-void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
-{
- if (vcpu->guest_fpu_loaded)
- return;
-
- /*
- * Restore all possible states in the guest,
- * and assume host would use all available bits.
- * Guest xcr0 would be loaded later.
- */
- kvm_put_guest_xcr0(vcpu);
- vcpu->guest_fpu_loaded = 1;
- unlazy_fpu(current);
- fpu_restore_checking(&vcpu->arch.guest_fpu);
- trace_kvm_fpu(1);
-}
-
-void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
-{
- kvm_put_guest_xcr0(vcpu);
-
- if (!vcpu->guest_fpu_loaded)
- return;
-
- vcpu->guest_fpu_loaded = 0;
- fpu_save_init(&vcpu->arch.guest_fpu);
- ++vcpu->stat.fpu_reload;
- kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
- trace_kvm_fpu(0);
-}
-
-void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
-{
- kvmclock_reset(vcpu);
-
- free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
- fx_free(vcpu);
- kvm_x86_ops->vcpu_free(vcpu);
-}
-
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
- unsigned int id)
-{
- if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
- printk_once(KERN_WARNING
- "kvm: SMP vm created on host with unstable TSC; "
- "guest TSC will not be reliable\n");
- return kvm_x86_ops->vcpu_create(kvm, id);
-}
-
-int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
-{
- int r;
-
- vcpu->arch.mtrr_state.have_fixed = 1;
- vcpu_load(vcpu);
- r = kvm_arch_vcpu_reset(vcpu);
- if (r == 0)
- r = kvm_mmu_setup(vcpu);
- vcpu_put(vcpu);
-
- return r;
-}
-
-void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.apf.msr_val = 0;
-
- vcpu_load(vcpu);
- kvm_mmu_unload(vcpu);
- vcpu_put(vcpu);
-
- fx_free(vcpu);
- kvm_x86_ops->vcpu_free(vcpu);
-}
-
-int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
-{
- atomic_set(&vcpu->arch.nmi_queued, 0);
- vcpu->arch.nmi_pending = 0;
- vcpu->arch.nmi_injected = false;
-
- vcpu->arch.switch_db_regs = 0;
- memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
- vcpu->arch.dr6 = DR6_FIXED_1;
- vcpu->arch.dr7 = DR7_FIXED_1;
-
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- vcpu->arch.apf.msr_val = 0;
- vcpu->arch.st.msr_val = 0;
-
- kvmclock_reset(vcpu);
-
- kvm_clear_async_pf_completion_queue(vcpu);
- kvm_async_pf_hash_reset(vcpu);
- vcpu->arch.apf.halted = false;
-
- kvm_pmu_reset(vcpu);
-
- return kvm_x86_ops->vcpu_reset(vcpu);
-}
-
-int kvm_arch_hardware_enable(void *garbage)
-{
- struct kvm *kvm;
- struct kvm_vcpu *vcpu;
- int i;
- int ret;
- u64 local_tsc;
- u64 max_tsc = 0;
- bool stable, backwards_tsc = false;
-
- kvm_shared_msr_cpu_online();
- ret = kvm_x86_ops->hardware_enable(garbage);
- if (ret != 0)
- return ret;
-
- local_tsc = native_read_tsc();
- stable = !check_tsc_unstable();
- list_for_each_entry(kvm, &vm_list, vm_list) {
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!stable && vcpu->cpu == smp_processor_id())
- set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
- if (stable && vcpu->arch.last_host_tsc > local_tsc) {
- backwards_tsc = true;
- if (vcpu->arch.last_host_tsc > max_tsc)
- max_tsc = vcpu->arch.last_host_tsc;
- }
- }
- }
-
- /*
- * Sometimes, even reliable TSCs go backwards. This happens on
- * platforms that reset TSC during suspend or hibernate actions, but
- * maintain synchronization. We must compensate. Fortunately, we can
- * detect that condition here, which happens early in CPU bringup,
- * before any KVM threads can be running. Unfortunately, we can't
- * bring the TSCs fully up to date with real time, as we aren't yet far
- * enough into CPU bringup that we know how much real time has actually
- * elapsed; our helper function, get_kernel_ns() will be using boot
- * variables that haven't been updated yet.
- *
- * So we simply find the maximum observed TSC above, then record the
- * adjustment to TSC in each VCPU. When the VCPU later gets loaded,
- * the adjustment will be applied. Note that we accumulate
- * adjustments, in case multiple suspend cycles happen before some VCPU
- * gets a chance to run again. In the event that no KVM threads get a
- * chance to run, we will miss the entire elapsed period, as we'll have
- * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
- * loose cycle time. This isn't too big a deal, since the loss will be
- * uniform across all VCPUs (not to mention the scenario is extremely
- * unlikely). It is possible that a second hibernate recovery happens
- * much faster than a first, causing the observed TSC here to be
- * smaller; this would require additional padding adjustment, which is
- * why we set last_host_tsc to the local tsc observed here.
- *
- * N.B. - this code below runs only on platforms with reliable TSC,
- * as that is the only way backwards_tsc is set above. Also note
- * that this runs for ALL vcpus, which is not a bug; all VCPUs should
- * have the same delta_cyc adjustment applied if backwards_tsc
- * is detected. Note further, this adjustment is only done once,
- * as we reset last_host_tsc on all VCPUs to stop this from being
- * called multiple times (one for each physical CPU bringup).
- *
- * Platforms with unnreliable TSCs don't have to deal with this, they
- * will be compensated by the logic in vcpu_load, which sets the TSC to
- * catchup mode. This will catchup all VCPUs to real time, but cannot
- * guarantee that they stay in perfect synchronization.
- */
- if (backwards_tsc) {
- u64 delta_cyc = max_tsc - local_tsc;
- list_for_each_entry(kvm, &vm_list, vm_list) {
- kvm_for_each_vcpu(i, vcpu, kvm) {
- vcpu->arch.tsc_offset_adjustment += delta_cyc;
- vcpu->arch.last_host_tsc = local_tsc;
- }
-
- /*
- * We have to disable TSC offset matching.. if you were
- * booting a VM while issuing an S4 host suspend....
- * you may have some problem. Solving this issue is
- * left as an exercise to the reader.
- */
- kvm->arch.last_tsc_nsec = 0;
- kvm->arch.last_tsc_write = 0;
- }
-
- }
- return 0;
-}
-
-void kvm_arch_hardware_disable(void *garbage)
-{
- kvm_x86_ops->hardware_disable(garbage);
- drop_user_return_notifiers(garbage);
-}
-
-int kvm_arch_hardware_setup(void)
-{
- return kvm_x86_ops->hardware_setup();
-}
-
-void kvm_arch_hardware_unsetup(void)
-{
- kvm_x86_ops->hardware_unsetup();
-}
-
-void kvm_arch_check_processor_compat(void *rtn)
-{
- kvm_x86_ops->check_processor_compatibility(rtn);
-}
-
-bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
-{
- return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
-}
-
-int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
-{
- struct page *page;
- struct kvm *kvm;
- int r;
-
- BUG_ON(vcpu->kvm == NULL);
- kvm = vcpu->kvm;
-
- vcpu->arch.emulate_ctxt.ops = &emulate_ops;
- if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- else
- vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
-
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page) {
- r = -ENOMEM;
- goto fail;
- }
- vcpu->arch.pio_data = page_address(page);
-
- kvm_set_tsc_khz(vcpu, max_tsc_khz);
-
- r = kvm_mmu_create(vcpu);
- if (r < 0)
- goto fail_free_pio_data;
-
- if (irqchip_in_kernel(kvm)) {
- r = kvm_create_lapic(vcpu);
- if (r < 0)
- goto fail_mmu_destroy;
- }
-
- vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
- GFP_KERNEL);
- if (!vcpu->arch.mce_banks) {
- r = -ENOMEM;
- goto fail_free_lapic;
- }
- vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
-
- if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
- goto fail_free_mce_banks;
-
- kvm_async_pf_hash_reset(vcpu);
- kvm_pmu_init(vcpu);
-
- return 0;
-fail_free_mce_banks:
- kfree(vcpu->arch.mce_banks);
-fail_free_lapic:
- kvm_free_lapic(vcpu);
-fail_mmu_destroy:
- kvm_mmu_destroy(vcpu);
-fail_free_pio_data:
- free_page((unsigned long)vcpu->arch.pio_data);
-fail:
- return r;
-}
-
-void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
-{
- int idx;
-
- kvm_pmu_destroy(vcpu);
- kfree(vcpu->arch.mce_banks);
- kvm_free_lapic(vcpu);
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- kvm_mmu_destroy(vcpu);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
- free_page((unsigned long)vcpu->arch.pio_data);
-}
-
-int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
-{
- if (type)
- return -EINVAL;
-
- INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
- INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
-
- /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
- set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
-
- raw_spin_lock_init(&kvm->arch.tsc_write_lock);
-
- return 0;
-}
-
-static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
-{
- vcpu_load(vcpu);
- kvm_mmu_unload(vcpu);
- vcpu_put(vcpu);
-}
-
-static void kvm_free_vcpus(struct kvm *kvm)
-{
- unsigned int i;
- struct kvm_vcpu *vcpu;
-
- /*
- * Unpin any mmu pages first.
- */
- kvm_for_each_vcpu(i, vcpu, kvm) {
- kvm_clear_async_pf_completion_queue(vcpu);
- kvm_unload_vcpu_mmu(vcpu);
- }
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_arch_vcpu_free(vcpu);
-
- mutex_lock(&kvm->lock);
- for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
- kvm->vcpus[i] = NULL;
-
- atomic_set(&kvm->online_vcpus, 0);
- mutex_unlock(&kvm->lock);
-}
-
-void kvm_arch_sync_events(struct kvm *kvm)
-{
- kvm_free_all_assigned_devices(kvm);
- kvm_free_pit(kvm);
-}
-
-void kvm_arch_destroy_vm(struct kvm *kvm)
-{
- kvm_iommu_unmap_guest(kvm);
- kfree(kvm->arch.vpic);
- kfree(kvm->arch.vioapic);
- kvm_free_vcpus(kvm);
- if (kvm->arch.apic_access_page)
- put_page(kvm->arch.apic_access_page);
- if (kvm->arch.ept_identity_pagetable)
- put_page(kvm->arch.ept_identity_pagetable);
-}
-
-void kvm_arch_free_memslot(struct kvm_memory_slot *free,
- struct kvm_memory_slot *dont)
-{
- int i;
-
- for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
- if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
- vfree(free->arch.lpage_info[i]);
- free->arch.lpage_info[i] = NULL;
- }
- }
-}
-
-int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
-{
- int i;
-
- for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
- unsigned long ugfn;
- int lpages;
- int level = i + 2;
-
- lpages = gfn_to_index(slot->base_gfn + npages - 1,
- slot->base_gfn, level) + 1;
-
- slot->arch.lpage_info[i] =
- vzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
- if (!slot->arch.lpage_info[i])
- goto out_free;
-
- if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
- slot->arch.lpage_info[i][0].write_count = 1;
- if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
- slot->arch.lpage_info[i][lpages - 1].write_count = 1;
- ugfn = slot->userspace_addr >> PAGE_SHIFT;
- /*
- * If the gfn and userspace address are not aligned wrt each
- * other, or if explicitly asked to, disable large page
- * support for this slot
- */
- if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
- !kvm_largepages_enabled()) {
- unsigned long j;
-
- for (j = 0; j < lpages; ++j)
- slot->arch.lpage_info[i][j].write_count = 1;
- }
- }
-
- return 0;
-
-out_free:
- for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
- vfree(slot->arch.lpage_info[i]);
- slot->arch.lpage_info[i] = NULL;
- }
- return -ENOMEM;
-}
-
-int kvm_arch_prepare_memory_region(struct kvm *kvm,
- struct kvm_memory_slot *memslot,
- struct kvm_memory_slot old,
- struct kvm_userspace_memory_region *mem,
- int user_alloc)
-{
- int npages = memslot->npages;
- int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
-
- /* Prevent internal slot pages from being moved by fork()/COW. */
- if (memslot->id >= KVM_MEMORY_SLOTS)
- map_flags = MAP_SHARED | MAP_ANONYMOUS;
-
- /*To keep backward compatibility with older userspace,
- *x86 needs to hanlde !user_alloc case.
- */
- if (!user_alloc) {
- if (npages && !old.rmap) {
- unsigned long userspace_addr;
-
- userspace_addr = vm_mmap(NULL, 0,
- npages * PAGE_SIZE,
- PROT_READ | PROT_WRITE,
- map_flags,
- 0);
-
- if (IS_ERR((void *)userspace_addr))
- return PTR_ERR((void *)userspace_addr);
-
- memslot->userspace_addr = userspace_addr;
- }
- }
-
-
- return 0;
-}
-
-void kvm_arch_commit_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem,
- struct kvm_memory_slot old,
- int user_alloc)
-{
-
- int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
-
- if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
- int ret;
-
- ret = vm_munmap(old.userspace_addr,
- old.npages * PAGE_SIZE);
- if (ret < 0)
- printk(KERN_WARNING
- "kvm_vm_ioctl_set_memory_region: "
- "failed to munmap memory\n");
- }
-
- if (!kvm->arch.n_requested_mmu_pages)
- nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
-
- spin_lock(&kvm->mmu_lock);
- if (nr_mmu_pages)
- kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
- kvm_mmu_slot_remove_write_access(kvm, mem->slot);
- spin_unlock(&kvm->mmu_lock);
-}
-
-void kvm_arch_flush_shadow(struct kvm *kvm)
-{
- kvm_mmu_zap_all(kvm);
- kvm_reload_remote_mmus(kvm);
-}
-
-int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
-{
- return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
- !vcpu->arch.apf.halted)
- || !list_empty_careful(&vcpu->async_pf.done)
- || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
- || atomic_read(&vcpu->arch.nmi_queued) ||
- (kvm_arch_interrupt_allowed(vcpu) &&
- kvm_cpu_has_interrupt(vcpu));
-}
-
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
-{
- int me;
- int cpu = vcpu->cpu;
-
- if (waitqueue_active(&vcpu->wq)) {
- wake_up_interruptible(&vcpu->wq);
- ++vcpu->stat.halt_wakeup;
- }
-
- me = get_cpu();
- if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
- if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE)
- smp_send_reschedule(cpu);
- put_cpu();
-}
-
-int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
-{
- return kvm_x86_ops->interrupt_allowed(vcpu);
-}
-
-bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
-{
- unsigned long current_rip = kvm_rip_read(vcpu) +
- get_segment_base(vcpu, VCPU_SREG_CS);
-
- return current_rip == linear_rip;
-}
-EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
-
-unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
-{
- unsigned long rflags;
-
- rflags = kvm_x86_ops->get_rflags(vcpu);
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- rflags &= ~X86_EFLAGS_TF;
- return rflags;
-}
-EXPORT_SYMBOL_GPL(kvm_get_rflags);
-
-void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
- kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
- rflags |= X86_EFLAGS_TF;
- kvm_x86_ops->set_rflags(vcpu, rflags);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_set_rflags);
-
-void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
-{
- int r;
-
- if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
- is_error_page(work->page))
- return;
-
- r = kvm_mmu_reload(vcpu);
- if (unlikely(r))
- return;
-
- if (!vcpu->arch.mmu.direct_map &&
- work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
- return;
-
- vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
-}
-
-static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
-{
- return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
-}
-
-static inline u32 kvm_async_pf_next_probe(u32 key)
-{
- return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
-}
-
-static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- u32 key = kvm_async_pf_hash_fn(gfn);
-
- while (vcpu->arch.apf.gfns[key] != ~0)
- key = kvm_async_pf_next_probe(key);
-
- vcpu->arch.apf.gfns[key] = gfn;
-}
-
-static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- int i;
- u32 key = kvm_async_pf_hash_fn(gfn);
-
- for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
- (vcpu->arch.apf.gfns[key] != gfn &&
- vcpu->arch.apf.gfns[key] != ~0); i++)
- key = kvm_async_pf_next_probe(key);
-
- return key;
-}
-
-bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
-}
-
-static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- u32 i, j, k;
-
- i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
- while (true) {
- vcpu->arch.apf.gfns[i] = ~0;
- do {
- j = kvm_async_pf_next_probe(j);
- if (vcpu->arch.apf.gfns[j] == ~0)
- return;
- k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
- /*
- * k lies cyclically in ]i,j]
- * | i.k.j |
- * |....j i.k.| or |.k..j i...|
- */
- } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
- vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
- i = j;
- }
-}
-
-static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
-{
-
- return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
- sizeof(val));
-}
-
-void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
- struct kvm_async_pf *work)
-{
- struct x86_exception fault;
-
- trace_kvm_async_pf_not_present(work->arch.token, work->gva);
- kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
-
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
- (vcpu->arch.apf.send_user_only &&
- kvm_x86_ops->get_cpl(vcpu) == 0))
- kvm_make_request(KVM_REQ_APF_HALT, vcpu);
- else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
- fault.vector = PF_VECTOR;
- fault.error_code_valid = true;
- fault.error_code = 0;
- fault.nested_page_fault = false;
- fault.address = work->arch.token;
- kvm_inject_page_fault(vcpu, &fault);
- }
-}
-
-void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
- struct kvm_async_pf *work)
-{
- struct x86_exception fault;
-
- trace_kvm_async_pf_ready(work->arch.token, work->gva);
- if (is_error_page(work->page))
- work->arch.token = ~0; /* broadcast wakeup */
- else
- kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
-
- if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
- !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
- fault.vector = PF_VECTOR;
- fault.error_code_valid = true;
- fault.error_code = 0;
- fault.nested_page_fault = false;
- fault.address = work->arch.token;
- kvm_inject_page_fault(vcpu, &fault);
- }
- vcpu->arch.apf.halted = false;
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-}
-
-bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
-{
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
- return true;
- else
- return !kvm_event_needs_reinjection(vcpu) &&
- kvm_x86_ops->interrupt_allowed(vcpu);
-}
-
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
diff --git a/ANDROID_3.4.5/arch/x86/kvm/x86.h b/ANDROID_3.4.5/arch/x86/kvm/x86.h
deleted file mode 100644
index cb80c293..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/x86.h
+++ /dev/null
@@ -1,127 +0,0 @@
-#ifndef ARCH_X86_KVM_X86_H
-#define ARCH_X86_KVM_X86_H
-
-#include <linux/kvm_host.h>
-#include "kvm_cache_regs.h"
-
-static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.exception.pending = false;
-}
-
-static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
- bool soft)
-{
- vcpu->arch.interrupt.pending = true;
- vcpu->arch.interrupt.soft = soft;
- vcpu->arch.interrupt.nr = vector;
-}
-
-static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.interrupt.pending = false;
-}
-
-static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending ||
- vcpu->arch.nmi_injected;
-}
-
-static inline bool kvm_exception_is_soft(unsigned int nr)
-{
- return (nr == BP_VECTOR) || (nr == OF_VECTOR);
-}
-
-static inline bool is_protmode(struct kvm_vcpu *vcpu)
-{
- return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
-}
-
-static inline int is_long_mode(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
- return vcpu->arch.efer & EFER_LMA;
-#else
- return 0;
-#endif
-}
-
-static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
-}
-
-static inline int is_pae(struct kvm_vcpu *vcpu)
-{
- return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
-}
-
-static inline int is_pse(struct kvm_vcpu *vcpu)
-{
- return kvm_read_cr4_bits(vcpu, X86_CR4_PSE);
-}
-
-static inline int is_paging(struct kvm_vcpu *vcpu)
-{
- return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
-}
-
-static inline u32 bit(int bitno)
-{
- return 1 << (bitno & 31);
-}
-
-static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
- gva_t gva, gfn_t gfn, unsigned access)
-{
- vcpu->arch.mmio_gva = gva & PAGE_MASK;
- vcpu->arch.access = access;
- vcpu->arch.mmio_gfn = gfn;
-}
-
-/*
- * Clear the mmio cache info for the given gva,
- * specially, if gva is ~0ul, we clear all mmio cache info.
- */
-static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
-{
- if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
- return;
-
- vcpu->arch.mmio_gva = 0;
-}
-
-static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
-{
- if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK))
- return true;
-
- return false;
-}
-
-static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
- if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
- return true;
-
- return false;
-}
-
-void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
-void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
-
-void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
-
-int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
- gva_t addr, void *val, unsigned int bytes,
- struct x86_exception *exception);
-
-int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
- gva_t addr, void *val, unsigned int bytes,
- struct x86_exception *exception);
-
-extern u64 host_xcr0;
-
-#endif