[PATCH 5/14] KVM: virtualization infrastructure

- ioctl()
- mmap()
- vcpu context management (vcpu_load/vcpu_put)
- some control register logic

Signed-off-by: Yaniv Kamay <yaniv@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- /dev/null
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -0,0 +1,1355 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Avi Kivity   <avi@qumranet.com>
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *
+ */
+
+#include "kvm.h"
+
+#include <linux/kvm.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <asm/processor.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <asm/msr.h>
+#include <linux/mm.h>
+#include <linux/miscdevice.h>
+#include <linux/vmalloc.h>
+#include <asm/uaccess.h>
+#include <linux/reboot.h>
+#include <asm/io.h>
+#include <linux/debugfs.h>
+#include <linux/highmem.h>
+#include <linux/file.h>
+
+#include "vmx.h"
+#include "x86_emulate.h"
+
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+
+struct kvm_stat kvm_stat;
+
+static struct kvm_stats_debugfs_item {
+	const char *name;
+	u32 *data;
+	struct dentry *dentry;
+} debugfs_entries[] = {
+	{ "pf_fixed", &kvm_stat.pf_fixed },
+	{ "pf_guest", &kvm_stat.pf_guest },
+	{ "tlb_flush", &kvm_stat.tlb_flush },
+	{ "invlpg", &kvm_stat.invlpg },
+	{ "exits", &kvm_stat.exits },
+	{ "io_exits", &kvm_stat.io_exits },
+	{ "mmio_exits", &kvm_stat.mmio_exits },
+	{ "signal_exits", &kvm_stat.signal_exits },
+	{ "irq_exits", &kvm_stat.irq_exits },
+	{ 0, 0 }
+};
+
+static struct dentry *debugfs_dir;
+
+static const u32 vmx_msr_index[] = {
+	MSR_EFER, MSR_K6_STAR,
+#ifdef __x86_64__
+	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR
+#endif
+};
+#define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index))
+
+
+#ifdef __x86_64__
+/*
+ * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt
+ * mechanism (cpu bug AA24)
+ */
+#define NUM_AUTO_MSRS (NR_VMX_MSR-2)
+#else
+#define NUM_AUTO_MSRS NR_VMX_MSR
+#endif
+
+#define TSS_IOPB_BASE_OFFSET 0x66
+#define TSS_BASE_SIZE 0x68
+#define TSS_IOPB_SIZE (65536 / 8)
+#define TSS_REDIRECTION_SIZE (256 / 8)
+#define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
+
+#define MSR_IA32_FEATURE_CONTROL 		0x03a
+#define MSR_IA32_VMX_BASIC_MSR   		0x480
+#define MSR_IA32_VMX_PINBASED_CTLS_MSR		0x481
+#define MSR_IA32_VMX_PROCBASED_CTLS_MSR		0x482
+#define MSR_IA32_VMX_EXIT_CTLS_MSR		0x483
+#define MSR_IA32_VMX_ENTRY_CTLS_MSR		0x484
+
+#define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
+#define LMSW_GUEST_MASK 0x0eULL
+#define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
+#define CR4_VMXE 0x2000
+#define CR8_RESEVED_BITS (~0x0fULL)
+#define EFER_RESERVED_BITS 0xfffffffffffff2fe
+
+#ifdef __x86_64__
+#define HOST_IS_64 1
+#else
+#define HOST_IS_64 0
+#endif
+
+static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr)
+{
+	int i;
+
+	for (i = 0; i < NR_VMX_MSR; ++i)
+		if (vmx_msr_index[i] == msr)
+			return &vcpu->guest_msrs[i];
+	return 0;
+}
+
+struct descriptor_table {
+	u16 limit;
+	unsigned long base;
+} __attribute__((packed));
+
+static void get_gdt(struct descriptor_table *table)
+{
+	asm ("sgdt %0" : "=m"(*table));
+}
+
+static void get_idt(struct descriptor_table *table)
+{
+	asm ("sidt %0" : "=m"(*table));
+}
+
+static u16 read_fs(void)
+{
+	u16 seg;
+	asm ("mov %%fs, %0" : "=g"(seg));
+	return seg;
+}
+
+static u16 read_gs(void)
+{
+	u16 seg;
+	asm ("mov %%gs, %0" : "=g"(seg));
+	return seg;
+}
+
+static u16 read_ldt(void)
+{
+	u16 ldt;
+	asm ("sldt %0" : "=g"(ldt));
+	return ldt;
+}
+
+static void load_fs(u16 sel)
+{
+	asm ("mov %0, %%fs" : : "g"(sel));
+}
+
+static void load_gs(u16 sel)
+{
+	asm ("mov %0, %%gs" : : "g"(sel));
+}
+
+#ifndef load_ldt
+static void load_ldt(u16 sel)
+{
+	asm ("lldt %0" : : "g"(sel));
+}
+#endif
+
+static void fx_save(void *image)
+{
+	asm ("fxsave (%0)":: "r" (image));
+}
+
+static void fx_restore(void *image)
+{
+	asm ("fxrstor (%0)":: "r" (image));
+}
+
+static void fpu_init(void)
+{
+	asm ("finit");
+}
+
+struct segment_descriptor {
+	u16 limit_low;
+	u16 base_low;
+	u8  base_mid;
+	u8  type : 4;
+	u8  system : 1;
+	u8  dpl : 2;
+	u8  present : 1;
+	u8  limit_high : 4;
+	u8  avl : 1;
+	u8  long_mode : 1;
+	u8  default_op : 1;
+	u8  granularity : 1;
+	u8  base_high;
+} __attribute__((packed));
+
+#ifdef __x86_64__
+// LDT or TSS descriptor in the GDT. 16 bytes.
+struct segment_descriptor_64 {
+	struct segment_descriptor s;
+	u32 base_higher;
+	u32 pad_zero;
+};
+
+#endif
+
+static unsigned long segment_base(u16 selector)
+{
+	struct descriptor_table gdt;
+	struct segment_descriptor *d;
+	unsigned long table_base;
+	typedef unsigned long ul;
+	unsigned long v;
+
+	asm ("sgdt %0" : "=m"(gdt));
+	table_base = gdt.base;
+
+	if (selector & 4) {           /* from ldt */
+		u16 ldt_selector;
+
+		asm ("sldt %0" : "=g"(ldt_selector));
+		table_base = segment_base(ldt_selector);
+	}
+	d = (struct segment_descriptor *)(table_base + (selector & ~7));
+	v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
+#ifdef __x86_64__
+	if (d->system == 0
+	    && (d->type == 2 || d->type == 9 || d->type == 11))
+		v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
+#endif
+	return v;
+}
+
+static unsigned long read_tr_base(void)
+{
+	u16 tr;
+	asm ("str %0" : "=g"(tr));
+	return segment_base(tr);
+}
+
+static void reload_tss(void)
+{
+#ifndef __x86_64__
+
+	/*
+	 * VT restores TR but not its size.  Useless.
+	 */
+	struct descriptor_table gdt;
+	struct segment_descriptor *descs;
+
+	get_gdt(&gdt);
+	descs = (void *)gdt.base;
+	descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
+	load_TR_desc();
+#endif
+}
+
+static DEFINE_PER_CPU(struct vmcs *, vmxarea);
+static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
+
+static struct vmcs_descriptor {
+	int size;
+	int order;
+	u32 revision_id;
+} vmcs_descriptor;
+
+#ifdef __x86_64__
+static unsigned long read_msr(unsigned long msr)
+{
+	u64 value;
+
+	rdmsrl(msr, value);
+	return value;
+}
+#endif
+
+static inline struct page *_gfn_to_page(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+	return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : 0;
+}
+
+
+
+int kvm_read_guest(struct kvm_vcpu *vcpu,
+			     gva_t addr,
+			     unsigned long size,
+			     void *dest)
+{
+	unsigned char *host_buf = dest;
+	unsigned long req_size = size;
+
+	while (size) {
+		hpa_t paddr;
+		unsigned now;
+		unsigned offset;
+		hva_t guest_buf;
+
+		paddr = gva_to_hpa(vcpu, addr);
+
+		if (is_error_hpa(paddr))
+			break;
+
+		guest_buf = (hva_t)kmap_atomic(
+					pfn_to_page(paddr >> PAGE_SHIFT),
+					KM_USER0);
+		offset = addr & ~PAGE_MASK;
+		guest_buf |= offset;
+		now = min(size, PAGE_SIZE - offset);
+		memcpy(host_buf, (void*)guest_buf, now);
+		host_buf += now;
+		addr += now;
+		size -= now;
+		kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
+	}
+	return req_size - size;
+}
+
+int kvm_write_guest(struct kvm_vcpu *vcpu,
+			     gva_t addr,
+			     unsigned long size,
+			     void *data)
+{
+	unsigned char *host_buf = data;
+	unsigned long req_size = size;
+
+	while (size) {
+		hpa_t paddr;
+		unsigned now;
+		unsigned offset;
+		hva_t guest_buf;
+
+		paddr = gva_to_hpa(vcpu, addr);
+
+		if (is_error_hpa(paddr))
+			break;
+
+		guest_buf = (hva_t)kmap_atomic(
+				pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0);
+		offset = addr & ~PAGE_MASK;
+		guest_buf |= offset;
+		now = min(size, PAGE_SIZE - offset);
+		memcpy((void*)guest_buf, host_buf, now);
+		host_buf += now;
+		addr += now;
+		size -= now;
+		kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
+	}
+	return req_size - size;
+}
+
+static __init void setup_vmcs_descriptor(void)
+{
+	u32 vmx_msr_low, vmx_msr_high;
+
+	rdmsr(MSR_IA32_VMX_BASIC_MSR, vmx_msr_low, vmx_msr_high);
+	vmcs_descriptor.size = vmx_msr_high & 0x1fff;
+	vmcs_descriptor.order = get_order(vmcs_descriptor.size);
+	vmcs_descriptor.revision_id = vmx_msr_low;
+};
+
+static void vmcs_clear(struct vmcs *vmcs)
+{
+	u64 phys_addr = __pa(vmcs);
+	u8 error;
+
+	asm volatile ("vmclear %1; setna %0"
+		       : "=m"(error) : "m"(phys_addr) : "cc", "memory" );
+	if (error)
+		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
+		       vmcs, phys_addr);
+}
+
+static void __vcpu_clear(void *arg)
+{
+	struct kvm_vcpu *vcpu = arg;
+	int cpu = smp_processor_id();
+
+	if (vcpu->cpu == cpu)
+		vmcs_clear(vcpu->vmcs);
+	if (per_cpu(current_vmcs, cpu) == vcpu->vmcs)
+		per_cpu(current_vmcs, cpu) = 0;
+}
+
+static int vcpu_slot(struct kvm_vcpu *vcpu)
+{
+	return vcpu - vcpu->kvm->vcpus;
+}
+
+/*
+ * Switches to specified vcpu, until a matching vcpu_put(), but assumes
+ * vcpu mutex is already taken.
+ */
+static struct kvm_vcpu *__vcpu_load(struct kvm_vcpu *vcpu)
+{
+	u64 phys_addr = __pa(vcpu->vmcs);
+	int cpu;
+
+	cpu = get_cpu();
+
+	if (vcpu->cpu != cpu) {
+		smp_call_function(__vcpu_clear, vcpu, 0, 1);
+		vcpu->launched = 0;
+	}
+
+	if (per_cpu(current_vmcs, cpu) != vcpu->vmcs) {
+		u8 error;
+
+		per_cpu(current_vmcs, cpu) = vcpu->vmcs;
+		asm volatile ("vmptrld %1; setna %0"
+			       : "=m"(error) : "m"(phys_addr) : "cc" );
+		if (error)
+			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
+			       vcpu->vmcs, phys_addr);
+	}
+
+	if (vcpu->cpu != cpu) {
+		struct descriptor_table dt;
+		unsigned long sysenter_esp;
+
+		vcpu->cpu = cpu;
+		/*
+		 * Linux uses per-cpu TSS and GDT, so set these when switching
+		 * processors.
+		 */
+		vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
+		get_gdt(&dt);
+		vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
+
+		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
+		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+	}
+	return vcpu;
+}
+
+/*
+ * Switches to specified vcpu, until a matching vcpu_put()
+ */
+static struct kvm_vcpu *vcpu_load(struct kvm *kvm, int vcpu_slot)
+{
+	struct kvm_vcpu *vcpu = &kvm->vcpus[vcpu_slot];
+
+	mutex_lock(&vcpu->mutex);
+	if (unlikely(!vcpu->vmcs)) {
+		mutex_unlock(&vcpu->mutex);
+		return 0;
+	}
+	return __vcpu_load(vcpu);
+}
+
+static void vcpu_put(struct kvm_vcpu *vcpu)
+{
+	put_cpu();
+	mutex_unlock(&vcpu->mutex);
+}
+
+
+static struct vmcs *alloc_vmcs_cpu(int cpu)
+{
+	int node = cpu_to_node(cpu);
+	struct page *pages;
+	struct vmcs *vmcs;
+
+	pages = alloc_pages_node(node, GFP_KERNEL, vmcs_descriptor.order);
+	if (!pages)
+		return 0;
+	vmcs = page_address(pages);
+	memset(vmcs, 0, vmcs_descriptor.size);
+	vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */
+	return vmcs;
+}
+
+static struct vmcs *alloc_vmcs(void)
+{
+	return alloc_vmcs_cpu(smp_processor_id());
+}
+
+static void free_vmcs(struct vmcs *vmcs)
+{
+	free_pages((unsigned long)vmcs, vmcs_descriptor.order);
+}
+
+static __init int cpu_has_kvm_support(void)
+{
+	unsigned long ecx = cpuid_ecx(1);
+	return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
+}
+
+static __exit void free_kvm_area(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		free_vmcs(per_cpu(vmxarea, cpu));
+}
+
+static __init int alloc_kvm_area(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		struct vmcs *vmcs;
+
+		vmcs = alloc_vmcs_cpu(cpu);
+		if (!vmcs) {
+			free_kvm_area();
+			return -ENOMEM;
+		}
+
+		per_cpu(vmxarea, cpu) = vmcs;
+	}
+	return 0;
+}
+
+static __init int vmx_disabled_by_bios(void)
+{
+	u64 msr;
+
+	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
+	return (msr & 5) == 1; /* locked but not enabled */
+}
+
+static __init void kvm_enable(void *garbage)
+{
+	int cpu = raw_smp_processor_id();
+	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+	u64 old;
+
+	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
+	if ((old & 5) == 0)
+		/* enable and lock */
+		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 5);
+	write_cr4(read_cr4() | CR4_VMXE); /* FIXME: not cpu hotplug safe */
+	asm volatile ("vmxon %0" : : "m"(phys_addr) : "memory", "cc");
+}
+
+static void kvm_disable(void *garbage)
+{
+	asm volatile ("vmxoff" : : : "cc");
+}
+
+static int kvm_dev_open(struct inode *inode, struct file *filp)
+{
+	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+	int i;
+
+	if (!kvm)
+		return -ENOMEM;
+
+	spin_lock_init(&kvm->lock);
+	INIT_LIST_HEAD(&kvm->active_mmu_pages);
+	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+		struct kvm_vcpu *vcpu = &kvm->vcpus[i];
+
+		mutex_init(&vcpu->mutex);
+		vcpu->mmu.root_hpa = INVALID_PAGE;
+		INIT_LIST_HEAD(&vcpu->free_pages);
+	}
+	filp->private_data = kvm;
+	return 0;
+}
+
+/*
+ * Free any memory in @free but not in @dont.
+ */
+static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
+				  struct kvm_memory_slot *dont)
+{
+	int i;
+
+	if (!dont || free->phys_mem != dont->phys_mem)
+		if (free->phys_mem) {
+			for (i = 0; i < free->npages; ++i)
+				__free_page(free->phys_mem[i]);
+			vfree(free->phys_mem);
+		}
+
+	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
+		vfree(free->dirty_bitmap);
+
+	free->phys_mem = 0;
+	free->npages = 0;
+	free->dirty_bitmap = 0;
+}
+
+static void kvm_free_physmem(struct kvm *kvm)
+{
+	int i;
+
+	for (i = 0; i < kvm->nmemslots; ++i)
+		kvm_free_physmem_slot(&kvm->memslots[i], 0);
+}
+
+static void kvm_free_vmcs(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->vmcs) {
+		on_each_cpu(__vcpu_clear, vcpu, 0, 1);
+		free_vmcs(vcpu->vmcs);
+		vcpu->vmcs = 0;
+	}
+}
+
+static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
+{
+	kvm_free_vmcs(vcpu);
+	kvm_mmu_destroy(vcpu);
+}
+
+static void kvm_free_vcpus(struct kvm *kvm)
+{
+	unsigned int i;
+
+	for (i = 0; i < KVM_MAX_VCPUS; ++i)
+		kvm_free_vcpu(&kvm->vcpus[i]);
+}
+
+static int kvm_dev_release(struct inode *inode, struct file *filp)
+{
+	struct kvm *kvm = filp->private_data;
+
+	kvm_free_vcpus(kvm);
+	kvm_free_physmem(kvm);
+	kfree(kvm);
+	return 0;
+}
+
+unsigned long vmcs_readl(unsigned long field)
+{
+	unsigned long value;
+
+	asm volatile ("vmread %1, %0" : "=g"(value) : "r"(field) : "cc");
+	return value;
+}
+
+void vmcs_writel(unsigned long field, unsigned long value)
+{
+	u8 error;
+
+	asm volatile ("vmwrite %1, %2; setna %0"
+		       : "=g"(error) : "r"(value), "r"(field) : "cc" );
+	if (error)
+		printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
+		       field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
+}
+
+static void vmcs_write16(unsigned long field, u16 value)
+{
+	vmcs_writel(field, value);
+}
+
+static void vmcs_write64(unsigned long field, u64 value)
+{
+#ifdef __x86_64__
+	vmcs_writel(field, value);
+#else
+	vmcs_writel(field, value);
+	asm volatile ("");
+	vmcs_writel(field+1, value >> 32);
+#endif
+}
+
+static void inject_gp(struct kvm_vcpu *vcpu)
+{
+	printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n",
+	       vmcs_readl(GUEST_RIP));
+	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+		     GP_VECTOR |
+		     INTR_TYPE_EXCEPTION |
+		     INTR_INFO_DELIEVER_CODE_MASK |
+		     INTR_INFO_VALID_MASK);
+}
+
+static void update_exception_bitmap(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->rmode.active)
+		vmcs_write32(EXCEPTION_BITMAP, ~0);
+	else
+		vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
+}
+
+static void enter_pmode(struct kvm_vcpu *vcpu)
+{
+	unsigned long flags;
+
+	vcpu->rmode.active = 0;
+
+	vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
+	vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
+	vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
+
+	flags = vmcs_readl(GUEST_RFLAGS);
+	flags &= ~(IOPL_MASK | X86_EFLAGS_VM);
+	flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
+	vmcs_writel(GUEST_RFLAGS, flags);
+
+	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
+			(vmcs_readl(CR0_READ_SHADOW) & CR4_VME_MASK) );
+
+	update_exception_bitmap(vcpu);
+
+	#define FIX_PMODE_DATASEG(seg, save) {				\
+			vmcs_write16(GUEST_##seg##_SELECTOR, 0); 	\
+			vmcs_writel(GUEST_##seg##_BASE, 0); 		\
+			vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);	\
+			vmcs_write32(GUEST_##seg##_AR_BYTES, 0x93);	\
+	}
+
+	FIX_PMODE_DATASEG(SS, vcpu->rmode.ss);
+	FIX_PMODE_DATASEG(ES, vcpu->rmode.es);
+	FIX_PMODE_DATASEG(DS, vcpu->rmode.ds);
+	FIX_PMODE_DATASEG(GS, vcpu->rmode.gs);
+	FIX_PMODE_DATASEG(FS, vcpu->rmode.fs);
+
+	vmcs_write16(GUEST_CS_SELECTOR,
+		     vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
+	vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
+}
+
+static int rmode_tss_base(struct kvm* kvm)
+{
+	gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3;
+	return base_gfn << PAGE_SHIFT;
+}
+
+static void enter_rmode(struct kvm_vcpu *vcpu)
+{
+	unsigned long flags;
+
+	vcpu->rmode.active = 1;
+
+	vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
+	vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
+
+	vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
+	vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
+
+	vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
+	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
+
+	flags = vmcs_readl(GUEST_RFLAGS);
+	vcpu->rmode.save_iopl = (flags & IOPL_MASK) >> IOPL_SHIFT;
+
+	flags |= IOPL_MASK | X86_EFLAGS_VM;
+
+	vmcs_writel(GUEST_RFLAGS, flags);
+	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
+	update_exception_bitmap(vcpu);
+
+	#define FIX_RMODE_SEG(seg, save) {				   \
+		vmcs_write16(GUEST_##seg##_SELECTOR, 			   \
+					vmcs_readl(GUEST_##seg##_BASE) >> 4); \
+		vmcs_write32(GUEST_##seg##_LIMIT, 0xffff);		   \
+		vmcs_write32(GUEST_##seg##_AR_BYTES, 0xf3);		   \
+	}
+
+	vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
+	vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
+
+	FIX_RMODE_SEG(ES, vcpu->rmode.es);
+	FIX_RMODE_SEG(DS, vcpu->rmode.ds);
+	FIX_RMODE_SEG(SS, vcpu->rmode.ss);
+	FIX_RMODE_SEG(GS, vcpu->rmode.gs);
+	FIX_RMODE_SEG(FS, vcpu->rmode.fs);
+}
+
+#ifdef __x86_64__
+
+static void __set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+	struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);
+
+	vcpu->shadow_efer = efer;
+	if (efer & EFER_LMA) {
+		vmcs_write32(VM_ENTRY_CONTROLS,
+				     vmcs_read32(VM_ENTRY_CONTROLS) |
+				     VM_ENTRY_CONTROLS_IA32E_MASK);
+		msr->data = efer;
+
+	} else {
+		vmcs_write32(VM_ENTRY_CONTROLS,
+				     vmcs_read32(VM_ENTRY_CONTROLS) &
+				     ~VM_ENTRY_CONTROLS_IA32E_MASK);
+
+		msr->data = efer & ~EFER_LME;
+	}
+}
+
+static void enter_lmode(struct kvm_vcpu *vcpu)
+{
+	u32 guest_tr_ar;
+
+	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
+	if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
+		printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
+		       __FUNCTION__);
+		vmcs_write32(GUEST_TR_AR_BYTES,
+			     (guest_tr_ar & ~AR_TYPE_MASK)
+			     | AR_TYPE_BUSY_64_TSS);
+	}
+
+	vcpu->shadow_efer |= EFER_LMA;
+
+	find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME;
+	vmcs_write32(VM_ENTRY_CONTROLS,
+		     vmcs_read32(VM_ENTRY_CONTROLS)
+		     | VM_ENTRY_CONTROLS_IA32E_MASK);
+}
+
+static void exit_lmode(struct kvm_vcpu *vcpu)
+{
+	vcpu->shadow_efer &= ~EFER_LMA;
+
+	vmcs_write32(VM_ENTRY_CONTROLS,
+		     vmcs_read32(VM_ENTRY_CONTROLS)
+		     & ~VM_ENTRY_CONTROLS_IA32E_MASK);
+}
+
+#endif
+
+static void __set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+	if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
+		enter_pmode(vcpu);
+
+	if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
+		enter_rmode(vcpu);
+
+#ifdef __x86_64__
+	if (vcpu->shadow_efer & EFER_LME) {
+		if (!is_paging() && (cr0 & CR0_PG_MASK))
+			enter_lmode(vcpu);
+		if (is_paging() && !(cr0 & CR0_PG_MASK))
+			exit_lmode(vcpu);
+	}
+#endif
+
+	vmcs_writel(CR0_READ_SHADOW, cr0);
+	vmcs_writel(GUEST_CR0, cr0 | KVM_VM_CR0_ALWAYS_ON);
+}
+
+static int pdptrs_have_reserved_bits_set(struct kvm_vcpu *vcpu,
+					 unsigned long cr3)
+{
+	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
+	unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5;
+	int i;
+	u64 pdpte;
+	u64 *pdpt;
+	struct kvm_memory_slot *memslot;
+
+	spin_lock(&vcpu->kvm->lock);
+	memslot = gfn_to_memslot(vcpu->kvm, pdpt_gfn);
+	/* FIXME: !memslot - emulate? 0xff? */
+	pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0);
+
+	for (i = 0; i < 4; ++i) {
+		pdpte = pdpt[offset + i];
+		if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
+			break;
+	}
+
+	kunmap_atomic(pdpt, KM_USER0);
+	spin_unlock(&vcpu->kvm->lock);
+
+	return i != 4;
+}
+
+static void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+	if (cr0 & CR0_RESEVED_BITS) {
+		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
+		       cr0, guest_cr0());
+		inject_gp(vcpu);
+		return;
+	}
+
+	if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
+		printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
+		inject_gp(vcpu);
+		return;
+	}
+
+	if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
+		printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
+		       "and a clear PE flag\n");
+		inject_gp(vcpu);
+		return;
+	}
+
+	if (!is_paging() && (cr0 & CR0_PG_MASK)) {
+#ifdef __x86_64__
+		if ((vcpu->shadow_efer & EFER_LME)) {
+			u32 guest_cs_ar;
+			if (!is_pae()) {
+				printk(KERN_DEBUG "set_cr0: #GP, start paging "
+				       "in long mode while PAE is disabled\n");
+				inject_gp(vcpu);
+				return;
+			}
+			guest_cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
+			if (guest_cs_ar & SEGMENT_AR_L_MASK) {
+				printk(KERN_DEBUG "set_cr0: #GP, start paging "
+				       "in long mode while CS.L == 1\n");
+				inject_gp(vcpu);
+				return;
+
+			}
+		} else
+#endif
+		if (is_pae() &&
+			    pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
+			printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
+			       "reserved bits\n");
+			inject_gp(vcpu);
+			return;
+		}
+
+	}
+
+	__set_cr0(vcpu, cr0);
+	kvm_mmu_reset_context(vcpu);
+	return;
+}
+
+static void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
+{
+	unsigned long cr0 = guest_cr0();
+
+	if ((msw & CR0_PE_MASK) && !(cr0 & CR0_PE_MASK)) {
+		enter_pmode(vcpu);
+		vmcs_writel(CR0_READ_SHADOW, cr0 | CR0_PE_MASK);
+
+	} else
+		printk(KERN_DEBUG "lmsw: unexpected\n");
+
+	vmcs_writel(GUEST_CR0, (vmcs_readl(GUEST_CR0) & ~LMSW_GUEST_MASK)
+				| (msw & LMSW_GUEST_MASK));
+}
+
+static void __set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+	vmcs_writel(CR4_READ_SHADOW, cr4);
+	vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
+		    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
+}
+
+static void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+	if (cr4 & CR4_RESEVED_BITS) {
+		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
+		inject_gp(vcpu);
+		return;
+	}
+
+	if (is_long_mode()) {
+		if (!(cr4 & CR4_PAE_MASK)) {
+			printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
+			       "in long mode\n");
+			inject_gp(vcpu);
+			return;
+		}
+	} else if (is_paging() && !is_pae() && (cr4 & CR4_PAE_MASK)
+		   && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
+		printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
+		inject_gp(vcpu);
+	}
+
+	if (cr4 & CR4_VMXE_MASK) {
+		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
+		inject_gp(vcpu);
+		return;
+	}
+	__set_cr4(vcpu, cr4);
+	spin_lock(&vcpu->kvm->lock);
+	kvm_mmu_reset_context(vcpu);
+	spin_unlock(&vcpu->kvm->lock);
+}
+
+static void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+	if (is_long_mode()) {
+		if ( cr3 & CR3_L_MODE_RESEVED_BITS) {
+			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
+			inject_gp(vcpu);
+			return;
+		}
+	} else {
+		if (cr3 & CR3_RESEVED_BITS) {
+			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
+			inject_gp(vcpu);
+			return;
+		}
+		if (is_paging() && is_pae() &&
+		    pdptrs_have_reserved_bits_set(vcpu, cr3)) {
+			printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
+			       "reserved bits\n");
+			inject_gp(vcpu);
+			return;
+		}
+	}
+
+	vcpu->cr3 = cr3;
+	spin_lock(&vcpu->kvm->lock);
+	vcpu->mmu.new_cr3(vcpu);
+	spin_unlock(&vcpu->kvm->lock);
+}
+
+static void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+	if ( cr8 & CR8_RESEVED_BITS) {
+		printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
+		inject_gp(vcpu);
+		return;
+	}
+	vcpu->cr8 = cr8;
+}
+
+/*
+ * Sync the rsp and rip registers into the vcpu structure.  This allows
+ * registers to be accessed by indexing vcpu->regs.
+ */
+static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
+{
+	vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
+	vcpu->rip = vmcs_readl(GUEST_RIP);
+}
+
+/*
+ * Syncs rsp and rip back into the vmcs.  Should be called after possible
+ * modification.
+ */
+static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
+{
+	vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
+	vmcs_writel(GUEST_RIP, vcpu->rip);
+}
+
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+{
+	int i;
+
+	for (i = 0; i < kvm->nmemslots; ++i) {
+		struct kvm_memory_slot *memslot = &kvm->memslots[i];
+
+		if (gfn >= memslot->base_gfn
+		    && gfn < memslot->base_gfn + memslot->npages)
+			return memslot;
+	}
+	return 0;
+}
+
+void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
+{
+	int i;
+	struct kvm_memory_slot *memslot = 0;
+	unsigned long rel_gfn;
+
+	for (i = 0; i < kvm->nmemslots; ++i) {
+		memslot = &kvm->memslots[i];
+
+		if (gfn >= memslot->base_gfn
+		    && gfn < memslot->base_gfn + memslot->npages) {
+
+			if (!memslot || !memslot->dirty_bitmap)
+				return;
+
+			rel_gfn = gfn - memslot->base_gfn;
+
+			/* avoid RMW */
+			if (!test_bit(rel_gfn, memslot->dirty_bitmap))
+				set_bit(rel_gfn, memslot->dirty_bitmap);
+			return;
+		}
+	}
+}
+
+static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+	unsigned long rip;
+	u32 interruptibility;
+
+	rip = vmcs_readl(GUEST_RIP);
+	rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	vmcs_writel(GUEST_RIP, rip);
+
+	/*
+	 * We emulated an instruction, so temporary interrupt blocking
+	 * should be removed, if set.
+	 */
+	interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	if (interruptibility & 3)
+		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+			     interruptibility & ~3);
+}
+
+static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+{
+	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+}
+
+void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
+{
+	vmcs_writel(GUEST_GDTR_BASE, base);
+	vmcs_write32(GUEST_GDTR_LIMIT, limit);
+}
+
+void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
+{
+	vmcs_writel(GUEST_IDTR_BASE, base);
+	vmcs_write32(GUEST_IDTR_LIMIT, limit);
+}
+
+void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
+		   unsigned long *rflags)
+{
+	lmsw(vcpu, msw);
+	*rflags = vmcs_readl(GUEST_RFLAGS);
+}
+
+unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
+{
+	switch (cr) {
+	case 0:
+		return guest_cr0();
+	case 2:
+		return vcpu->cr2;
+	case 3:
+		return vcpu->cr3;
+	case 4:
+		return guest_cr4();
+	default:
+		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+		return 0;
+	}
+}
+
+void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
+		     unsigned long *rflags)
+{
+	switch (cr) {
+	case 0:
+		set_cr0(vcpu, mk_cr_64(guest_cr0(), val));
+		*rflags = vmcs_readl(GUEST_RFLAGS);
+		break;
+	case 2:
+		vcpu->cr2 = val;
+		break;
+	case 3:
+		set_cr3(vcpu, val);
+		break;
+	case 4:
+		set_cr4(vcpu, mk_cr_64(guest_cr4(), val));
+		break;
+	default:
+		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+	}
+}
+
+#ifdef __x86_64__
+
+static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+	struct vmx_msr_entry *msr;
+
+	if (efer & EFER_RESERVED_BITS) {
+		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
+		       efer);
+		inject_gp(vcpu);
+		return;
+	}
+
+	if (is_paging() && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
+		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
+		inject_gp(vcpu);
+		return;
+	}
+
+	efer &= ~EFER_LMA;
+	efer |= vcpu->shadow_efer & EFER_LMA;
+
+	vcpu->shadow_efer = efer;
+
+	msr = find_msr_entry(vcpu, MSR_EFER);
+
+	if (!(efer & EFER_LMA))
+	    efer &= ~EFER_LME;
+	msr->data = efer;
+	skip_emulated_instruction(vcpu);
+}
+
+#endif
+
+static long kvm_dev_ioctl(struct file *filp,
+			  unsigned int ioctl, unsigned long arg)
+{
+	struct kvm *kvm = filp->private_data;
+	int r = -EINVAL;
+
+	switch (ioctl) {
+	default:
+		;
+	}
+out:
+	return r;
+}
+
+static struct page *kvm_dev_nopage(struct vm_area_struct *vma,
+				   unsigned long address,
+				   int *type)
+{
+	struct kvm *kvm = vma->vm_file->private_data;
+	unsigned long pgoff;
+	struct kvm_memory_slot *slot;
+	struct page *page;
+
+	*type = VM_FAULT_MINOR;
+	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	slot = gfn_to_memslot(kvm, pgoff);
+	if (!slot)
+		return NOPAGE_SIGBUS;
+	page = gfn_to_page(slot, pgoff);
+	if (!page)
+		return NOPAGE_SIGBUS;
+	get_page(page);
+	return page;
+}
+
+static struct vm_operations_struct kvm_dev_vm_ops = {
+	.nopage = kvm_dev_nopage,
+};
+
+static int kvm_dev_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &kvm_dev_vm_ops;
+	return 0;
+}
+
+static struct file_operations kvm_chardev_ops = {
+	.owner		= THIS_MODULE,
+	.open		= kvm_dev_open,
+	.release        = kvm_dev_release,
+	.unlocked_ioctl = kvm_dev_ioctl,
+	.compat_ioctl   = kvm_dev_ioctl,
+	.mmap           = kvm_dev_mmap,
+};
+
+static struct miscdevice kvm_dev = {
+	MISC_DYNAMIC_MINOR,
+	"kvm",
+	&kvm_chardev_ops,
+};
+
+static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
+                       void *v)
+{
+	if (val == SYS_RESTART) {
+		/*
+		 * Some (well, at least mine) BIOSes hang on reboot if
+		 * in vmx root mode.
+		 */
+		printk(KERN_INFO "kvm: exiting vmx mode\n");
+		on_each_cpu(kvm_disable, 0, 0, 1);
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block kvm_reboot_notifier = {
+	.notifier_call = kvm_reboot,
+	.priority = 0,
+};
+
+static __init void kvm_init_debug(void)
+{
+	struct kvm_stats_debugfs_item *p;
+
+	debugfs_dir = debugfs_create_dir("kvm", 0);
+	for (p = debugfs_entries; p->name; ++p)
+		p->dentry = debugfs_create_u32(p->name, 0444, debugfs_dir,
+					       p->data);
+}
+
+static void kvm_exit_debug(void)
+{
+	struct kvm_stats_debugfs_item *p;
+
+	for (p = debugfs_entries; p->name; ++p)
+		debugfs_remove(p->dentry);
+	debugfs_remove(debugfs_dir);
+}
+
+hpa_t bad_page_address;
+
+static __init int kvm_init(void)
+{
+	static struct page *bad_page;
+	int r = 0;
+
+	if (!cpu_has_kvm_support()) {
+		printk(KERN_ERR "kvm: no hardware support\n");
+		return -EOPNOTSUPP;
+	}
+	if (vmx_disabled_by_bios()) {
+		printk(KERN_ERR "kvm: disabled by bios\n");
+		return -EOPNOTSUPP;
+	}
+
+	kvm_init_debug();
+
+	setup_vmcs_descriptor();
+	r = alloc_kvm_area();
+	if (r)
+		goto out;
+	on_each_cpu(kvm_enable, 0, 0, 1);
+	register_reboot_notifier(&kvm_reboot_notifier);
+
+	r = misc_register(&kvm_dev);
+	if (r) {
+		printk (KERN_ERR "kvm: misc device register failed\n");
+		goto out_free;
+	}
+
+
+	if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
+		r = -ENOMEM;
+		goto out_free;
+	}
+
+	bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
+	memset(__va(bad_page_address), 0, PAGE_SIZE);
+
+	return r;
+
+out_free:
+	free_kvm_area();
+out:
+	kvm_exit_debug();
+	return r;
+}
+
+static __exit void kvm_exit(void)
+{
+	kvm_exit_debug();
+	misc_deregister(&kvm_dev);
+	unregister_reboot_notifier(&kvm_reboot_notifier);
+	on_each_cpu(kvm_disable, 0, 0, 1);
+	free_kvm_area();
+	__free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
+}
+
+module_init(kvm_init)
+module_exit(kvm_exit)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
References:
- [PATCH 0/14] KVM: Kernel-based Virtual Machine (v4)
  - From: Avi Kivity <avi@qumranet.com>
Prev by Date: [PATCH 4/14] KVM: random accessors and constants
Next by Date: [PATCH 6/14] KVM: memory slot management
Previous by thread: [PATCH 4/14] KVM: random accessors and constants
Next by thread: [PATCH 6/14] KVM: memory slot management
Index(es):
- Date
- Thread
[Index of Archives] [Kernel Newbies] [Netfilter] [Bugtraq] [Photo] [Stuff] [Gimp] [Yosemite News] [MIPS Linux] [ARM Linux] [Linux Security] [Linux RAID] [Video 4 Linux] [Linux for the blind] [Linux Resources]