I had originally hoped to get this in for 2.6.20. It now looks like .20
will have a shorter cycle than usual, and the mmu took a bit longer than
expected, so it's more realistic to aim for 2.6.21.
The current kvm userspace interface has several deficiencies:
- open("/dev/kvm") returns a different object (a new vm) per invocation;
this is "unusual" by Linux standards
- all vcpus share the same inode and struct file, which can cause
scalability problems on very large smps. This isn't a problem for
current hardware, which has moderate core counts and huge vmexit
latencies, not to mention a limit of one vcpu per vm, but I'd like to
future-proof the interface.
- the KVM_VCPU_RUN ioctl() copies a needless chunk of data back and forth
- the PIO handlers communicate by means of registers (for single I/O) or
virtual addresses (for string I/O). Instead the values should be
explicit fields in some structure, and physical addresses should be used
to remove the need to translate addresses in userspace.
- the interrupt code still needs work to properly support the local apic
with Windows guests.
- userspace must rely on delivered signals, which are slow, and cannot
use queued signals (a la pselect()/ppoll()).
I propose the following as the new, stable, kvm api:
// open a handle to the kvm interface. does not create a vm.
int kvm_fd = open("/dev/kvm", O_RDWR);
// the kvm interface supports just three ioctls:
ioctl(kvm_fd, KVM_GET_API_VERSION, 0);
ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &msr_list);
int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
// vm ioctls:
ioctl(vm_fd, KVM_VM_CREATE_MEMORY_REGION, &slot);
ioctl(vm_fd, KVM_VM_GET_DIRTY_LOG, &dirty_log);
int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, vcpu_slot_number);
// each vcpu is a separate fd/inode. this ensures no cacheline bouncing
// when the kernel refcounts the inodes on syscalls.
// kvm_vcpu_area contains the exit reasons and associated data, and
// results returned by userspace to resolve the exit reasons.
struct kvm_vcpu_area *vcpu_area = mmap(NULL, PAGE_SIZE, ..., vcpu_fd, 0);
/*
 * Per-vcpu communication area, shared with userspace by mmap()ing the
 * vcpu fd. It carries the exit reason and its associated data from the
 * kernel, and the results userspace returns to resolve that exit.
 *
 * This is a sketch: the set of union members and the padding size are
 * deliberately left open.
 */
struct kvm_vcpu_area {
u32 vcpu_area_size; // NOTE(review): presumably the size of this area in bytes -- confirm
u32 exit_reason; // NOTE(review): presumably an enum kvm_exit_reason value -- confirm
sigset_t sigmask; // for use during vcpu execution
// per-exit payload; the members overlap, so only one is meaningful at a time
union {
struct kvm_pio pio;
struct kvm_mmio mmio;
struct kvm_cpuid cpuid;
// etc.
char padding[...]; // size left unspecified in this proposal
};
struct kvm_irq irq; // acks from vm; injection from userspace
};
// vcpu ioctls
ioctl(vcpu_fd, KVM_VCPU_RUN, 0); // all comms through mmap()ed vcpu_area
ioctl(vcpu_fd, KVM_VCPU_GET_REGS, &regs);
ioctl(vcpu_fd, KVM_VCPU_SET_REGS, &regs);
ioctl(vcpu_fd, KVM_VCPU_GET_SREGS, &sregs);
ioctl(vcpu_fd, KVM_VCPU_SET_SREGS, &sregs);
ioctl(vcpu_fd, KVM_VCPU_GET_MSRS, &msrs);
ioctl(vcpu_fd, KVM_VCPU_SET_MSRS, &msrs);
ioctl(vcpu_fd, KVM_VCPU_DEBUG_GUEST, &debug);
/*
 * for KVM_VM_CREATE_MEMORY_REGION
 *
 * Argument of the vm-level ioctl that creates a guest memory region.
 */
struct kvm_memory_region {
__u32 slot; /* NOTE(review): presumably the index of the memory slot -- confirm */
__u32 flags; /* KVM_MEM_* bits, e.g. KVM_MEM_LOG_DIRTY_PAGES */
__u64 guest_phys_addr; /* NOTE(review): presumably the guest-physical start address -- confirm */
__u64 memory_size; /* bytes */
};
/* for kvm_memory_region::flags */
#define KVM_MEM_LOG_DIRTY_PAGES 1UL
/*
 * NOTE(review): none of the structures shown in this proposal has a field
 * that takes these exit types -- confirm where they are meant to be reported.
 */
#define KVM_EXIT_TYPE_FAIL_ENTRY 1
#define KVM_EXIT_TYPE_VM_EXIT 2
/*
 * Reasons a vcpu run can hand control back to userspace.
 * NOTE(review): presumably the values stored in kvm_vcpu_area::exit_reason
 * -- confirm.
 *
 * Every value is written out explicitly so that inserting or reordering an
 * enumerator can never silently renumber the others.
 */
enum kvm_exit_reason {
	KVM_EXIT_UNKNOWN         = 0,
	KVM_EXIT_EXCEPTION       = 1,
	KVM_EXIT_IO              = 2,
	KVM_EXIT_CPUID           = 3,
	KVM_EXIT_DEBUG           = 4,
	KVM_EXIT_HLT             = 5,
	KVM_EXIT_MMIO            = 6,
	KVM_EXIT_IRQ_WINDOW_OPEN = 7,
	KVM_EXIT_HYPERCALL       = 8,
};
/*
 * for KVM_VCPU_GET_REGS and KVM_VCPU_SET_REGS
 *
 * General-purpose register state of one vcpu. There is no vcpu field:
 * the ioctl is issued on the vcpu's own fd.
 */
struct kvm_regs {
	/* out (KVM_VCPU_GET_REGS) / in (KVM_VCPU_SET_REGS) */
	__u64 rax;
	__u64 rbx;
	__u64 rcx;
	__u64 rdx;

	__u64 rsi;
	__u64 rdi;
	__u64 rsp;
	__u64 rbp;

	__u64 r8;
	__u64 r9;
	__u64 r10;
	__u64 r11;

	__u64 r12;
	__u64 r13;
	__u64 r14;
	__u64 r15;

	__u64 rip;
	__u64 rflags;
};
/*
 * One segment register, as carried in struct kvm_sregs
 * (cs, ds, es, fs, gs, ss, tr and ldt all use this layout).
 */
struct kvm_segment {
	__u64 base;
	__u32 limit;
	__u16 selector;
	__u8 type;
	/* single-byte attribute fields, in this fixed order */
	__u8 present;
	__u8 dpl;
	__u8 db;
	__u8 s;
	__u8 l;
	__u8 g;
	__u8 avl;
	__u8 unusable;
	__u8 padding;
};
/*
 * Descriptor-table register; struct kvm_sregs uses it for gdt and idt.
 */
struct kvm_dtable {
__u64 base;
__u16 limit;
__u16 padding[3]; /* 8 + 2 + 6 bytes: rounds the structure up to 16 bytes */
};
/*
 * for KVM_VCPU_GET_SREGS and KVM_VCPU_SET_SREGS
 *
 * Segment, descriptor-table and control-register state of one vcpu.
 */
struct kvm_sregs {
	/* out (KVM_VCPU_GET_SREGS) / in (KVM_VCPU_SET_SREGS) */
	struct kvm_segment cs;
	struct kvm_segment ds;
	struct kvm_segment es;
	struct kvm_segment fs;
	struct kvm_segment gs;
	struct kvm_segment ss;

	struct kvm_segment tr;
	struct kvm_segment ldt;

	struct kvm_dtable gdt;
	struct kvm_dtable idt;

	__u64 cr0;
	__u64 cr2;
	__u64 cr3;
	__u64 cr4;
	__u64 cr8;
};
/*
 * One MSR index/value pair; the element type of kvm_msrs::entries.
 */
struct kvm_msr_entry {
__u32 index;
__u32 reserved; /* fills the gap so data sits at offset 8; NOTE(review): presumably must be zero -- confirm */
__u64 data;
};
/*
 * for KVM_VCPU_GET_MSRS and KVM_VCPU_SET_MSRS
 *
 * Variable-length request: a fixed header followed by nmsrs entries.
 * The trailing array is a C99 flexible array member instead of the GNU
 * zero-length form, so the definition is valid ISO C;
 * sizeof(struct kvm_msrs) still covers the header only.
 */
struct kvm_msrs {
	__u32 nmsrs; /* number of msrs in entries */
	__u32 padding; /* keeps entries at an 8-byte offset */
	struct kvm_msr_entry entries[];
};
/*
 * for KVM_GET_MSR_INDEX_LIST
 *
 * Variable-length list of MSR indices: a count followed by that many
 * 32-bit indices. The trailing array is a C99 flexible array member
 * instead of the GNU zero-length form; sizeof(struct kvm_msr_list) still
 * covers the count only.
 */
struct kvm_msr_list {
	__u32 nmsrs; /* number of msrs in entries */
	__u32 indices[];
};
/*
 * One breakpoint slot; struct kvm_debug_guest carries an array of these.
 */
struct kvm_breakpoint {
__u32 enabled;
__u32 padding; /* fills the gap so address sits at offset 8 */
__u64 address; /* NOTE(review): presumably a guest virtual address -- confirm */
};
/*
 * for KVM_VCPU_DEBUG_GUEST
 *
 * Guest debugging controls passed to the vcpu-level debug ioctl.
 */
struct kvm_debug_guest {
__u32 enabled;
__u32 singlestep;
struct kvm_breakpoint breakpoints[4]; /* fixed at four slots */
};
/*
 * for KVM_VM_GET_DIRTY_LOG
 *
 * Names the memory slot whose dirty-page bitmap is wanted and the
 * userspace buffer that receives it.
 *
 * Members of an anonymous union share the enclosing struct's member
 * namespace, so the union's filler cannot also be called "padding";
 * it is named padding2 to keep the definition compilable.
 */
struct kvm_dirty_log {
	__u32 slot;
	__u32 padding; /* keeps the union at an 8-byte offset */
	union {
		void __user *dirty_bitmap; /* one bit per page */
		__u64 padding2; /* forces the union to 64 bits regardless of pointer size */
	};
};
Comments and questions are welcome.
Thanks to Arnd Bergmann for his contributions and advice on this issue.
--
error compiling committee.c: too many arguments to function
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
[Index of Archives]
[Kernel Newbies]
[Netfilter]
[Bugtraq]
[Photo]
[Stuff]
[Gimp]
[Yosemite News]
[MIPS Linux]
[ARM Linux]
[Linux Security]
[Linux RAID]
[Video 4 Linux]
[Linux for the blind]
[Linux Resources]