From 172a1491b99d8394dfc0de201f97ea1e6d1be846 Mon Sep 17 00:00:00 2001 From: William Pitcock Date: Thu, 13 Jan 2011 06:57:09 -0600 Subject: testing/linux-xen0: new aport --- testing/linux-xen0/pvops.patch | 37837 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 37837 insertions(+) create mode 100644 testing/linux-xen0/pvops.patch (limited to 'testing/linux-xen0/pvops.patch') diff --git a/testing/linux-xen0/pvops.patch b/testing/linux-xen0/pvops.patch new file mode 100644 index 0000000000..49969705be --- /dev/null +++ b/testing/linux-xen0/pvops.patch @@ -0,0 +1,37837 @@ +diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt +index 5f6aa11..9ec8558 100644 +--- a/Documentation/kernel-parameters.txt ++++ b/Documentation/kernel-parameters.txt +@@ -113,6 +113,7 @@ parameter is applicable: + More X86-64 boot options can be found in + Documentation/x86/x86_64/boot-options.txt . + X86 Either 32bit or 64bit x86 (same as X86-32+X86-64) ++ XEN Xen support is enabled + + In addition, the following text indicates that the option: + +@@ -2760,6 +2761,18 @@ and is between 256 and 4096 characters. It is defined in the file + xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks. + xd_geo= See header of drivers/block/xd.c. 
+ ++ xen_emul_unplug= [HW,X86,XEN] ++ Unplug Xen emulated devices ++ Format: [unplug0,][unplug1] ++ ide-disks -- unplug primary master IDE devices ++ aux-ide-disks -- unplug non-primary-master IDE devices ++ nics -- unplug network devices ++ all -- unplug all emulated devices (NICs and IDE disks) ++ unnecessary -- unplugging emulated devices is ++ unnecessary even if the host did not respond to ++ the unplug protocol ++ never -- do not unplug even if version check succeeds ++ + xirc2ps_cs= [NET,PCMCIA] + Format: + ,,,,,[,[,[,]]] +diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt +index 29a6ff8..81f9b94 100644 +--- a/Documentation/x86/x86_64/boot-options.txt ++++ b/Documentation/x86/x86_64/boot-options.txt +@@ -267,10 +267,14 @@ IOMMU (input/output memory management unit) + + iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU + implementation: +- swiotlb=[,force] ++ swiotlb=[npages=] ++ swiotlb=[force] ++ swiotlb=[overflow=] ++ + Prereserve that many 128K pages for the software IO + bounce buffering. + force Force all IO through the software TLB. ++ Size in bytes of the overflow buffer. 
+ + Settings for the IBM Calgary hardware IOMMU currently found in IBM + pSeries and xSeries machines: +diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h +index 8d3c79c..7d09a09 100644 +--- a/arch/ia64/include/asm/dma-mapping.h ++++ b/arch/ia64/include/asm/dma-mapping.h +@@ -73,7 +73,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) + if (!dev->dma_mask) + return 0; + +- return addr + size <= *dev->dma_mask; ++ return addr + size - 1 <= *dev->dma_mask; + } + + static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +diff --git a/arch/ia64/include/asm/swiotlb.h b/arch/ia64/include/asm/swiotlb.h +index dcbaea7..f0acde6 100644 +--- a/arch/ia64/include/asm/swiotlb.h ++++ b/arch/ia64/include/asm/swiotlb.h +@@ -4,8 +4,6 @@ + #include + #include + +-extern int swiotlb_force; +- + #ifdef CONFIG_SWIOTLB + extern int swiotlb; + extern void pci_swiotlb_init(void); +diff --git a/arch/ia64/include/asm/xen/events.h b/arch/ia64/include/asm/xen/events.h +index b8370c8..baa74c8 100644 +--- a/arch/ia64/include/asm/xen/events.h ++++ b/arch/ia64/include/asm/xen/events.h +@@ -36,10 +36,6 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) + return !(ia64_psr(regs)->i); + } + +-static inline void handle_irq(int irq, struct pt_regs *regs) +-{ +- __do_IRQ(irq); +-} + #define irq_ctx_init(cpu) do { } while (0) + + #endif /* _ASM_IA64_XEN_EVENTS_H */ +diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c +index 285aae8..53292ab 100644 +--- a/arch/ia64/kernel/pci-swiotlb.c ++++ b/arch/ia64/kernel/pci-swiotlb.c +@@ -41,7 +41,7 @@ struct dma_map_ops swiotlb_dma_ops = { + void __init swiotlb_dma_init(void) + { + dma_ops = &swiotlb_dma_ops; +- swiotlb_init(); ++ swiotlb_init(1); + } + + void __init pci_swiotlb_init(void) +@@ -51,7 +51,7 @@ void __init pci_swiotlb_init(void) + swiotlb = 1; + printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n"); + 
machvec_init("dig"); +- swiotlb_init(); ++ swiotlb_init(1); + dma_ops = &swiotlb_dma_ops; + #else + panic("Unable to find Intel IOMMU"); +diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h +index e281dae..80a973b 100644 +--- a/arch/powerpc/include/asm/dma-mapping.h ++++ b/arch/powerpc/include/asm/dma-mapping.h +@@ -197,7 +197,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) + if (!dev->dma_mask) + return 0; + +- return addr + size <= *dev->dma_mask; ++ return addr + size - 1 <= *dev->dma_mask; + } + + static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c +index 53bcf3d..b152de3 100644 +--- a/arch/powerpc/kernel/setup_32.c ++++ b/arch/powerpc/kernel/setup_32.c +@@ -345,7 +345,7 @@ void __init setup_arch(char **cmdline_p) + + #ifdef CONFIG_SWIOTLB + if (ppc_swiotlb_enable) +- swiotlb_init(); ++ swiotlb_init(1); + #endif + + paging_init(); +diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c +index 04f638d..df2c9e9 100644 +--- a/arch/powerpc/kernel/setup_64.c ++++ b/arch/powerpc/kernel/setup_64.c +@@ -550,7 +550,7 @@ void __init setup_arch(char **cmdline_p) + + #ifdef CONFIG_SWIOTLB + if (ppc_swiotlb_enable) +- swiotlb_init(); ++ swiotlb_init(1); + #endif + + paging_init(); +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index cb5a57c..a3b7475 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1885,6 +1885,10 @@ config PCI_OLPC + def_bool y + depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY) + ++config PCI_XEN ++ bool ++ select SWIOTLB ++ + config PCI_DOMAINS + def_bool y + depends on PCI +diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h +index 18aa3f8..4413ba4 100644 +--- a/arch/x86/include/asm/amd_iommu.h ++++ b/arch/x86/include/asm/amd_iommu.h +@@ -23,20 +23,16 @@ + #include + + #ifdef CONFIG_AMD_IOMMU +-extern int 
amd_iommu_init(void); + extern int amd_iommu_init_dma_ops(void); + extern int amd_iommu_init_passthrough(void); + extern void amd_iommu_detect(void); + extern irqreturn_t amd_iommu_int_handler(int irq, void *data); + extern void amd_iommu_flush_all_domains(void); + extern void amd_iommu_flush_all_devices(void); +-extern void amd_iommu_shutdown(void); + extern void amd_iommu_apply_erratum_63(u16 devid); + extern void amd_iommu_init_api(void); + #else +-static inline int amd_iommu_init(void) { return -ENODEV; } + static inline void amd_iommu_detect(void) { } +-static inline void amd_iommu_shutdown(void) { } + #endif + + #endif /* _ASM_X86_AMD_IOMMU_H */ +diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h +index b03bedb..0918654 100644 +--- a/arch/x86/include/asm/calgary.h ++++ b/arch/x86/include/asm/calgary.h +@@ -62,10 +62,8 @@ struct cal_chipset_ops { + extern int use_calgary; + + #ifdef CONFIG_CALGARY_IOMMU +-extern int calgary_iommu_init(void); + extern void detect_calgary(void); + #else +-static inline int calgary_iommu_init(void) { return 1; } + static inline void detect_calgary(void) { return; } + #endif + +diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h +index 6a25d5d..ac91eed 100644 +--- a/arch/x86/include/asm/dma-mapping.h ++++ b/arch/x86/include/asm/dma-mapping.h +@@ -20,7 +20,8 @@ + # define ISA_DMA_BIT_MASK DMA_BIT_MASK(32) + #endif + +-extern dma_addr_t bad_dma_address; ++#define DMA_ERROR_CODE 0 ++ + extern int iommu_merge; + extern struct device x86_dma_fallback_dev; + extern int panic_on_overflow; +@@ -48,7 +49,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) + if (ops->mapping_error) + return ops->mapping_error(dev, dma_addr); + +- return (dma_addr == bad_dma_address); ++ return (dma_addr == DMA_ERROR_CODE); + } + + #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) +@@ -66,7 +67,7 @@ static inline bool dma_capable(struct device 
*dev, dma_addr_t addr, size_t size) + if (!dev->dma_mask) + return 0; + +- return addr + size <= *dev->dma_mask; ++ return addr + size - 1 <= *dev->dma_mask; + } + + static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h +index 40b4e61..fa3fd43 100644 +--- a/arch/x86/include/asm/e820.h ++++ b/arch/x86/include/asm/e820.h +@@ -109,6 +109,8 @@ extern void reserve_early(u64 start, u64 end, char *name); + extern void reserve_early_overlap_ok(u64 start, u64 end, char *name); + extern void free_early(u64 start, u64 end); + extern void early_res_to_bootmem(u64 start, u64 end); ++extern u64 early_res_next_free(u64 start); ++extern u64 early_res_next_reserved(u64 addr, u64 max); + extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); + + extern unsigned long e820_end_of_ram_pfn(void); +diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h +index 6cfdafa..4ac5b0f 100644 +--- a/arch/x86/include/asm/gart.h ++++ b/arch/x86/include/asm/gart.h +@@ -35,8 +35,7 @@ extern int gart_iommu_aperture_allowed; + extern int gart_iommu_aperture_disabled; + + extern void early_gart_iommu_check(void); +-extern void gart_iommu_init(void); +-extern void gart_iommu_shutdown(void); ++extern int gart_iommu_init(void); + extern void __init gart_parse_options(char *); + extern void gart_iommu_hole_init(void); + +@@ -48,12 +47,6 @@ extern void gart_iommu_hole_init(void); + static inline void early_gart_iommu_check(void) + { + } +-static inline void gart_iommu_init(void) +-{ +-} +-static inline void gart_iommu_shutdown(void) +-{ +-} + static inline void gart_parse_options(char *options) + { + } +diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h +index 3251e23..fa152cb 100644 +--- a/arch/x86/include/asm/hpet.h ++++ b/arch/x86/include/asm/hpet.h +@@ -68,6 +68,7 @@ extern unsigned long force_hpet_address; + extern int hpet_force_user; + extern u8 hpet_msi_disable; + 
extern int is_hpet_enabled(void); ++extern int disable_hpet(char *); + extern int hpet_enable(void); + extern void hpet_disable(void); + extern unsigned long hpet_readl(unsigned long a); +@@ -108,6 +109,7 @@ extern void hpet_unregister_irq_handler(rtc_irq_handler handler); + #else /* CONFIG_HPET_TIMER */ + + static inline int hpet_enable(void) { return 0; } ++static inline int disable_hpet(char *s) { return 0; } + static inline int is_hpet_enabled(void) { return 0; } + #define hpet_readl(a) 0 + +diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h +index 439a9ac..bf88684 100644 +--- a/arch/x86/include/asm/hugetlb.h ++++ b/arch/x86/include/asm/hugetlb.h +@@ -36,16 +36,28 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, + free_pgd_range(tlb, addr, end, floor, ceiling); + } + ++static inline pte_t huge_ptep_get(pte_t *ptep) ++{ ++ return *ptep; ++} ++ + static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) + { +- set_pte_at(mm, addr, ptep, pte); ++#if PAGETABLE_LEVELS >= 3 ++ set_pmd((pmd_t *)ptep, native_make_pmd(native_pte_val(pte))); ++#else ++ set_pgd((pgd_t *)ptep, native_make_pgd(native_pte_val(pte))); ++#endif + } + + static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) + { +- return ptep_get_and_clear(mm, addr, ptep); ++ pte_t pte = huge_ptep_get(ptep); ++ ++ set_huge_pte_at(mm, addr, ptep, __pte(0)); ++ return pte; + } + + static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, +@@ -66,19 +78,25 @@ static inline pte_t huge_pte_wrprotect(pte_t pte) + static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) + { +- ptep_set_wrprotect(mm, addr, ptep); ++ pte_t pte = huge_ptep_get(ptep); ++ ++ pte = pte_wrprotect(pte); ++ set_huge_pte_at(mm, addr, ptep, pte); + } + + static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t 
*ptep, + pte_t pte, int dirty) + { +- return ptep_set_access_flags(vma, addr, ptep, pte, dirty); +-} ++ pte_t oldpte = huge_ptep_get(ptep); ++ int changed = !pte_same(oldpte, pte); + +-static inline pte_t huge_ptep_get(pte_t *ptep) +-{ +- return *ptep; ++ if (changed && dirty) { ++ set_huge_pte_at(vma->vm_mm, addr, ptep, pte); ++ flush_tlb_page(vma, addr); ++ } ++ ++ return changed; + } + + static inline int arch_prepare_hugepage(struct page *page) +diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h +index 6a63b86..9ad387e 100644 +--- a/arch/x86/include/asm/io.h ++++ b/arch/x86/include/asm/io.h +@@ -7,6 +7,10 @@ + #include + #include + ++#include ++ ++extern int isapnp_disable; ++ + #define build_mmio_read(name, size, type, reg, barrier) \ + static inline type name(const volatile void __iomem *addr) \ + { type ret; asm volatile("mov" size " %1,%0":reg (ret) \ +@@ -199,6 +203,18 @@ extern void __iomem *early_ioremap(resource_size_t phys_addr, + extern void __iomem *early_memremap(resource_size_t phys_addr, + unsigned long size); + extern void early_iounmap(void __iomem *addr, unsigned long size); ++extern bool is_early_ioremap_ptep(pte_t *ptep); ++ ++#ifdef CONFIG_XEN ++struct bio_vec; ++ ++extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, ++ const struct bio_vec *vec2); ++ ++#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ ++ (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \ ++ (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2))) ++#endif /* CONFIG_XEN */ + + #define IO_SPACE_LIMIT 0xffff + +diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h +index 5f61f6e..b852da9 100644 +--- a/arch/x86/include/asm/io_apic.h ++++ b/arch/x86/include/asm/io_apic.h +@@ -172,6 +172,7 @@ extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); + extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); + + extern void probe_nr_irqs_gsi(void); ++extern int get_nr_irqs_gsi(void); + + extern int 
setup_ioapic_entry(int apic, int irq, + struct IO_APIC_route_entry *entry, +@@ -201,4 +202,6 @@ static inline void probe_nr_irqs_gsi(void) { } + + #endif + ++void xen_io_apic_init(void); ++ + #endif /* _ASM_X86_IO_APIC_H */ +diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h +index fd6d21b..345c99c 100644 +--- a/arch/x86/include/asm/iommu.h ++++ b/arch/x86/include/asm/iommu.h +@@ -1,8 +1,6 @@ + #ifndef _ASM_X86_IOMMU_H + #define _ASM_X86_IOMMU_H + +-extern void pci_iommu_shutdown(void); +-extern void no_iommu_init(void); + extern struct dma_map_ops nommu_dma_ops; + extern int force_iommu, no_iommu; + extern int iommu_detected; +diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h +index 6e90a04..ba4dc7b 100644 +--- a/arch/x86/include/asm/irq_vectors.h ++++ b/arch/x86/include/asm/irq_vectors.h +@@ -120,6 +120,12 @@ + */ + #define MCE_SELF_VECTOR 0xeb + ++#ifdef CONFIG_XEN ++/* Xen vector callback to receive events in a HVM domain */ ++#define XEN_HVM_EVTCHN_CALLBACK 0xe9 ++#endif ++ ++ + /* + * First APIC vector available to drivers: (vectors 0x30-0xee) we + * start at 0x31(0x41) to spread out vectors evenly between priority +@@ -157,6 +163,14 @@ static inline int invalid_vm86_irq(int irq) + #define CPU_VECTOR_LIMIT ( 8 * NR_CPUS ) + #define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS ) + ++#ifndef __ASSEMBLY__ ++# if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SPARSE_IRQ) ++extern int nr_dynamic_irqs; ++# else ++# define NR_DYNAMIC_IRQS 256 ++# endif ++#endif ++ + #ifdef CONFIG_X86_IO_APIC + # ifdef CONFIG_SPARSE_IRQ + # define NR_IRQS \ +@@ -165,13 +179,13 @@ static inline int invalid_vm86_irq(int irq) + (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) + # else + # if NR_CPUS < MAX_IO_APICS +-# define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT) ++# define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT) + NR_DYNAMIC_IRQS + # else +-# define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) ++# define NR_IRQS (NR_VECTORS + 
IO_APIC_VECTOR_LIMIT) + NR_DYNAMIC_IRQS + # endif + # endif + #else /* !CONFIG_X86_IO_APIC: */ +-# define NR_IRQS NR_IRQS_LEGACY ++# define NR_IRQS NR_IRQS_LEGACY + NR_DYNAMIC_IRQS + #endif + + #endif /* _ASM_X86_IRQ_VECTORS_H */ +diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h +index ef51b50..e15fca1 100644 +--- a/arch/x86/include/asm/microcode.h ++++ b/arch/x86/include/asm/microcode.h +@@ -55,4 +55,13 @@ static inline struct microcode_ops * __init init_amd_microcode(void) + } + #endif + ++#ifdef CONFIG_MICROCODE_XEN ++extern struct microcode_ops * __init init_xen_microcode(void); ++#else ++static inline struct microcode_ops * __init init_xen_microcode(void) ++{ ++ return NULL; ++} ++#endif ++ + #endif /* _ASM_X86_MICROCODE_H */ +diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h +index 80a1dee..67eaa91 100644 +--- a/arch/x86/include/asm/mmu.h ++++ b/arch/x86/include/asm/mmu.h +@@ -13,6 +13,9 @@ typedef struct { + int size; + struct mutex lock; + void *vdso; ++#ifdef CONFIG_XEN ++ int has_foreign_mappings; ++#endif + } mm_context_t; + + #ifdef CONFIG_SMP +diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h +index efb3899..e571db4 100644 +--- a/arch/x86/include/asm/paravirt.h ++++ b/arch/x86/include/asm/paravirt.h +@@ -330,11 +330,18 @@ static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) + { + PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g); + } ++ + static inline void set_iopl_mask(unsigned mask) + { + PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask); + } + ++static inline void set_io_bitmap(struct thread_struct *thread, ++ unsigned long bytes_updated) ++{ ++ PVOP_VCALL2(pv_cpu_ops.set_io_bitmap, thread, bytes_updated); ++} ++ + /* The paravirtualized I/O functions */ + static inline void slow_down_io(void) + { +@@ -770,15 +777,28 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) + #define PV_RESTORE_REGS "popl %edx; popl %ecx;" + + 
/* save and restore all caller-save registers, except return value */ +-#define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;" +-#define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;" ++#define __PV_SAVE_ALL_CALLER_REGS "pushl %ecx;" ++#define __PV_RESTORE_ALL_CALLER_REGS "popl %ecx;" ++ ++#ifdef CONFIG_FRAME_POINTER ++#define PV_SAVE_ALL_CALLER_REGS \ ++ "push %ebp;" \ ++ "mov %esp, %ebp;" \ ++ __PV_SAVE_ALL_CALLER_REGS ++#define PV_RESTORE_ALL_CALLER_REGS \ ++ __PV_RESTORE_ALL_CALLER_REGS \ ++ "leave;" ++#else ++#define PV_SAVE_ALL_CALLER_REGS __PV_SAVE_ALL_CALLER_REGS ++#define PV_RESTORE_ALL_CALLER_REGS __PV_RESTORE_ALL_CALLER_REGS ++#endif + + #define PV_FLAGS_ARG "0" + #define PV_EXTRA_CLOBBERS + #define PV_VEXTRA_CLOBBERS + #else + /* save and restore all caller-save registers, except return value */ +-#define PV_SAVE_ALL_CALLER_REGS \ ++#define __PV_SAVE_ALL_CALLER_REGS \ + "push %rcx;" \ + "push %rdx;" \ + "push %rsi;" \ +@@ -787,7 +807,7 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) + "push %r9;" \ + "push %r10;" \ + "push %r11;" +-#define PV_RESTORE_ALL_CALLER_REGS \ ++#define __PV_RESTORE_ALL_CALLER_REGS \ + "pop %r11;" \ + "pop %r10;" \ + "pop %r9;" \ +@@ -797,6 +817,19 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) + "pop %rdx;" \ + "pop %rcx;" + ++#ifdef CONFIG_FRAME_POINTER ++#define PV_SAVE_ALL_CALLER_REGS \ ++ "push %rbp;" \ ++ "mov %rsp, %rbp;" \ ++ __PV_SAVE_ALL_CALLER_REGS ++#define PV_RESTORE_ALL_CALLER_REGS \ ++ __PV_RESTORE_ALL_CALLER_REGS \ ++ "leaveq;" ++#else ++#define PV_SAVE_ALL_CALLER_REGS __PV_SAVE_ALL_CALLER_REGS ++#define PV_RESTORE_ALL_CALLER_REGS __PV_RESTORE_ALL_CALLER_REGS ++#endif ++ + /* We save some registers, but all of them, that's too much. 
We clobber all + * caller saved registers but the argument parameter */ + #define PV_SAVE_REGS "pushq %%rdi;" +diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h +index 9357473..3202dcc 100644 +--- a/arch/x86/include/asm/paravirt_types.h ++++ b/arch/x86/include/asm/paravirt_types.h +@@ -135,6 +135,8 @@ struct pv_cpu_ops { + void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); + + void (*set_iopl_mask)(unsigned mask); ++ void (*set_io_bitmap)(struct thread_struct *thread, ++ unsigned long bytes_updated); + + void (*wbinvd)(void); + void (*io_delay)(void); +diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h +index ada8c20..faa0af1 100644 +--- a/arch/x86/include/asm/pci.h ++++ b/arch/x86/include/asm/pci.h +@@ -21,6 +21,7 @@ struct pci_sysdata { + extern int pci_routeirq; + extern int noioapicquirk; + extern int noioapicreroute; ++extern int pci_scan_all_fns; + + /* scan a bus after allocating a pci_sysdata for it */ + extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, +@@ -49,6 +50,11 @@ extern unsigned int pcibios_assign_all_busses(void); + #define pcibios_assign_all_busses() 0 + #endif + ++static inline int pcibios_scan_all_fns(struct pci_bus *bus, int devfn) ++{ ++ return pci_scan_all_fns; ++} ++ + extern unsigned long pci_mem_start; + #define PCIBIOS_MIN_IO 0x1000 + #define PCIBIOS_MIN_MEM (pci_mem_start) +@@ -87,6 +93,7 @@ extern void pci_iommu_alloc(void); + + /* MSI arch hook */ + #define arch_setup_msi_irqs arch_setup_msi_irqs ++#define arch_teardown_msi_irqs arch_teardown_msi_irqs + + #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) + +@@ -128,6 +135,7 @@ extern void pci_iommu_alloc(void); + #include + + /* generic pci stuff */ ++#define HAVE_ARCH_PCIBIOS_SCAN_ALL_FNS + #include + #define PCIBIOS_MAX_MEM_32 0xffffffff + +diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h +index b399988..30cbf49 100644 +--- a/arch/x86/include/asm/pci_x86.h 
++++ b/arch/x86/include/asm/pci_x86.h +@@ -45,6 +45,7 @@ enum pci_bf_sort_state { + extern unsigned int pcibios_max_latency; + + void pcibios_resource_survey(void); ++void pcibios_set_cache_line_size(void); + + /* pci-pc.c */ + +@@ -106,6 +107,7 @@ extern int pci_direct_probe(void); + extern void pci_direct_init(int type); + extern void pci_pcbios_init(void); + extern int pci_olpc_init(void); ++extern int pci_xen_init(void); + extern void __init dmi_check_pciprobe(void); + extern void __init dmi_check_skip_isa_align(void); + +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index af6fd36..430e3cc 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -15,7 +15,6 @@ + : (prot)) + + #ifndef __ASSEMBLY__ +- + /* + * ZERO_PAGE is a global shared page that is always zero: used + * for zero-mapped memory areas etc.. +@@ -26,6 +25,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; + extern spinlock_t pgd_lock; + extern struct list_head pgd_list; + ++extern struct mm_struct *pgd_page_get_mm(struct page *page); ++ + #ifdef CONFIG_PARAVIRT + #include + #else /* !CONFIG_PARAVIRT */ +@@ -76,6 +77,11 @@ extern struct list_head pgd_list; + + #endif /* CONFIG_PARAVIRT */ + ++static inline pteval_t pte_flags(pte_t pte) ++{ ++ return pte_val(pte) & PTE_FLAGS_MASK; ++} ++ + /* + * The following only work if pte_present() is true. + * Undefined behaviour if not.. 
+@@ -397,6 +403,9 @@ static inline unsigned long pages_to_mb(unsigned long npg) + #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ + remap_pfn_range(vma, vaddr, pfn, size, prot) + ++#define arch_vm_get_page_prot arch_vm_get_page_prot ++extern pgprot_t arch_vm_get_page_prot(unsigned vm_flags); ++ + #if PAGETABLE_LEVELS > 2 + static inline int pud_none(pud_t pud) + { +@@ -616,6 +625,9 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) + memcpy(dst, src, count * sizeof(pgd_t)); + } + ++int create_lookup_pte_addr(struct mm_struct *mm, ++ unsigned long address, ++ uint64_t *ptep); + + #include + #endif /* __ASSEMBLY__ */ +diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h +index c57a301..4e46931 100644 +--- a/arch/x86/include/asm/pgtable_64.h ++++ b/arch/x86/include/asm/pgtable_64.h +@@ -160,7 +160,7 @@ extern void cleanup_highmap(void); + #define pgtable_cache_init() do { } while (0) + #define check_pgt_cache() do { } while (0) + +-#define PAGE_AGP PAGE_KERNEL_NOCACHE ++#define PAGE_AGP PAGE_KERNEL_IO_NOCACHE + #define HAVE_PAGE_AGP 1 + + /* fs/proc/kcore.c */ +diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h +index d1f4a76..a81b0ed 100644 +--- a/arch/x86/include/asm/pgtable_types.h ++++ b/arch/x86/include/asm/pgtable_types.h +@@ -265,11 +265,6 @@ static inline pteval_t native_pte_val(pte_t pte) + return pte.pte; + } + +-static inline pteval_t pte_flags(pte_t pte) +-{ +- return native_pte_val(pte) & PTE_FLAGS_MASK; +-} +- + #define pgprot_val(x) ((x).pgprot) + #define __pgprot(x) ((pgprot_t) { (x) } ) + +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 78bb4d7..2232bd2 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -551,6 +551,9 @@ static inline void native_set_iopl_mask(unsigned mask) + #endif + } + ++extern void native_set_io_bitmap(struct thread_struct *thread, ++ unsigned long 
updated_bytes); ++ + static inline void + native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) + { +@@ -592,6 +595,7 @@ static inline void load_sp0(struct tss_struct *tss, + } + + #define set_iopl_mask native_set_iopl_mask ++#define set_io_bitmap native_set_io_bitmap + #endif /* CONFIG_PARAVIRT */ + + /* +diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h +index 53235fd..daaacab 100644 +--- a/arch/x86/include/asm/pvclock.h ++++ b/arch/x86/include/asm/pvclock.h +@@ -10,5 +10,6 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); + void pvclock_read_wallclock(struct pvclock_wall_clock *wall, + struct pvclock_vcpu_time_info *vcpu, + struct timespec *ts); ++void pvclock_resume(void); + + #endif /* _ASM_X86_PVCLOCK_H */ +diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h +index 18e496c..154a5f1 100644 +--- a/arch/x86/include/asm/setup.h ++++ b/arch/x86/include/asm/setup.h +@@ -95,6 +95,11 @@ void *extend_brk(size_t size, size_t align); + : : "i" (sz)); \ + } + ++/* Helper for reserving space for arrays of things */ ++#define RESERVE_BRK_ARRAY(type, name, entries) \ ++ type *name; \ ++ RESERVE_BRK(name, sizeof(type) * entries) ++ + #ifdef __i386__ + + void __init i386_start_kernel(void); +diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h +index b9e4e20..8085277 100644 +--- a/arch/x86/include/asm/swiotlb.h ++++ b/arch/x86/include/asm/swiotlb.h +@@ -3,15 +3,16 @@ + + #include + +-/* SWIOTLB interface */ +- +-extern int swiotlb_force; +- + #ifdef CONFIG_SWIOTLB + extern int swiotlb; +-extern void pci_swiotlb_init(void); ++extern int __init pci_swiotlb_detect(void); ++extern void __init pci_swiotlb_init(void); + #else + #define swiotlb 0 ++static inline int pci_swiotlb_detect(void) ++{ ++ return 0; ++} + static inline void pci_swiotlb_init(void) + { + } +diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h +index 1bb6e39..ef0fa4d 100644 +--- 
a/arch/x86/include/asm/syscalls.h ++++ b/arch/x86/include/asm/syscalls.h +@@ -33,11 +33,11 @@ long sys_rt_sigreturn(struct pt_regs *); + asmlinkage int sys_set_thread_area(struct user_desc __user *); + asmlinkage int sys_get_thread_area(struct user_desc __user *); + +-/* X86_32 only */ +-#ifdef CONFIG_X86_32 + /* kernel/ioport.c */ +-long sys_iopl(struct pt_regs *); ++asmlinkage long sys_iopl(unsigned int); + ++/* X86_32 only */ ++#ifdef CONFIG_X86_32 + /* kernel/process_32.c */ + int sys_clone(struct pt_regs *); + int sys_execve(struct pt_regs *); +@@ -68,8 +68,6 @@ int sys_vm86(struct pt_regs *); + #else /* CONFIG_X86_32 */ + + /* X86_64 only */ +-/* kernel/ioport.c */ +-asmlinkage long sys_iopl(unsigned int, struct pt_regs *); + + /* kernel/process_64.c */ + asmlinkage long sys_clone(unsigned long, unsigned long, +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 7f3eba0..e4fc8ea 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -89,6 +89,10 @@ static inline void __flush_tlb_one(unsigned long addr) + + #ifndef CONFIG_SMP + ++static inline void __init init_smp_flush(void) ++{ ++} ++ + #define flush_tlb() __flush_tlb() + #define flush_tlb_all() __flush_tlb_all() + #define local_flush_tlb() __flush_tlb() +@@ -129,6 +133,8 @@ static inline void reset_lazy_tlbstate(void) + + #define local_flush_tlb() __flush_tlb() + ++extern void init_smp_flush(void); ++ + extern void flush_tlb_all(void); + extern void flush_tlb_current_task(void); + extern void flush_tlb_mm(struct mm_struct *); +diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h +index 2c756fd..d8e7145 100644 +--- a/arch/x86/include/asm/x86_init.h ++++ b/arch/x86/include/asm/x86_init.h +@@ -91,6 +91,14 @@ struct x86_init_timers { + }; + + /** ++ * struct x86_init_iommu - platform specific iommu setup ++ * @iommu_init: platform specific iommu setup ++ */ ++struct x86_init_iommu { ++ int (*iommu_init)(void); ++}; ++ 
++/** + * struct x86_init_ops - functions for platform specific setup + * + */ +@@ -101,6 +109,7 @@ struct x86_init_ops { + struct x86_init_oem oem; + struct x86_init_paging paging; + struct x86_init_timers timers; ++ struct x86_init_iommu iommu; + }; + + /** +@@ -121,6 +130,7 @@ struct x86_platform_ops { + unsigned long (*calibrate_tsc)(void); + unsigned long (*get_wallclock)(void); + int (*set_wallclock)(unsigned long nowtime); ++ void (*iommu_shutdown)(void); + }; + + extern struct x86_init_ops x86_init; +diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h +index 9c371e4..41c4be0 100644 +--- a/arch/x86/include/asm/xen/hypercall.h ++++ b/arch/x86/include/asm/xen/hypercall.h +@@ -45,6 +45,8 @@ + #include + #include + #include ++#include ++#include + + /* + * The hypercall asms have to meet several constraints: +@@ -200,6 +202,23 @@ extern struct { char _entry[32]; } hypercall_page[]; + (type)__res; \ + }) + ++static inline long ++privcmd_call(unsigned call, ++ unsigned long a1, unsigned long a2, ++ unsigned long a3, unsigned long a4, ++ unsigned long a5) ++{ ++ __HYPERCALL_DECLS; ++ __HYPERCALL_5ARG(a1, a2, a3, a4, a5); ++ ++ asm volatile("call *%[call]" ++ : __HYPERCALL_5PARAM ++ : [call] "a" (&hypercall_page[call]) ++ : __HYPERCALL_CLOBBER5); ++ ++ return (long)__res; ++} ++ + static inline int + HYPERVISOR_set_trap_table(struct trap_info *table) + { +@@ -282,6 +301,20 @@ HYPERVISOR_set_timer_op(u64 timeout) + } + + static inline int ++HYPERVISOR_mca(struct xen_mc *mc_op) ++{ ++ mc_op->interface_version = XEN_MCA_INTERFACE_VERSION; ++ return _hypercall1(int, mca, mc_op); ++} ++ ++static inline int ++HYPERVISOR_dom0_op(struct xen_platform_op *platform_op) ++{ ++ platform_op->interface_version = XENPF_INTERFACE_VERSION; ++ return _hypercall1(int, dom0_op, platform_op); ++} ++ ++static inline int + HYPERVISOR_set_debugreg(int reg, unsigned long value) + { + return _hypercall2(int, set_debugreg, reg, value); +@@ -417,6 +450,12 @@ 
HYPERVISOR_nmi_op(unsigned long op, unsigned long arg) + return _hypercall2(int, nmi_op, op, arg); + } + ++static inline unsigned long __must_check ++HYPERVISOR_hvm_op(int op, void *arg) ++{ ++ return _hypercall2(unsigned long, hvm_op, op, arg); ++} ++ + static inline void + MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) + { +@@ -424,6 +463,14 @@ MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) + mcl->args[0] = set; + } + ++#if defined(CONFIG_X86_64) ++#define MULTI_UVMFLAGS_INDEX 2 ++#define MULTI_UVMDOMID_INDEX 3 ++#else ++#define MULTI_UVMFLAGS_INDEX 3 ++#define MULTI_UVMDOMID_INDEX 4 ++#endif ++ + static inline void + MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, + pte_t new_val, unsigned long flags) +@@ -432,12 +479,11 @@ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, + mcl->args[0] = va; + if (sizeof(new_val) == sizeof(long)) { + mcl->args[1] = new_val.pte; +- mcl->args[2] = flags; + } else { + mcl->args[1] = new_val.pte; + mcl->args[2] = new_val.pte >> 32; +- mcl->args[3] = flags; + } ++ mcl->args[MULTI_UVMFLAGS_INDEX] = flags; + } + + static inline void +diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h +index d5b7e90..396ff4c 100644 +--- a/arch/x86/include/asm/xen/hypervisor.h ++++ b/arch/x86/include/asm/xen/hypervisor.h +@@ -37,31 +37,4 @@ + extern struct shared_info *HYPERVISOR_shared_info; + extern struct start_info *xen_start_info; + +-enum xen_domain_type { +- XEN_NATIVE, /* running on bare hardware */ +- XEN_PV_DOMAIN, /* running in a PV domain */ +- XEN_HVM_DOMAIN, /* running in a Xen hvm domain */ +-}; +- +-#ifdef CONFIG_XEN +-extern enum xen_domain_type xen_domain_type; +-#else +-#define xen_domain_type XEN_NATIVE +-#endif +- +-#define xen_domain() (xen_domain_type != XEN_NATIVE) +-#define xen_pv_domain() (xen_domain() && \ +- xen_domain_type == XEN_PV_DOMAIN) +-#define xen_hvm_domain() (xen_domain() && \ +- xen_domain_type == 
XEN_HVM_DOMAIN) +- +-#ifdef CONFIG_XEN_DOM0 +-#include +- +-#define xen_initial_domain() (xen_pv_domain() && \ +- xen_start_info->flags & SIF_INITDOMAIN) +-#else /* !CONFIG_XEN_DOM0 */ +-#define xen_initial_domain() (0) +-#endif /* CONFIG_XEN_DOM0 */ +- + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ +diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h +index e8506c1..9539998 100644 +--- a/arch/x86/include/asm/xen/interface.h ++++ b/arch/x86/include/asm/xen/interface.h +@@ -61,9 +61,9 @@ DEFINE_GUEST_HANDLE(void); + #define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) + #endif + +-#ifndef machine_to_phys_mapping +-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) +-#endif ++#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START) ++#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END) ++#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT) + + /* Maximum number of virtual CPUs in multi-processor guests. */ + #define MAX_VIRT_CPUS 32 +@@ -97,6 +97,8 @@ DEFINE_GUEST_HANDLE(void); + #define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2)) + + #ifndef __ASSEMBLY__ ++#include ++ + struct trap_info { + uint8_t vector; /* exception vector */ + uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */ +diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h +index 42a7e00..8413688 100644 +--- a/arch/x86/include/asm/xen/interface_32.h ++++ b/arch/x86/include/asm/xen/interface_32.h +@@ -32,6 +32,11 @@ + /* And the trap vector is... */ + #define TRAP_INSTR "int $0x82" + ++#define __MACH2PHYS_VIRT_START 0xF5800000 ++#define __MACH2PHYS_VIRT_END 0xF6800000 ++ ++#define __MACH2PHYS_SHIFT 2 ++ + /* + * Virtual addresses beyond this are not modifiable by guest OSes. The + * machine->physical mapping table starts at this address, read-only. 
+diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h +index 100d266..839a481 100644 +--- a/arch/x86/include/asm/xen/interface_64.h ++++ b/arch/x86/include/asm/xen/interface_64.h +@@ -39,18 +39,7 @@ + #define __HYPERVISOR_VIRT_END 0xFFFF880000000000 + #define __MACH2PHYS_VIRT_START 0xFFFF800000000000 + #define __MACH2PHYS_VIRT_END 0xFFFF804000000000 +- +-#ifndef HYPERVISOR_VIRT_START +-#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) +-#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END) +-#endif +- +-#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START) +-#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END) +-#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3) +-#ifndef machine_to_phys_mapping +-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) +-#endif ++#define __MACH2PHYS_SHIFT 3 + + /* + * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base) +diff --git a/arch/x86/include/asm/xen/iommu.h b/arch/x86/include/asm/xen/iommu.h +new file mode 100644 +index 0000000..75df312 +--- /dev/null ++++ b/arch/x86/include/asm/xen/iommu.h +@@ -0,0 +1,12 @@ ++#ifndef ASM_X86__XEN_IOMMU_H ++ ++#ifdef CONFIG_PCI_XEN ++extern void xen_iommu_init(void); ++#else ++static inline void xen_iommu_init(void) ++{ ++} ++#endif ++ ++#endif ++ +diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h +index 018a0a4..05c5cf5 100644 +--- a/arch/x86/include/asm/xen/page.h ++++ b/arch/x86/include/asm/xen/page.h +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -28,23 +29,32 @@ typedef struct xpaddr { + + /**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ + #define INVALID_P2M_ENTRY (~0UL) +-#define FOREIGN_FRAME_BIT (1UL<<31) ++#define FOREIGN_FRAME_BIT (1UL << (sizeof(unsigned long) * 8 - 1)) + #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) + + /* Maximum 
amount of memory we can handle in a domain in pages */ + #define MAX_DOMAIN_PAGES \ + ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) + ++extern unsigned long *machine_to_phys_mapping; ++extern unsigned int machine_to_phys_order; + + extern unsigned long get_phys_to_machine(unsigned long pfn); +-extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn); ++extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); + + static inline unsigned long pfn_to_mfn(unsigned long pfn) + { ++ unsigned long mfn; ++ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return pfn; + +- return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT; ++ mfn = get_phys_to_machine(pfn); ++ ++ if (mfn != INVALID_P2M_ENTRY) ++ mfn &= ~FOREIGN_FRAME_BIT; ++ ++ return mfn; + } + + static inline int phys_to_machine_mapping_valid(unsigned long pfn) +@@ -62,10 +72,8 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) + if (xen_feature(XENFEAT_auto_translated_physmap)) + return mfn; + +-#if 0 + if (unlikely((mfn >> machine_to_phys_order) != 0)) +- return max_mapnr; +-#endif ++ return ~0; + + pfn = 0; + /* +@@ -112,13 +120,9 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) + */ + static inline unsigned long mfn_to_local_pfn(unsigned long mfn) + { +- extern unsigned long max_mapnr; + unsigned long pfn = mfn_to_pfn(mfn); +- if ((pfn < max_mapnr) +- && !xen_feature(XENFEAT_auto_translated_physmap) +- && (get_phys_to_machine(pfn) != mfn)) +- return max_mapnr; /* force !pfn_valid() */ +- /* XXX fixme; not true with sparsemem */ ++ if (get_phys_to_machine(pfn) != mfn) ++ return -1; /* force !pfn_valid() */ + return pfn; + } + +@@ -163,6 +167,7 @@ static inline pte_t __pte_ma(pteval_t x) + + #define pgd_val_ma(x) ((x).pgd) + ++void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); + + xmaddr_t arbitrary_virt_to_machine(void *address); + unsigned long arbitrary_virt_to_mfn(void *vaddr); +diff --git 
a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h +new file mode 100644 +index 0000000..6683196 +--- /dev/null ++++ b/arch/x86/include/asm/xen/pci.h +@@ -0,0 +1,104 @@ ++#ifndef _ASM_X86_XEN_PCI_H ++#define _ASM_X86_XEN_PCI_H ++ ++#if defined(CONFIG_PCI_MSI) ++#if defined(CONFIG_PCI_XEN) ++int xen_register_pirq(u32 gsi, int triggering); ++int xen_register_gsi(u32 gsi, int triggering, int polarity); ++int xen_create_msi_irq(struct pci_dev *dev, ++ struct msi_desc *msidesc, ++ int type); ++void xen_pci_teardown_msi_dev(struct pci_dev *dev); ++void xen_pci_teardown_msi_irq(int irq); ++int xen_pci_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); ++ ++/* The drivers/pci/xen-pcifront.c sets this structure to ++ * its own functions. ++ */ ++struct xen_pci_frontend_ops { ++ int (*enable_msi)(struct pci_dev *dev, int **vectors); ++ void (*disable_msi)(struct pci_dev *dev); ++ int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec); ++ void (*disable_msix)(struct pci_dev *dev); ++}; ++ ++extern struct xen_pci_frontend_ops *xen_pci_frontend; ++ ++static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev, ++ int **vectors) ++{ ++ if (xen_pci_frontend && xen_pci_frontend->enable_msi) ++ return xen_pci_frontend->enable_msi(dev, vectors); ++ return -ENODEV; ++} ++static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev) ++{ ++ if (xen_pci_frontend && xen_pci_frontend->disable_msi) ++ xen_pci_frontend->disable_msi(dev); ++} ++static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev, ++ int **vectors, int nvec) ++{ ++ if (xen_pci_frontend && xen_pci_frontend->enable_msix) ++ return xen_pci_frontend->enable_msix(dev, vectors, nvec); ++ return -ENODEV; ++} ++static inline void xen_pci_frontend_disable_msix(struct pci_dev *dev) ++{ ++ if (xen_pci_frontend && xen_pci_frontend->disable_msix) ++ xen_pci_frontend->disable_msix(dev); ++} ++#else ++static inline int xen_create_msi_irq(struct pci_dev *dev, ++ struct msi_desc 
*msidesc, ++ int type) ++{ ++ return -1; ++} ++static inline void xen_pci_teardown_msi_dev(struct pci_dev *dev) { } ++static inline void xen_pci_teardown_msi_irq(int irq) { } ++static inline int xen_pci_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) ++{ ++ return -ENODEV; ++} ++#endif /* CONFIG_PCI_XEN */ ++ ++#endif /* CONFIG_PCI_MSI */ ++ ++#ifdef CONFIG_XEN_DOM0_PCI ++int xen_register_gsi(u32 gsi, int triggering, int polarity); ++int xen_find_device_domain_owner(struct pci_dev *dev); ++int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain); ++int xen_unregister_device_domain_owner(struct pci_dev *dev); ++ ++#else ++static inline int xen_register_gsi(u32 gsi, int triggering, int polarity) ++{ ++ return -1; ++} ++ ++static inline int xen_find_device_domain_owner(struct pci_dev *dev) ++{ ++ return -1; ++} ++static inline int xen_register_device_domain_owner(struct pci_dev *dev, ++ uint16_t domain) ++{ ++ return -1; ++} ++static inline int xen_unregister_device_domain_owner(struct pci_dev *dev) ++{ ++ return -1; ++} ++#endif ++ ++#if defined(CONFIG_PCI_MSI) && defined(CONFIG_XEN_DOM0_PCI) ++int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); ++#else ++static inline int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) ++{ ++ return -1; ++} ++#endif ++ ++#endif /* _ASM_X86_XEN_PCI_H */ +diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h +new file mode 100644 +index 0000000..e4fe299 +--- /dev/null ++++ b/arch/x86/include/asm/xen/swiotlb-xen.h +@@ -0,0 +1,14 @@ ++#ifndef _ASM_X86_SWIOTLB_XEN_H ++#define _ASM_X86_SWIOTLB_XEN_H ++ ++#ifdef CONFIG_PCI_XEN ++extern int xen_swiotlb; ++extern int __init pci_xen_swiotlb_detect(void); ++extern void __init pci_xen_swiotlb_init(void); ++#else ++#define xen_swiotlb 0 ++static inline int __init pci_xen_swiotlb_detect(void) { return 0; } ++static inline void __init pci_xen_swiotlb_init(void) { } ++#endif ++ ++#endif /* 
_ASM_X86_SWIOTLB_XEN_H */ +diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile +index d1911ab..cfe00bc 100644 +--- a/arch/x86/kernel/Makefile ++++ b/arch/x86/kernel/Makefile +@@ -113,6 +113,7 @@ obj-$(CONFIG_X86_MRST) += mrst.o + microcode-y := microcode_core.o + microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o + microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o ++microcode-$(CONFIG_MICROCODE_XEN) += microcode_xen.o + obj-$(CONFIG_MICROCODE) += microcode.o + + obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o +diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c +index 23c2da8..a2a5125 100644 +--- a/arch/x86/kernel/acpi/boot.c ++++ b/arch/x86/kernel/acpi/boot.c +@@ -42,6 +42,10 @@ + #include + #include + ++#include ++ ++#include ++ + static int __initdata acpi_force = 0; + u32 acpi_rsdt_forced; + int acpi_disabled; +@@ -149,6 +153,10 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled) + { + unsigned int ver = 0; + ++ /* We don't want to register lapics when in Xen dom0 */ ++ if (xen_initial_domain()) ++ return; ++ + if (!enabled) { + ++disabled_cpus; + return; +@@ -461,9 +469,13 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) + */ + int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) + { +- unsigned int irq; ++ int irq; + unsigned int plat_gsi = gsi; + ++ irq = xen_register_gsi(gsi, trigger, polarity); ++ if (irq >= 0) ++ return irq; ++ + #ifdef CONFIG_PCI + /* + * Make sure all (legacy) PCI IRQs are set as level-triggered. 
+@@ -740,6 +752,10 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) + + static void __init acpi_register_lapic_address(unsigned long address) + { ++ /* Xen dom0 doesn't have usable lapics */ ++ if (xen_initial_domain()) ++ return; ++ + mp_lapic_addr = address; + + set_fixmap_nocache(FIX_APIC_BASE, address); +@@ -860,6 +876,9 @@ int __init acpi_probe_gsi(void) + max_gsi = gsi; + } + ++ if (xen_initial_domain()) ++ max_gsi += 255; /* Plus maximum entries of an ioapic. */ ++ + return max_gsi + 1; + } + +diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c +index d85d1b2..8aabedd 100644 +--- a/arch/x86/kernel/acpi/processor.c ++++ b/arch/x86/kernel/acpi/processor.c +@@ -12,6 +12,8 @@ + #include + #include + ++#include ++ + static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) + { + struct acpi_object_list *obj_list; +@@ -59,7 +61,7 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) + /* + * If mwait/monitor is unsupported, C2/C3_FFH will be disabled + */ +- if (!cpu_has(c, X86_FEATURE_MWAIT)) ++ if (!cpu_has(c, X86_FEATURE_MWAIT) && !xen_initial_domain()) + buf[2] &= ~(ACPI_PDC_C_C2C3_FFH); + + obj->type = ACPI_TYPE_BUFFER; +@@ -88,6 +90,19 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr) + + EXPORT_SYMBOL(arch_acpi_processor_init_pdc); + ++/* Initialize _PDC data based on the CPU vendor */ ++void xen_arch_acpi_processor_init_pdc(struct acpi_processor *pr) ++{ ++ struct cpuinfo_x86 *c = &cpu_data(0); ++ ++ pr->pdc = NULL; ++ if (c->x86_vendor == X86_VENDOR_INTEL) ++ init_intel_pdc(pr, c); ++ ++ return; ++} ++EXPORT_SYMBOL(xen_arch_acpi_processor_init_pdc); ++ + void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr) + { + if (pr->pdc) { +diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c +index ca93638..9eff23c 100644 +--- a/arch/x86/kernel/acpi/sleep.c ++++ b/arch/x86/kernel/acpi/sleep.c +@@ -12,6 +12,8 @@ + #include + 
#include + ++#include ++ + #include "realmode/wakeup.h" + #include "sleep.h" + +diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c +index 7cd33f7..b8497c6 100644 +--- a/arch/x86/kernel/amd_iommu.c ++++ b/arch/x86/kernel/amd_iommu.c +@@ -928,7 +928,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, + } + + if (unlikely(address == -1)) +- address = bad_dma_address; ++ address = DMA_ERROR_CODE; + + WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); + +@@ -1545,7 +1545,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, + + pte = dma_ops_get_pte(dom, address); + if (!pte) +- return bad_dma_address; ++ return DMA_ERROR_CODE; + + __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; + +@@ -1626,7 +1626,7 @@ static dma_addr_t __map_single(struct device *dev, + retry: + address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, + dma_mask); +- if (unlikely(address == bad_dma_address)) { ++ if (unlikely(address == DMA_ERROR_CODE)) { + /* + * setting next_address here will let the address + * allocator only scan the new allocated range in the +@@ -1647,7 +1647,7 @@ retry: + start = address; + for (i = 0; i < pages; ++i) { + ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); +- if (ret == bad_dma_address) ++ if (ret == DMA_ERROR_CODE) + goto out_unmap; + + paddr += PAGE_SIZE; +@@ -1675,7 +1675,7 @@ out_unmap: + + dma_ops_free_addresses(dma_dom, address, pages); + +- return bad_dma_address; ++ return DMA_ERROR_CODE; + } + + /* +@@ -1692,7 +1692,7 @@ static void __unmap_single(struct amd_iommu *iommu, + dma_addr_t i, start; + unsigned int pages; + +- if ((dma_addr == bad_dma_address) || ++ if ((dma_addr == DMA_ERROR_CODE) || + (dma_addr + size > dma_dom->aperture_size)) + return; + +@@ -1735,7 +1735,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page, + INC_STATS_COUNTER(cnt_map_single); + + if (!check_device(dev)) +- return bad_dma_address; ++ return DMA_ERROR_CODE; + + dma_mask = 
*dev->dma_mask; + +@@ -1746,12 +1746,12 @@ static dma_addr_t map_page(struct device *dev, struct page *page, + return (dma_addr_t)paddr; + + if (!dma_ops_domain(domain)) +- return bad_dma_address; ++ return DMA_ERROR_CODE; + + spin_lock_irqsave(&domain->lock, flags); + addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, + dma_mask); +- if (addr == bad_dma_address) ++ if (addr == DMA_ERROR_CODE) + goto out; + + iommu_completion_wait(iommu); +@@ -1960,7 +1960,7 @@ static void *alloc_coherent(struct device *dev, size_t size, + *dma_addr = __map_single(dev, iommu, domain->priv, paddr, + size, DMA_BIDIRECTIONAL, true, dma_mask); + +- if (*dma_addr == bad_dma_address) { ++ if (*dma_addr == DMA_ERROR_CODE) { + spin_unlock_irqrestore(&domain->lock, flags); + goto out_free; + } +@@ -2122,8 +2122,7 @@ int __init amd_iommu_init_dma_ops(void) + prealloc_protection_domains(); + + iommu_detected = 1; +- force_iommu = 1; +- bad_dma_address = 0; ++ swiotlb = 0; + #ifdef CONFIG_GART_IOMMU + gart_iommu_aperture_disabled = 1; + gart_iommu_aperture = 0; +diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c +index 400be99..0069df5 100644 +--- a/arch/x86/kernel/amd_iommu_init.c ++++ b/arch/x86/kernel/amd_iommu_init.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + /* + * definitions for the ACPI scanning code +@@ -1206,19 +1207,10 @@ static struct sys_device device_amd_iommu = { + * functions. Finally it prints some information about AMD IOMMUs and + * the driver state and enables the hardware. + */ +-int __init amd_iommu_init(void) ++static int __init amd_iommu_init(void) + { + int i, ret = 0; + +- +- if (no_iommu) { +- printk(KERN_INFO "AMD-Vi disabled by kernel command line\n"); +- return 0; +- } +- +- if (!amd_iommu_detected) +- return -ENODEV; +- + /* + * First parse ACPI tables to find the largest Bus/Dev/Func + * we need to handle. 
Upon this information the shared data +@@ -1333,6 +1325,7 @@ int __init amd_iommu_init(void) + else + printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); + ++ x86_platform.iommu_shutdown = disable_iommus; + out: + return ret; + +@@ -1361,11 +1354,6 @@ free: + goto out; + } + +-void amd_iommu_shutdown(void) +-{ +- disable_iommus(); +-} +- + /**************************************************************************** + * + * Early detect code. This code runs at IOMMU detection time in the DMA +@@ -1380,16 +1368,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table) + + void __init amd_iommu_detect(void) + { +- if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) ++ if (no_iommu || (iommu_detected && !gart_iommu_aperture)) + return; + + if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { + iommu_detected = 1; + amd_iommu_detected = 1; +-#ifdef CONFIG_GART_IOMMU +- gart_iommu_aperture_disabled = 1; +- gart_iommu_aperture = 0; +-#endif ++ x86_init.iommu.iommu_init = amd_iommu_init; + } + } + +diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c +index 082089e..8d34362 100644 +--- a/arch/x86/kernel/aperture_64.c ++++ b/arch/x86/kernel/aperture_64.c +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + + int gart_iommu_aperture; + int gart_iommu_aperture_disabled __initdata; +@@ -401,6 +402,7 @@ void __init gart_iommu_hole_init(void) + + iommu_detected = 1; + gart_iommu_aperture = 1; ++ x86_init.iommu.iommu_init = gart_iommu_init; + + ctl = read_pci_config(bus, slot, 3, + AMD64_GARTAPERTURECTL); +@@ -469,7 +471,7 @@ out: + + if (aper_alloc) { + /* Got the aperture from the AGP bridge */ +- } else if (swiotlb && !valid_agp) { ++ } else if (!valid_agp) { + /* Do nothing */ + } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || + force_iommu || +diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c +index 8928d97..4848d5d 100644 +--- 
a/arch/x86/kernel/apic/io_apic.c ++++ b/arch/x86/kernel/apic/io_apic.c +@@ -63,7 +63,12 @@ + #include + #include + ++#include + #include ++#include ++#include ++ ++#include + + #define __apicdebuginit(type) static type __init + #define for_each_irq_pin(entry, head) \ +@@ -395,14 +400,18 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector) + + static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) + { +- struct io_apic __iomem *io_apic = io_apic_base(apic); ++ struct io_apic __iomem *io_apic; ++ ++ io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + return readl(&io_apic->data); + } + + static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) + { +- struct io_apic __iomem *io_apic = io_apic_base(apic); ++ struct io_apic __iomem *io_apic; ++ ++ io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + writel(value, &io_apic->data); + } +@@ -415,7 +424,9 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i + */ + static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) + { +- struct io_apic __iomem *io_apic = io_apic_base(apic); ++ struct io_apic __iomem *io_apic; ++ ++ io_apic = io_apic_base(apic); + + if (sis_apic_bug) + writel(reg, &io_apic->index); +@@ -3494,6 +3505,9 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + ++ if (xen_pv_domain()) ++ return xen_pci_setup_msi_irqs(dev, nvec, type); ++ + node = dev_to_node(&dev->dev); + irq_want = nr_irqs_gsi; + sub_handle = 0; +@@ -3543,7 +3557,29 @@ error: + + void arch_teardown_msi_irq(unsigned int irq) + { +- destroy_irq(irq); ++ if (xen_domain()) ++ xen_pci_teardown_msi_irq(irq); ++ else ++ destroy_irq(irq); ++} ++ ++void arch_teardown_msi_irqs(struct pci_dev *dev) ++{ ++ struct msi_desc *entry; ++ ++ /* If we are a non-privileged PV domain, we have ++ * to call xen_pci_teardown_msi_dev 
first. */ ++ if (xen_domain()) ++ xen_pci_teardown_msi_dev(dev); ++ ++ list_for_each_entry(entry, &dev->msi_list, list) { ++ int i, nvec; ++ if (entry->irq == 0) ++ continue; ++ nvec = 1 << entry->msi_attrib.multiple; ++ for (i = 0; i < nvec; i++) ++ arch_teardown_msi_irq(entry->irq + i); ++ } + } + + #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) +@@ -3860,7 +3896,14 @@ void __init probe_nr_irqs_gsi(void) + printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); + } + ++int get_nr_irqs_gsi(void) ++{ ++ return nr_irqs_gsi; ++} ++ + #ifdef CONFIG_SPARSE_IRQ ++int nr_dynamic_irqs; ++ + int __init arch_probe_nr_irqs(void) + { + int nr; +@@ -3878,6 +3921,8 @@ int __init arch_probe_nr_irqs(void) + if (nr < nr_irqs) + nr_irqs = nr; + ++ nr_irqs += nr_dynamic_irqs; ++ + return 0; + } + #endif +diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c +index 7ff61d6..d1e6e60 100644 +--- a/arch/x86/kernel/apic/nmi.c ++++ b/arch/x86/kernel/apic/nmi.c +@@ -558,6 +558,9 @@ void arch_trigger_all_cpu_backtrace(void) + { + int i; + ++ if (!cpu_has_apic) ++ return; ++ + cpumask_copy(&backtrace_mask, cpu_online_mask); + + printk(KERN_INFO "sending NMI to all CPUs:\n"); +diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile +index f4361b5..404e458 100644 +--- a/arch/x86/kernel/cpu/mtrr/Makefile ++++ b/arch/x86/kernel/cpu/mtrr/Makefile +@@ -1,3 +1,4 @@ + obj-y := main.o if.o generic.o state.o cleanup.o + obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o ++obj-$(CONFIG_XEN_DOM0) += xen.o + +diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c +index 33af141..378f8dc 100644 +--- a/arch/x86/kernel/cpu/mtrr/amd.c ++++ b/arch/x86/kernel/cpu/mtrr/amd.c +@@ -108,6 +108,11 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) + return 0; + } + ++static int amd_num_var_ranges(void) ++{ ++ return 2; ++} ++ + static struct mtrr_ops amd_mtrr_ops = { + .vendor = X86_VENDOR_AMD, + .set = 
amd_set_mtrr, +@@ -115,6 +120,7 @@ static struct mtrr_ops amd_mtrr_ops = { + .get_free_region = generic_get_free_region, + .validate_add_page = amd_validate_add_page, + .have_wrcomb = positive_have_wrcomb, ++ .num_var_ranges = amd_num_var_ranges, + }; + + int __init amd_init_mtrr(void) +diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c +index de89f14..7c686a0 100644 +--- a/arch/x86/kernel/cpu/mtrr/centaur.c ++++ b/arch/x86/kernel/cpu/mtrr/centaur.c +@@ -110,6 +110,11 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t + return 0; + } + ++static int centaur_num_var_ranges(void) ++{ ++ return 8; ++} ++ + static struct mtrr_ops centaur_mtrr_ops = { + .vendor = X86_VENDOR_CENTAUR, + .set = centaur_set_mcr, +@@ -117,6 +122,7 @@ static struct mtrr_ops centaur_mtrr_ops = { + .get_free_region = centaur_get_free_region, + .validate_add_page = centaur_validate_add_page, + .have_wrcomb = positive_have_wrcomb, ++ .num_var_ranges = centaur_num_var_ranges, + }; + + int __init centaur_init_mtrr(void) +diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c +index 228d982..fd6edcc 100644 +--- a/arch/x86/kernel/cpu/mtrr/cyrix.c ++++ b/arch/x86/kernel/cpu/mtrr/cyrix.c +@@ -265,6 +265,11 @@ static void cyrix_set_all(void) + post_set(); + } + ++static int cyrix_num_var_ranges(void) ++{ ++ return 8; ++} ++ + static struct mtrr_ops cyrix_mtrr_ops = { + .vendor = X86_VENDOR_CYRIX, + .set_all = cyrix_set_all, +@@ -273,6 +278,7 @@ static struct mtrr_ops cyrix_mtrr_ops = { + .get_free_region = cyrix_get_free_region, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = positive_have_wrcomb, ++ .num_var_ranges = cyrix_num_var_ranges, + }; + + int __init cyrix_init_mtrr(void) +diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c +index 55da0c5..42f30cd 100644 +--- a/arch/x86/kernel/cpu/mtrr/generic.c ++++ b/arch/x86/kernel/cpu/mtrr/generic.c +@@ -749,8 
+749,16 @@ int positive_have_wrcomb(void) + return 1; + } + +-/* +- * Generic structure... ++static int generic_num_var_ranges(void) ++{ ++ unsigned long config = 0, dummy; ++ ++ rdmsr(MSR_MTRRcap, config, dummy); ++ ++ return config & 0xff; ++} ++ ++/* generic structure... + */ + struct mtrr_ops generic_mtrr_ops = { + .use_intel_if = 1, +@@ -760,4 +768,5 @@ struct mtrr_ops generic_mtrr_ops = { + .set = generic_set_mtrr, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = generic_have_wrcomb, ++ .num_var_ranges = generic_num_var_ranges, + }; +diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c +index 84e83de..c8cb9ed 100644 +--- a/arch/x86/kernel/cpu/mtrr/main.c ++++ b/arch/x86/kernel/cpu/mtrr/main.c +@@ -110,21 +110,6 @@ static int have_wrcomb(void) + return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0; + } + +-/* This function returns the number of variable MTRRs */ +-static void __init set_num_var_ranges(void) +-{ +- unsigned long config = 0, dummy; +- +- if (use_intel()) +- rdmsr(MSR_MTRRcap, config, dummy); +- else if (is_cpu(AMD)) +- config = 2; +- else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) +- config = 8; +- +- num_var_ranges = config & 0xff; +-} +- + static void __init init_table(void) + { + int i, max; +@@ -711,8 +696,11 @@ void __init mtrr_bp_init(void) + } + } + ++ /* Let Xen code override the above if it wants */ ++ xen_init_mtrr(); ++ + if (mtrr_if) { +- set_num_var_ranges(); ++ num_var_ranges = mtrr_if->num_var_ranges(); + init_table(); + if (use_intel()) { + get_mtrr_state(); +diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h +index a501dee..98569c3 100644 +--- a/arch/x86/kernel/cpu/mtrr/mtrr.h ++++ b/arch/x86/kernel/cpu/mtrr/mtrr.h +@@ -5,6 +5,8 @@ + #include + #include + ++#include ++ + #define MTRR_CHANGE_MASK_FIXED 0x01 + #define MTRR_CHANGE_MASK_VARIABLE 0x02 + #define MTRR_CHANGE_MASK_DEFTYPE 0x04 +@@ -25,6 +27,8 @@ struct mtrr_ops { + int (*validate_add_page)(unsigned 
long base, unsigned long size, + unsigned int type); + int (*have_wrcomb)(void); ++ ++ int (*num_var_ranges)(void); + }; + + extern int generic_get_free_region(unsigned long base, unsigned long size, +@@ -73,6 +77,13 @@ void mtrr_wrmsr(unsigned, unsigned, unsigned); + int amd_init_mtrr(void); + int cyrix_init_mtrr(void); + int centaur_init_mtrr(void); ++#ifdef CONFIG_XEN_DOM0 ++void xen_init_mtrr(void); ++#else ++static inline void xen_init_mtrr(void) ++{ ++} ++#endif + + extern int changed_by_mtrr_cleanup; + extern int mtrr_cleanup(unsigned address_bits); +diff --git a/arch/x86/kernel/cpu/mtrr/xen.c b/arch/x86/kernel/cpu/mtrr/xen.c +new file mode 100644 +index 0000000..852018b +--- /dev/null ++++ b/arch/x86/kernel/cpu/mtrr/xen.c +@@ -0,0 +1,109 @@ ++#include ++#include ++ ++#include ++ ++#include "mtrr.h" ++ ++#include ++#include ++#include ++#include ++ ++static void xen_set_mtrr(unsigned int reg, unsigned long base, ++ unsigned long size, mtrr_type type) ++{ ++ struct xen_platform_op op; ++ int error; ++ ++ /* mtrr_ops->set() is called once per CPU, ++ * but Xen's ops apply to all CPUs. 
++ */ ++ if (smp_processor_id()) ++ return; ++ ++ if (size == 0) { ++ op.cmd = XENPF_del_memtype; ++ op.u.del_memtype.handle = 0; ++ op.u.del_memtype.reg = reg; ++ } else { ++ op.cmd = XENPF_add_memtype; ++ op.u.add_memtype.mfn = base; ++ op.u.add_memtype.nr_mfns = size; ++ op.u.add_memtype.type = type; ++ } ++ ++ error = HYPERVISOR_dom0_op(&op); ++ BUG_ON(error != 0); ++} ++ ++static void xen_get_mtrr(unsigned int reg, unsigned long *base, ++ unsigned long *size, mtrr_type *type) ++{ ++ struct xen_platform_op op; ++ ++ op.cmd = XENPF_read_memtype; ++ op.u.read_memtype.reg = reg; ++ if (HYPERVISOR_dom0_op(&op) != 0) { ++ *base = 0; ++ *size = 0; ++ *type = 0; ++ return; ++ } ++ ++ *size = op.u.read_memtype.nr_mfns; ++ *base = op.u.read_memtype.mfn; ++ *type = op.u.read_memtype.type; ++} ++ ++static int __init xen_num_var_ranges(void) ++{ ++ int ranges; ++ struct xen_platform_op op; ++ ++ op.cmd = XENPF_read_memtype; ++ ++ for (ranges = 0; ; ranges++) { ++ op.u.read_memtype.reg = ranges; ++ if (HYPERVISOR_dom0_op(&op) != 0) ++ break; ++ } ++ return ranges; ++} ++ ++/* ++ * DOM0 TODO: Need to fill in the remaining mtrr methods to have full ++ * working userland mtrr support. ++ */ ++static struct mtrr_ops xen_mtrr_ops = { ++ .vendor = X86_VENDOR_UNKNOWN, ++ .get_free_region = generic_get_free_region, ++ .set = xen_set_mtrr, ++ .get = xen_get_mtrr, ++ .have_wrcomb = positive_have_wrcomb, ++ .validate_add_page = generic_validate_add_page, ++ .use_intel_if = 0, ++ .num_var_ranges = xen_num_var_ranges, ++}; ++ ++void __init xen_init_mtrr(void) ++{ ++ /* ++ * Check that we're running under Xen, and privileged enough ++ * to play with MTRRs. ++ */ ++ if (!xen_initial_domain()) ++ return; ++ ++ /* ++ * Check that the CPU has an MTRR implementation we can ++ * support. 
++ */ ++ if (cpu_has_mtrr || ++ cpu_has_k6_mtrr || ++ cpu_has_cyrix_arr || ++ cpu_has_centaur_mcr) { ++ mtrr_if = &xen_mtrr_ops; ++ pat_init(); ++ } ++} +diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c +index ff95824..ebd4c51 100644 +--- a/arch/x86/kernel/crash.c ++++ b/arch/x86/kernel/crash.c +@@ -28,7 +28,6 @@ + #include + #include + +- + #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) + + static void kdump_nmi_callback(int cpu, struct die_args *args) +diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c +index d17d482..4d0aded 100644 +--- a/arch/x86/kernel/e820.c ++++ b/arch/x86/kernel/e820.c +@@ -750,6 +750,36 @@ static int __init find_overlapped_early(u64 start, u64 end) + return i; + } + ++u64 __init early_res_next_free(u64 addr) ++{ ++ int i; ++ u64 end = addr; ++ struct early_res *r; ++ ++ for (i = 0; i < MAX_EARLY_RES; i++) { ++ r = &early_res[i]; ++ if (addr >= r->start && addr < r->end) { ++ end = r->end; ++ break; ++ } ++ } ++ return end; ++} ++ ++u64 __init early_res_next_reserved(u64 addr, u64 max) ++{ ++ int i; ++ struct early_res *r; ++ u64 next_res = max; ++ ++ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { ++ r = &early_res[i]; ++ if ((r->start >= addr) && (r->start < next_res)) ++ next_res = r->start; ++ } ++ return next_res; ++} ++ + /* + * Drop the i-th range from the early reservation map, + * by copying any higher ranges down one over it, and +diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S +index c097e7d..7764118 100644 +--- a/arch/x86/kernel/entry_32.S ++++ b/arch/x86/kernel/entry_32.S +@@ -1088,6 +1088,9 @@ ENTRY(xen_failsafe_callback) + .previous + ENDPROC(xen_failsafe_callback) + ++BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, ++ xen_evtchn_do_upcall) ++ + #endif /* CONFIG_XEN */ + + #ifdef CONFIG_FUNCTION_TRACER +diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S +index b5c061f..a626344 100644 +--- a/arch/x86/kernel/entry_64.S ++++ 
b/arch/x86/kernel/entry_64.S +@@ -1364,6 +1364,9 @@ ENTRY(xen_failsafe_callback) + CFI_ENDPROC + END(xen_failsafe_callback) + ++apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ ++ xen_hvm_callback_vector xen_evtchn_do_upcall ++ + #endif /* CONFIG_XEN */ + + /* +diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c +index 0b06cd7..f59b07a 100644 +--- a/arch/x86/kernel/head64.c ++++ b/arch/x86/kernel/head64.c +@@ -79,6 +79,8 @@ void __init x86_64_start_kernel(char * real_mode_data) + /* Cleanup the over mapped high alias */ + cleanup_highmap(); + ++ max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; ++ + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { + #ifdef CONFIG_EARLY_PRINTK + set_intr_gate(i, &early_idt_handlers[i]); +diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c +index c771e1a..8b970b8 100644 +--- a/arch/x86/kernel/hpet.c ++++ b/arch/x86/kernel/hpet.c +@@ -98,7 +98,7 @@ static int __init hpet_setup(char *str) + } + __setup("hpet=", hpet_setup); + +-static int __init disable_hpet(char *str) ++int __init disable_hpet(char *str) + { + boot_hpet_disable = 1; + return 1; +diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c +index 99c4d30..919c1a8 100644 +--- a/arch/x86/kernel/ioport.c ++++ b/arch/x86/kernel/ioport.c +@@ -30,13 +30,29 @@ static void set_bitmap(unsigned long *bitmap, unsigned int base, + } + } + ++void native_set_io_bitmap(struct thread_struct *t, ++ unsigned long bytes_updated) ++{ ++ struct tss_struct *tss; ++ ++ if (!bytes_updated) ++ return; ++ ++ tss = &__get_cpu_var(init_tss); ++ ++ /* Update the TSS: */ ++ if (t->io_bitmap_ptr) ++ memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); ++ else ++ memset(tss->io_bitmap, 0xff, bytes_updated); ++} ++ + /* + * this changes the io permissions bitmap in the current task. 
+ */ + asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) + { + struct thread_struct *t = &current->thread; +- struct tss_struct *tss; + unsigned int i, max_long, bytes, bytes_updated; + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) +@@ -61,13 +77,13 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) + } + + /* +- * do it in the per-thread copy and in the TSS ... ++ * do it in the per-thread copy + * +- * Disable preemption via get_cpu() - we must not switch away ++ * Disable preemption - we must not switch away + * because the ->io_bitmap_max value must match the bitmap + * contents: + */ +- tss = &per_cpu(init_tss, get_cpu()); ++ preempt_disable(); + + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); + +@@ -85,10 +101,9 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) + + t->io_bitmap_max = bytes; + +- /* Update the TSS: */ +- memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); ++ set_io_bitmap(t, bytes_updated); + +- put_cpu(); ++ preempt_enable(); + + return 0; + } +@@ -119,11 +134,10 @@ static int do_iopl(unsigned int level, struct pt_regs *regs) + return 0; + } + +-#ifdef CONFIG_X86_32 +-long sys_iopl(struct pt_regs *regs) ++asmlinkage long sys_iopl(unsigned int level) + { +- unsigned int level = regs->bx; + struct thread_struct *t = &current->thread; ++ struct pt_regs *regs = task_pt_regs(current); + int rc; + + rc = do_iopl(level, regs); +@@ -135,9 +149,3 @@ long sys_iopl(struct pt_regs *regs) + out: + return rc; + } +-#else +-asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) +-{ +- return do_iopl(level, regs); +-} +-#endif +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c +index ec6ef60..fa5b061 100644 +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -109,6 +109,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) + + mutex_init(&mm->context.lock); + mm->context.size = 0; ++#ifdef CONFIG_XEN ++ 
mm->context.has_foreign_mappings = 0; ++#endif + old_mm = current->mm; + if (old_mm && old_mm->context.size > 0) { + mutex_lock(&old_mm->context.lock); +diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c +index 378e9a8..86ca771 100644 +--- a/arch/x86/kernel/microcode_core.c ++++ b/arch/x86/kernel/microcode_core.c +@@ -81,6 +81,8 @@ + #include + #include + ++#include ++#include + #include + #include + +@@ -503,7 +505,9 @@ static int __init microcode_init(void) + struct cpuinfo_x86 *c = &cpu_data(0); + int error; + +- if (c->x86_vendor == X86_VENDOR_INTEL) ++ if (xen_pv_domain()) ++ microcode_ops = init_xen_microcode(); ++ else if (c->x86_vendor == X86_VENDOR_INTEL) + microcode_ops = init_intel_microcode(); + else if (c->x86_vendor == X86_VENDOR_AMD) + microcode_ops = init_amd_microcode(); +diff --git a/arch/x86/kernel/microcode_xen.c b/arch/x86/kernel/microcode_xen.c +new file mode 100644 +index 0000000..16c742e +--- /dev/null ++++ b/arch/x86/kernel/microcode_xen.c +@@ -0,0 +1,201 @@ ++/* ++ * Xen microcode update driver ++ * ++ * Xen does most of the work here. We just pass the whole blob into ++ * Xen, and it will apply it to all CPUs as appropriate. Xen will ++ * worry about how different CPU models are actually updated. ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++ ++#include ++#include ++ ++MODULE_DESCRIPTION("Xen microcode update driver"); ++MODULE_LICENSE("GPL"); ++ ++struct xen_microcode { ++ size_t len; ++ char data[0]; ++}; ++ ++static int xen_microcode_update(int cpu) ++{ ++ int err; ++ struct xen_platform_op op; ++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu; ++ struct xen_microcode *uc = uci->mc; ++ ++ if (uc == NULL || uc->len == 0) { ++ /* ++ * We do all cpus at once, so we don't need to do ++ * other cpus explicitly (besides, these vcpu numbers ++ * have no relationship to underlying physical cpus). 
++ */ ++ return 0; ++ } ++ ++ op.cmd = XENPF_microcode_update; ++ set_xen_guest_handle(op.u.microcode.data, uc->data); ++ op.u.microcode.length = uc->len; ++ ++ err = HYPERVISOR_dom0_op(&op); ++ ++ if (err != 0) ++ printk(KERN_WARNING "microcode_xen: microcode update failed: %d\n", err); ++ ++ return err; ++} ++ ++static enum ucode_state xen_request_microcode_fw(int cpu, struct device *device) ++{ ++ char name[30]; ++ struct cpuinfo_x86 *c = &cpu_data(cpu); ++ const struct firmware *firmware; ++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu; ++ enum ucode_state ret; ++ struct xen_microcode *uc; ++ size_t size; ++ int err; ++ ++ switch (c->x86_vendor) { ++ case X86_VENDOR_INTEL: ++ snprintf(name, sizeof(name), "intel-ucode/%02x-%02x-%02x", ++ c->x86, c->x86_model, c->x86_mask); ++ break; ++ ++ case X86_VENDOR_AMD: ++ snprintf(name, sizeof(name), "amd-ucode/microcode_amd.bin"); ++ break; ++ ++ default: ++ return UCODE_NFOUND; ++ } ++ ++ err = request_firmware(&firmware, name, device); ++ if (err) { ++ pr_debug("microcode: data file %s load failed\n", name); ++ return UCODE_NFOUND; ++ } ++ ++ /* ++ * Only bother getting real firmware for cpu 0; the others get ++ * dummy placeholders. 
++ */ ++ if (cpu == 0) ++ size = firmware->size; ++ else ++ size = 0; ++ ++ if (uci->mc != NULL) { ++ vfree(uci->mc); ++ uci->mc = NULL; ++ } ++ ++ ret = UCODE_ERROR; ++ uc = vmalloc(sizeof(*uc) + size); ++ if (uc == NULL) ++ goto out; ++ ++ ret = UCODE_OK; ++ uc->len = size; ++ memcpy(uc->data, firmware->data, uc->len); ++ ++ uci->mc = uc; ++ ++out: ++ release_firmware(firmware); ++ ++ return ret; ++} ++ ++static enum ucode_state xen_request_microcode_user(int cpu, ++ const void __user *buf, size_t size) ++{ ++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu; ++ struct xen_microcode *uc; ++ enum ucode_state ret; ++ size_t unread; ++ ++ if (cpu != 0) { ++ /* No real firmware for non-zero cpus; just store a ++ placeholder */ ++ size = 0; ++ } ++ ++ if (uci->mc != NULL) { ++ vfree(uci->mc); ++ uci->mc = NULL; ++ } ++ ++ ret = UCODE_ERROR; ++ uc = vmalloc(sizeof(*uc) + size); ++ if (uc == NULL) ++ goto out; ++ ++ uc->len = size; ++ ++ ret = UCODE_NFOUND; ++ ++ /* XXX This sporadically returns uncopied bytes, so we return ++ EFAULT. As far as I can see, the usermode code ++ (microcode_ctl) isn't doing anything wrong... 
*/ ++ unread = copy_from_user(uc->data, buf, size); ++ ++ if (unread != 0) { ++ printk(KERN_WARNING "failed to read %zd of %zd bytes at %p -> %p\n", ++ unread, size, buf, uc->data); ++ goto out; ++ } ++ ++ ret = UCODE_OK; ++ ++out: ++ if (ret == 0) ++ uci->mc = uc; ++ else ++ vfree(uc); ++ ++ return ret; ++} ++ ++static void xen_microcode_fini_cpu(int cpu) ++{ ++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu; ++ ++ vfree(uci->mc); ++ uci->mc = NULL; ++} ++ ++static int xen_collect_cpu_info(int cpu, struct cpu_signature *sig) ++{ ++ sig->sig = 0; ++ sig->pf = 0; ++ sig->rev = 0; ++ ++ return 0; ++} ++ ++static struct microcode_ops microcode_xen_ops = { ++ .request_microcode_user = xen_request_microcode_user, ++ .request_microcode_fw = xen_request_microcode_fw, ++ .collect_cpu_info = xen_collect_cpu_info, ++ .apply_microcode = xen_microcode_update, ++ .microcode_fini_cpu = xen_microcode_fini_cpu, ++}; ++ ++struct microcode_ops * __init init_xen_microcode(void) ++{ ++ if (!xen_initial_domain()) ++ return NULL; ++ return &microcode_xen_ops; ++} +diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c +index 1b1739d..f7e115c 100644 +--- a/arch/x86/kernel/paravirt.c ++++ b/arch/x86/kernel/paravirt.c +@@ -376,6 +376,7 @@ struct pv_cpu_ops pv_cpu_ops = { + .swapgs = native_swapgs, + + .set_iopl_mask = native_set_iopl_mask, ++ .set_io_bitmap = native_set_io_bitmap, + .io_delay = native_io_delay, + + .start_context_switch = paravirt_nop, +diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c +index 1a2d4b1..2f158a5 100644 +--- a/arch/x86/kernel/pci-calgary_64.c ++++ b/arch/x86/kernel/pci-calgary_64.c +@@ -46,6 +46,7 @@ + #include + #include + #include ++#include + + #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT + int use_calgary __read_mostly = 1; +@@ -249,7 +250,7 @@ static unsigned long iommu_range_alloc(struct device *dev, + if (panic_on_overflow) + panic("Calgary: fix the allocator.\n"); + else +- return bad_dma_address; ++ return 
DMA_ERROR_CODE; + } + } + +@@ -265,11 +266,11 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, + void *vaddr, unsigned int npages, int direction) + { + unsigned long entry; +- dma_addr_t ret = bad_dma_address; ++ dma_addr_t ret = DMA_ERROR_CODE; + + entry = iommu_range_alloc(dev, tbl, npages); + +- if (unlikely(entry == bad_dma_address)) ++ if (unlikely(entry == DMA_ERROR_CODE)) + goto error; + + /* set the return dma address */ +@@ -284,7 +285,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, + error: + printk(KERN_WARNING "Calgary: failed to allocate %u pages in " + "iommu %p\n", npages, tbl); +- return bad_dma_address; ++ return DMA_ERROR_CODE; + } + + static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, +@@ -295,8 +296,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned long flags; + + /* were we called with bad_dma_address? */ +- badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); +- if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { ++ badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE); ++ if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) { + WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " + "address 0x%Lx\n", dma_addr); + return; +@@ -380,7 +381,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, + npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); + + entry = iommu_range_alloc(dev, tbl, npages); +- if (entry == bad_dma_address) { ++ if (entry == DMA_ERROR_CODE) { + /* makes sure unmap knows to stop */ + s->dma_length = 0; + goto error; +@@ -398,7 +399,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, + error: + calgary_unmap_sg(dev, sg, nelems, dir, NULL); + for_each_sg(sg, s, nelems, i) { +- sg->dma_address = bad_dma_address; ++ sg->dma_address = DMA_ERROR_CODE; + sg->dma_length = 0; + } + return 0; +@@ -453,7 +454,7 @@ static void* 
calgary_alloc_coherent(struct device *dev, size_t size, + + /* set up tces to cover the allocated range */ + mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); +- if (mapping == bad_dma_address) ++ if (mapping == DMA_ERROR_CODE) + goto free; + *dma_handle = mapping; + return ret; +@@ -734,7 +735,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev) + struct iommu_table *tbl = pci_iommu(dev->bus); + + /* reserve EMERGENCY_PAGES from bad_dma_address and up */ +- iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); ++ iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES); + + /* avoid the BIOS/VGA first 640KB-1MB region */ + /* for CalIOC2 - avoid the entire first MB */ +@@ -1349,6 +1350,23 @@ static void __init get_tce_space_from_tar(void) + return; + } + ++static int __init calgary_iommu_init(void) ++{ ++ int ret; ++ ++ /* ok, we're trying to use Calgary - let's roll */ ++ printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); ++ ++ ret = calgary_init(); ++ if (ret) { ++ printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " ++ "falling back to no_iommu\n", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ + void __init detect_calgary(void) + { + int bus; +@@ -1362,7 +1380,7 @@ void __init detect_calgary(void) + * if the user specified iommu=off or iommu=soft or we found + * another HW IOMMU already, bail out. + */ +- if (swiotlb || no_iommu || iommu_detected) ++ if (no_iommu || iommu_detected) + return; + + if (!use_calgary) +@@ -1447,9 +1465,7 @@ void __init detect_calgary(void) + printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", + specified_table_size); + +- /* swiotlb for devices that aren't behind the Calgary. 
*/ +- if (max_pfn > MAX_DMA32_PFN) +- swiotlb = 1; ++ x86_init.iommu.iommu_init = calgary_iommu_init; + } + return; + +@@ -1462,35 +1478,6 @@ cleanup: + } + } + +-int __init calgary_iommu_init(void) +-{ +- int ret; +- +- if (no_iommu || (swiotlb && !calgary_detected)) +- return -ENODEV; +- +- if (!calgary_detected) +- return -ENODEV; +- +- /* ok, we're trying to use Calgary - let's roll */ +- printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); +- +- ret = calgary_init(); +- if (ret) { +- printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " +- "falling back to no_iommu\n", ret); +- return ret; +- } +- +- force_iommu = 1; +- bad_dma_address = 0x0; +- /* dma_ops is set to swiotlb or nommu */ +- if (!dma_ops) +- dma_ops = &nommu_dma_ops; +- +- return 0; +-} +- + static int __init calgary_parse_options(char *p) + { + unsigned int bridge; +diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c +index 6ac3931..3e57c58 100644 +--- a/arch/x86/kernel/pci-dma.c ++++ b/arch/x86/kernel/pci-dma.c +@@ -11,10 +11,12 @@ + #include + #include + #include ++#include ++#include + + static int forbid_dac __read_mostly; + +-struct dma_map_ops *dma_ops; ++struct dma_map_ops *dma_ops = &nommu_dma_ops; + EXPORT_SYMBOL(dma_ops); + + static int iommu_sac_force __read_mostly; +@@ -42,9 +44,6 @@ int iommu_detected __read_mostly = 0; + */ + int iommu_pass_through __read_mostly; + +-dma_addr_t bad_dma_address __read_mostly = 0; +-EXPORT_SYMBOL(bad_dma_address); +- + /* Dummy device used for NULL arguments (normally ISA). 
*/ + struct device x86_dma_fallback_dev = { + .init_name = "fallback device", +@@ -126,18 +125,19 @@ void __init pci_iommu_alloc(void) + /* free the range so iommu could get some range less than 4G */ + dma32_free_bootmem(); + #endif ++ if (pci_xen_swiotlb_detect() || pci_swiotlb_detect()) ++ goto out; + +- /* +- * The order of these functions is important for +- * fall-back/fail-over reasons +- */ + gart_iommu_hole_init(); + + detect_calgary(); + + detect_intel_iommu(); + ++ /* needs to be called after gart_iommu_hole_init */ + amd_iommu_detect(); ++out: ++ pci_xen_swiotlb_init(); + + pci_swiotlb_init(); + } +@@ -289,25 +289,17 @@ static int __init pci_iommu_init(void) + #ifdef CONFIG_PCI + dma_debug_add_bus(&pci_bus_type); + #endif ++ x86_init.iommu.iommu_init(); + +- calgary_iommu_init(); +- +- intel_iommu_init(); +- +- amd_iommu_init(); ++ if (swiotlb || xen_swiotlb) { ++ printk(KERN_INFO "PCI-DMA: " ++ "Using software bounce buffering for IO (SWIOTLB)\n"); ++ swiotlb_print_info(); ++ } else ++ swiotlb_free(); + +- gart_iommu_init(); +- +- no_iommu_init(); + return 0; + } +- +-void pci_iommu_shutdown(void) +-{ +- gart_iommu_shutdown(); +- +- amd_iommu_shutdown(); +-} + /* Must execute after PCI subsystem */ + rootfs_initcall(pci_iommu_init); + +diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c +index 1c76691..8c9dd05 100644 +--- a/arch/x86/kernel/pci-gart_64.c ++++ b/arch/x86/kernel/pci-gart_64.c +@@ -39,6 +39,7 @@ + #include + #include + #include ++#include + + static unsigned long iommu_bus_base; /* GART remapping area (physical) */ + static unsigned long iommu_size; /* size of remapping area bytes */ +@@ -46,6 +47,8 @@ static unsigned long iommu_pages; /* .. and in pages */ + + static u32 *iommu_gatt_base; /* Remapping table */ + ++static dma_addr_t bad_dma_addr; ++ + /* + * If this is disabled the IOMMU will use an optimized flushing strategy + * of only flushing when an mapping is reused. 
With it true the GART is +@@ -216,7 +219,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, + if (panic_on_overflow) + panic("dma_map_area overflow %lu bytes\n", size); + iommu_full(dev, size, dir); +- return bad_dma_address; ++ return bad_dma_addr; + } + + for (i = 0; i < npages; i++) { +@@ -302,7 +305,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, + + if (nonforced_iommu(dev, addr, s->length)) { + addr = dma_map_area(dev, addr, s->length, dir, 0); +- if (addr == bad_dma_address) { ++ if (addr == bad_dma_addr) { + if (i > 0) + gart_unmap_sg(dev, sg, i, dir, NULL); + nents = 0; +@@ -455,7 +458,7 @@ error: + + iommu_full(dev, pages << PAGE_SHIFT, dir); + for_each_sg(sg, s, nents, i) +- s->dma_address = bad_dma_address; ++ s->dma_address = bad_dma_addr; + return 0; + } + +@@ -479,7 +482,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, + DMA_BIDIRECTIONAL, align_mask); + + flush_gart(); +- if (paddr != bad_dma_address) { ++ if (paddr != bad_dma_addr) { + *dma_addr = paddr; + return page_address(page); + } +@@ -499,6 +502,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr, + free_pages((unsigned long)vaddr, get_order(size)); + } + ++static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) ++{ ++ return (dma_addr == bad_dma_addr); ++} ++ + static int no_agp; + + static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) +@@ -689,14 +697,15 @@ static struct dma_map_ops gart_dma_ops = { + .unmap_page = gart_unmap_page, + .alloc_coherent = gart_alloc_coherent, + .free_coherent = gart_free_coherent, ++ .mapping_error = gart_mapping_error, + }; + +-void gart_iommu_shutdown(void) ++static void gart_iommu_shutdown(void) + { + struct pci_dev *dev; + int i; + +- if (no_agp && (dma_ops != &gart_dma_ops)) ++ if (no_agp) + return; + + for (i = 0; i < num_k8_northbridges; i++) { +@@ -711,7 +720,7 @@ void gart_iommu_shutdown(void) + } + } + 
+-void __init gart_iommu_init(void) ++int __init gart_iommu_init(void) + { + struct agp_kern_info info; + unsigned long iommu_start; +@@ -721,7 +730,7 @@ void __init gart_iommu_init(void) + long i; + + if (num_k8_northbridges == 0) +- return; ++ return 0; + + #ifndef CONFIG_AGP_AMD64 + no_agp = 1; +@@ -733,13 +742,6 @@ void __init gart_iommu_init(void) + (agp_copy_info(agp_bridge, &info) < 0); + #endif + +- if (swiotlb) +- return; +- +- /* Did we detect a different HW IOMMU? */ +- if (iommu_detected && !gart_iommu_aperture) +- return; +- + if (no_iommu || + (!force_iommu && max_pfn <= MAX_DMA32_PFN) || + !gart_iommu_aperture || +@@ -749,7 +751,7 @@ void __init gart_iommu_init(void) + "but GART IOMMU not available.\n"); + printk(KERN_WARNING "falling back to iommu=soft.\n"); + } +- return; ++ return 0; + } + + /* need to map that range */ +@@ -794,7 +796,7 @@ void __init gart_iommu_init(void) + + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; +- bad_dma_address = iommu_bus_base; ++ bad_dma_addr = iommu_bus_base; + iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); + + /* +@@ -841,6 +843,10 @@ void __init gart_iommu_init(void) + + flush_gart(); + dma_ops = &gart_dma_ops; ++ x86_platform.iommu_shutdown = gart_iommu_shutdown; ++ swiotlb = 0; ++ ++ return 0; + } + + void __init gart_parse_options(char *p) +diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c +index a3933d4..22be12b 100644 +--- a/arch/x86/kernel/pci-nommu.c ++++ b/arch/x86/kernel/pci-nommu.c +@@ -33,7 +33,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, + dma_addr_t bus = page_to_phys(page) + offset; + WARN_ON(size == 0); + if (!check_addr("map_single", dev, bus, size)) +- return bad_dma_address; ++ return DMA_ERROR_CODE; + flush_write_buffers(); + return bus; + } +@@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = { + .sync_sg_for_device = nommu_sync_sg_for_device, + .is_phys = 1, + }; +- +-void __init 
no_iommu_init(void) +-{ +- if (dma_ops) +- return; +- +- force_iommu = 0; /* no HW IOMMU */ +- dma_ops = &nommu_dma_ops; +-} +diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c +index aaa6b78..7d2829d 100644 +--- a/arch/x86/kernel/pci-swiotlb.c ++++ b/arch/x86/kernel/pci-swiotlb.c +@@ -42,18 +42,31 @@ static struct dma_map_ops swiotlb_dma_ops = { + .dma_supported = NULL, + }; + +-void __init pci_swiotlb_init(void) ++/* ++ * pci_swiotlb_detect - set swiotlb to 1 if necessary ++ * ++ * This returns non-zero if we are forced to use swiotlb (by the boot ++ * option). ++ */ ++int __init pci_swiotlb_detect(void) + { ++ int use_swiotlb = swiotlb | swiotlb_force; ++ + /* don't initialize swiotlb if iommu=off (no_iommu=1) */ + #ifdef CONFIG_X86_64 +- if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) ++ if (!no_iommu && max_pfn > MAX_DMA32_PFN) + swiotlb = 1; + #endif + if (swiotlb_force) + swiotlb = 1; ++ ++ return use_swiotlb; ++} ++ ++void __init pci_swiotlb_init(void) ++{ + if (swiotlb) { +- printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); +- swiotlb_init(); ++ swiotlb_init(0); + dma_ops = &swiotlb_dma_ops; + } + } +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index 5fd5b07..11d8667 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -73,16 +73,12 @@ void exit_thread(void) + unsigned long *bp = t->io_bitmap_ptr; + + if (bp) { +- struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); +- ++ preempt_disable(); + t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); +- /* +- * Careful, clear this in the TSS too: +- */ +- memset(tss->io_bitmap, 0xff, t->io_bitmap_max); ++ set_io_bitmap(t, t->io_bitmap_max); + t->io_bitmap_max = 0; +- put_cpu(); ++ preempt_enable(); + kfree(bp); + } + } +@@ -199,19 +195,10 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + hard_enable_TSC(); + } + +- if (test_tsk_thread_flag(next_p, 
TIF_IO_BITMAP)) { +- /* +- * Copy the relevant range of the IO bitmap. +- * Normally this is 128 bytes or less: +- */ +- memcpy(tss->io_bitmap, next->io_bitmap_ptr, +- max(prev->io_bitmap_max, next->io_bitmap_max)); +- } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { +- /* +- * Clear any possible leftover bits: +- */ +- memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); +- } ++ if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP) || ++ test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) ++ set_io_bitmap(next, ++ max(prev->io_bitmap_max, next->io_bitmap_max)); + } + + int sys_fork(struct pt_regs *regs) +diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c +index dfdfe46..b12fe8d 100644 +--- a/arch/x86/kernel/pvclock.c ++++ b/arch/x86/kernel/pvclock.c +@@ -111,6 +111,11 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) + + static atomic64_t last_value = ATOMIC64_INIT(0); + ++void pvclock_resume(void) ++{ ++ atomic64_set(&last_value, 0); ++} ++ + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) + { + struct pvclock_shadow_time shadow; +diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c +index 200fcde..ff8cc40 100644 +--- a/arch/x86/kernel/reboot.c ++++ b/arch/x86/kernel/reboot.c +@@ -23,7 +23,7 @@ + # include + # include + #else +-# include ++# include + #endif + + /* +@@ -647,7 +647,7 @@ void native_machine_shutdown(void) + #endif + + #ifdef CONFIG_X86_64 +- pci_iommu_shutdown(); ++ x86_platform.iommu_shutdown(); + #endif + } + +diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c +index 5449a26..56b4707 100644 +--- a/arch/x86/kernel/setup.c ++++ b/arch/x86/kernel/setup.c +@@ -70,6 +70,7 @@ + #include + + #include