author     William Pitcock <nenolod@dereferenced.org>   2011-01-13 06:57:09 -0600
committer  William Pitcock <nenolod@dereferenced.org>   2011-01-13 06:57:09 -0600
commit     172a1491b99d8394dfc0de201f97ea1e6d1be846 (patch)
tree       c318d5cc2d1febc1c72d12943c7071378ebc0622 /testing
parent     9fda7c74ade267723862a150620f2083acd2bdf7 (diff)
testing/linux-xen0: new aport
Diffstat (limited to 'testing')
 -rw-r--r--  testing/linux-xen0/APKBUILD              |   147
 -rw-r--r--  testing/linux-xen0/kernelconfig.x86_64   |  4241
 -rw-r--r--  testing/linux-xen0/pvops.patch           | 37837
3 files changed, 42225 insertions, 0 deletions
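
For orientation, this is how a new aport like this is typically built once the tree is checked out; a minimal sketch assuming a configured abuild environment on an x86_64 host, not part of the commit itself:

    # hypothetical walkthrough; assumes abuild and its signing keys are set up
    cd aports/testing/linux-xen0

    # regenerate the md5sums= field after editing anything listed in source=
    abuild checksum

    # fetch sources, run prepare()/build()/package(), and produce the
    # linux-xen0, linux-xen0-dev and linux-firmware packages
    abuild -r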
diff --git a/testing/linux-xen0/APKBUILD b/testing/linux-xen0/APKBUILD
new file mode 100644
index 000000000..0efd6d06a
--- /dev/null
+++ b/testing/linux-xen0/APKBUILD
@@ -0,0 +1,147 @@
+# Maintainer: William Pitcock <nenolod@dereferenced.org>
+
+_flavor=xen0
+pkgname=linux-${_flavor}
+pkgver=2.6.32.28
+_kernver=2.6.32
+pkgrel=0
+pkgdesc="Linux kernel with dom0 support (no grsecurity)"
+url=http://grsecurity.net
+depends="mkinitfs linux-firmware xen"
+makedepends="perl installkernel bash xen"
+options="!strip"
+_config=${config:-kernelconfig.${CARCH}}
+install=
+source="ftp://ftp.kernel.org/pub/linux/kernel/v2.6/linux-$_kernver.tar.bz2
+	http://www.kernel.org/pub/linux/kernel/v2.6/longterm/v${pkgver%.*}/patch-$pkgver.bz2
+
+	pvops.patch
+
+	kernelconfig.x86_64"
+subpackages="$pkgname-dev linux-firmware:firmware"
+arch="x86_64"
+license="GPL-2"
+
+_abi_release=${pkgver}-${_flavor}
+
+prepare() {
+	local _patch_failed=
+	cd "$srcdir"/linux-$_kernver
+	if [ "$_kernver" != "$pkgver" ]; then
+		bunzip2 -c < ../patch-$pkgver.bz2 | patch -p1 -N || return 1
+	fi
+
+	# first apply patches in specified order
+	for i in $source; do
+		case $i in
+		*.patch)
+			bn=$(basename $i)
+
+			msg "Applying $bn..."
+			if ! patch -s -p1 -N -i "$srcdir"/$bn; then
+				echo $bn >>failed
+				_patch_failed=1
+			fi
+			;;
+		esac
+	done
+
+	if ! [ -z "$_patch_failed" ]; then
+		error "The following patches failed:"
+		cat failed
+		return 1
+	fi
+
+	echo "-xen0" > "$srcdir"/linux-$_kernver/localversion-xen0
+
+	mkdir -p "$srcdir"/build
+	cp "$srcdir"/$_config "$srcdir"/build/.config || return 1
+	make -C "$srcdir"/linux-$_kernver O="$srcdir"/build HOSTCC="${CC:-gcc}" V=1 \
+		silentoldconfig
+}
+
+# this is so we can do: 'abuild menuconfig' to reconfigure kernel
+menuconfig() {
+	cd "$srcdir"/build || return 1
+	make menuconfig
+	cp .config "$startdir"/$_config
+}
+
+build() {
+	cd "$srcdir"/build
+	make CC="${CC:-gcc}" \
+		KBUILD_BUILD_VERSION="$((pkgrel + 1 ))-Alpine" V=1 \
+		|| return 1
+}
+
+package() {
+	cd "$srcdir"/build
+	mkdir -p "$pkgdir"/boot "$pkgdir"/lib/modules
+	make -j1 modules_install firmware_install install \
+		INSTALL_MOD_PATH="$pkgdir" \
+		INSTALL_PATH="$pkgdir"/boot \
+		|| return 1
+
+	rm -f "$pkgdir"/lib/modules/${_abi_release}/build \
+		"$pkgdir"/lib/modules/${_abi_release}/source
+	install -D include/config/kernel.release \
+		"$pkgdir"/usr/share/kernel/$_flavor/kernel.release
+}
+
+dev() {
+	# copy the only the parts that we really need for build 3rd party
+	# kernel modules and install those as /usr/src/linux-headers,
+	# simlar to what ubuntu does
+	#
+	# this way you dont need to install the 300-400 kernel sources to
+	# build a tiny kernel module
+	#
+	pkgdesc="Headers and script for third party modules for grsec kernel"
+	local dir="$subpkgdir"/usr/src/linux-headers-${_abi_release}
+
+	# first we import config, run prepare to set up for building
+	# external modules, and create the scripts
+	mkdir -p "$dir"
+	cp "$srcdir"/$_config "$dir"/.config
+	make -j1 -C "$srcdir"/linux-$_kernver O="$dir" HOSTCC="${CC:-gcc}" \
+		silentoldconfig prepare scripts
+
+	# remove the stuff that poits to real sources. we want 3rd party
+	# modules to believe this is the soruces
+	rm "$dir"/Makefile "$dir"/source
+
+	# copy the needed stuff from real sources
+	#
+	# this is taken from ubuntu kernel build script
+	# http://kernel.ubuntu.com/git?p=ubuntu/ubuntu-jaunty.git;a=blob;f=debian/rules.d/3-binary-indep.mk;hb=HEAD
+	cd "$srcdir"/linux-$_kernver
+	find . -path './include/*' -prune -o -path './scripts/*' -prune \
+		-o -type f \( -name 'Makefile*' -o -name 'Kconfig*' \
+		-o -name 'Kbuild*' -o -name '*.sh' -o -name '*.pl' \
+		-o -name '*.lds' \) | cpio -pdm "$dir"
+	cp -a drivers/media/dvb/dvb-core/*.h "$dir"/drivers/media/dvb/dvb-core
+	cp -a drivers/media/video/*.h "$dir"/drivers/media/video
+	cp -a drivers/media/dvb/frontends/*.h "$dir"/drivers/media/dvb/frontends
+	cp -a scripts include "$dir"
+	find $(find arch -name include -type d -print) -type f \
+		| cpio -pdm "$dir"
+
+	install -Dm644 "$srcdir"/build/Module.symvers \
+		"$dir"/Module.symvers
+
+	mkdir -p "$subpkgdir"/lib/modules/${_abi_release}
+	ln -sf /usr/src/linux-headers-${_abi_release} \
+		"$subpkgdir"/lib/modules/${_abi_release}/build
+}
+
+firmware() {
+	pkgdesc="Firmware for linux kernel"
+	replaces="linux-grsec linux-vserver"
+	mkdir -p "$subpkgdir"/lib
+	mv "$pkgdir"/lib/firmware "$subpkgdir"/lib/
+}
+
+md5sums="260551284ac224c3a43c4adac7df4879  linux-2.6.32.tar.bz2
+fc8c36b4638d8384a5d26a50413a1d11  patch-2.6.32.28.bz2
+2c678c4610b9d425fd3791e4ebaa0bdd  pvops.patch
+9bc561fbe3a4ed775bf3e9cf2f38a228  kernelconfig.x86_64"
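
The dev() function above installs a pruned kbuild tree so third-party modules can be compiled without the full kernel sources. A minimal sketch of how an out-of-tree module would use it (hypothetical module directory; the release string 2.6.32.28-xen0 follows _abi_release above):

    # build an external module against the linux-xen0-dev headers
    cd ~/hello-mod            # contains the module's Kbuild/Makefile and sources
    make -C /lib/modules/2.6.32.28-xen0/build M=$PWD modules
    # the 'build' symlink resolves to /usr/src/linux-headers-2.6.32.28-xen0,
    # the tree populated by dev()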
diff --git a/testing/linux-xen0/kernelconfig.x86_64 b/testing/linux-xen0/kernelconfig.x86_64
new file mode 100644
index 000000000..4343c0d83
--- /dev/null
+++ b/testing/linux-xen0/kernelconfig.x86_64
@@ -0,0 +1,4241 @@
+#
+# Automatically generated make config: don't edit
+# Linux kernel version: 2.6.32.27
+# Tue Dec 28 00:27:19 2010
+#
+CONFIG_64BIT=y
+# CONFIG_X86_32 is not set
+CONFIG_X86_64=y
[... 4241 lines of generated x86_64 kernel configuration; the Xen/dom0 options that give this flavor its purpose include:]
+CONFIG_PARAVIRT_GUEST=y
+CONFIG_XEN=y
+CONFIG_XEN_PVHVM=y
+CONFIG_XEN_MAX_DOMAIN_MEMORY=128
+CONFIG_XEN_SAVE_RESTORE=y
+CONFIG_XEN_DEBUG_FS=y
+CONFIG_SWIOTLB_XEN=y
+CONFIG_MICROCODE_XEN=y
+CONFIG_XEN_DOM0=y
+CONFIG_XEN_PRIVILEGED_GUEST=y
+CONFIG_XEN_DOM0_PCI=y
+CONFIG_XEN_PCI_PASSTHROUGH=y
+CONFIG_PCI_XEN=y
+CONFIG_XEN_PCIDEV_FRONTEND=y
+CONFIG_XEN_BLKDEV_FRONTEND=y
+CONFIG_XEN_NETDEV_FRONTEND=y
+CONFIG_XEN_KBDDEV_FRONTEND=m
+CONFIG_HVC_XEN=y
[... configuration dump truncated mid-file in the source; the diff for testing/linux-xen0/pvops.patch (37837 lines) is likewise not shown ...]
CONFIG_USB_GSPCA_SN9C20X_EVDEV is not set +CONFIG_USB_GSPCA_SONIXB=m +CONFIG_USB_GSPCA_SONIXJ=m +CONFIG_USB_GSPCA_SPCA500=m +CONFIG_USB_GSPCA_SPCA501=m +CONFIG_USB_GSPCA_SPCA505=m +CONFIG_USB_GSPCA_SPCA506=m +CONFIG_USB_GSPCA_SPCA508=m +CONFIG_USB_GSPCA_SPCA561=m +CONFIG_USB_GSPCA_SQ905=m +CONFIG_USB_GSPCA_SQ905C=m +CONFIG_USB_GSPCA_STK014=m +CONFIG_USB_GSPCA_SUNPLUS=m +CONFIG_USB_GSPCA_T613=m +CONFIG_USB_GSPCA_TV8532=m +CONFIG_USB_GSPCA_VC032X=m +CONFIG_USB_GSPCA_ZC3XX=m +CONFIG_VIDEO_PVRUSB2=m +CONFIG_VIDEO_PVRUSB2_SYSFS=y +CONFIG_VIDEO_PVRUSB2_DVB=y +# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set +CONFIG_VIDEO_HDPVR=m +CONFIG_VIDEO_EM28XX=m +CONFIG_VIDEO_EM28XX_ALSA=m +CONFIG_VIDEO_EM28XX_DVB=m +CONFIG_VIDEO_CX231XX=m +CONFIG_VIDEO_CX231XX_ALSA=m +CONFIG_VIDEO_CX231XX_DVB=m +CONFIG_VIDEO_USBVISION=m +CONFIG_USB_ET61X251=m +CONFIG_USB_SN9C102=m +CONFIG_USB_ZC0301=m +CONFIG_USB_PWC_INPUT_EVDEV=y +CONFIG_USB_ZR364XX=m +CONFIG_USB_STKWEBCAM=m +CONFIG_USB_S2255=m +CONFIG_RADIO_ADAPTERS=y +CONFIG_RADIO_GEMTEK_PCI=m +CONFIG_RADIO_MAXIRADIO=m +CONFIG_RADIO_MAESTRO=m +# CONFIG_I2C_SI4713 is not set +# CONFIG_RADIO_SI4713 is not set +CONFIG_USB_DSBR=m +# CONFIG_RADIO_SI470X is not set +CONFIG_USB_MR800=m +CONFIG_RADIO_TEA5764=m +CONFIG_DVB_MAX_ADAPTERS=8 +# CONFIG_DVB_DYNAMIC_MINORS is not set +CONFIG_DVB_CAPTURE_DRIVERS=y + +# +# Supported SAA7146 based PCI Adapters +# +CONFIG_TTPCI_EEPROM=m +CONFIG_DVB_AV7110=m +CONFIG_DVB_AV7110_OSD=y +CONFIG_DVB_BUDGET_CORE=m +CONFIG_DVB_BUDGET=m +CONFIG_DVB_BUDGET_CI=m +CONFIG_DVB_BUDGET_AV=m +CONFIG_DVB_BUDGET_PATCH=m + +# +# Supported USB Adapters +# +CONFIG_DVB_USB=m +# CONFIG_DVB_USB_DEBUG is not set +CONFIG_DVB_USB_A800=m +CONFIG_DVB_USB_DIBUSB_MB=m +# CONFIG_DVB_USB_DIBUSB_MB_FAULTY is not set +CONFIG_DVB_USB_DIBUSB_MC=m +CONFIG_DVB_USB_DIB0700=m +CONFIG_DVB_USB_UMT_010=m +CONFIG_DVB_USB_CXUSB=m +CONFIG_DVB_USB_M920X=m +CONFIG_DVB_USB_GL861=m +CONFIG_DVB_USB_AU6610=m +CONFIG_DVB_USB_DIGITV=m +CONFIG_DVB_USB_VP7045=m +CONFIG_DVB_USB_VP702X=m +CONFIG_DVB_USB_GP8PSK=m +CONFIG_DVB_USB_NOVA_T_USB2=m +CONFIG_DVB_USB_TTUSB2=m +CONFIG_DVB_USB_DTT200U=m +CONFIG_DVB_USB_OPERA1=m +CONFIG_DVB_USB_AF9005=m +CONFIG_DVB_USB_AF9005_REMOTE=m +CONFIG_DVB_USB_DW2102=m +CONFIG_DVB_USB_CINERGY_T2=m +CONFIG_DVB_USB_ANYSEE=m +CONFIG_DVB_USB_DTV5100=m +CONFIG_DVB_USB_AF9015=m +CONFIG_DVB_USB_CE6230=m +# CONFIG_DVB_USB_FRIIO is not set +CONFIG_DVB_TTUSB_BUDGET=m +CONFIG_DVB_TTUSB_DEC=m +CONFIG_SMS_SIANO_MDTV=m + +# +# Siano module components +# +CONFIG_SMS_USB_DRV=m +CONFIG_SMS_SDIO_DRV=m + +# +# Supported FlexCopII (B2C2) Adapters +# +CONFIG_DVB_B2C2_FLEXCOP=m +CONFIG_DVB_B2C2_FLEXCOP_PCI=m +CONFIG_DVB_B2C2_FLEXCOP_USB=m +# CONFIG_DVB_B2C2_FLEXCOP_DEBUG is not set + +# +# Supported BT878 Adapters +# +CONFIG_DVB_BT8XX=m + +# +# Supported Pluto2 Adapters +# +CONFIG_DVB_PLUTO2=m + +# +# Supported SDMC DM1105 Adapters +# +CONFIG_DVB_DM1105=m + +# +# Supported FireWire (IEEE 1394) Adapters +# +CONFIG_DVB_FIREDTV=m +CONFIG_DVB_FIREDTV_IEEE1394=y +CONFIG_DVB_FIREDTV_INPUT=y + +# +# Supported Earthsoft PT1 Adapters +# +# CONFIG_DVB_PT1 is not set + +# +# Supported DVB Frontends +# +# CONFIG_DVB_FE_CUSTOMISE is not set +CONFIG_DVB_STB0899=m +CONFIG_DVB_STB6100=m +CONFIG_DVB_STV090x=m +CONFIG_DVB_STV6110x=m +CONFIG_DVB_CX24110=m +CONFIG_DVB_CX24123=m +CONFIG_DVB_MT312=m +CONFIG_DVB_ZL10036=m +CONFIG_DVB_ZL10039=m +CONFIG_DVB_S5H1420=m +CONFIG_DVB_STV0288=m +CONFIG_DVB_STB6000=m +CONFIG_DVB_STV0299=m +CONFIG_DVB_STV6110=m +CONFIG_DVB_STV0900=m +CONFIG_DVB_TDA8083=m +CONFIG_DVB_TDA10086=m 
+CONFIG_DVB_TDA8261=m +CONFIG_DVB_VES1X93=m +CONFIG_DVB_TUNER_ITD1000=m +CONFIG_DVB_TUNER_CX24113=m +CONFIG_DVB_TDA826X=m +CONFIG_DVB_TUA6100=m +CONFIG_DVB_CX24116=m +CONFIG_DVB_SI21XX=m +CONFIG_DVB_SP8870=m +CONFIG_DVB_SP887X=m +CONFIG_DVB_CX22700=m +CONFIG_DVB_CX22702=m +CONFIG_DVB_L64781=m +CONFIG_DVB_TDA1004X=m +CONFIG_DVB_NXT6000=m +CONFIG_DVB_MT352=m +CONFIG_DVB_ZL10353=m +CONFIG_DVB_DIB3000MB=m +CONFIG_DVB_DIB3000MC=m +CONFIG_DVB_DIB7000M=m +CONFIG_DVB_DIB7000P=m +CONFIG_DVB_TDA10048=m +CONFIG_DVB_AF9013=m +CONFIG_DVB_VES1820=m +CONFIG_DVB_TDA10021=m +CONFIG_DVB_TDA10023=m +CONFIG_DVB_STV0297=m +CONFIG_DVB_NXT200X=m +CONFIG_DVB_OR51211=m +CONFIG_DVB_OR51132=m +CONFIG_DVB_BCM3510=m +CONFIG_DVB_LGDT330X=m +CONFIG_DVB_LGDT3305=m +CONFIG_DVB_S5H1409=m +CONFIG_DVB_AU8522=m +CONFIG_DVB_S5H1411=m +CONFIG_DVB_DIB8000=m +CONFIG_DVB_PLL=m +CONFIG_DVB_TUNER_DIB0070=m +CONFIG_DVB_LNBP21=m +CONFIG_DVB_ISL6405=m +CONFIG_DVB_ISL6421=m +CONFIG_DVB_ISL6423=m +CONFIG_DVB_LGS8GXX=m +CONFIG_DAB=y +CONFIG_USB_DABUSB=m + +# +# Graphics support +# +CONFIG_AGP=m +CONFIG_AGP_AMD64=m +CONFIG_AGP_INTEL=m +CONFIG_AGP_SIS=m +CONFIG_AGP_VIA=m +# CONFIG_VGA_ARB is not set +CONFIG_DRM=m +CONFIG_DRM_KMS_HELPER=m +CONFIG_DRM_TTM=m +CONFIG_DRM_TDFX=m +CONFIG_DRM_R128=m +CONFIG_DRM_RADEON=m +CONFIG_DRM_I810=m +CONFIG_DRM_I830=m +CONFIG_DRM_I915=m +# CONFIG_DRM_I915_KMS is not set +CONFIG_DRM_MGA=m +CONFIG_DRM_SIS=m +CONFIG_DRM_VIA=m +CONFIG_DRM_SAVAGE=m +CONFIG_VGASTATE=m +CONFIG_VIDEO_OUTPUT_CONTROL=m +CONFIG_FB=m +# CONFIG_FIRMWARE_EDID is not set +CONFIG_FB_DDC=m +# CONFIG_FB_BOOT_VESA_SUPPORT is not set +CONFIG_FB_CFB_FILLRECT=m +CONFIG_FB_CFB_COPYAREA=m +CONFIG_FB_CFB_IMAGEBLIT=m +# CONFIG_FB_CFB_REV_PIXELS_IN_BYTE is not set +CONFIG_FB_SYS_FILLRECT=m +CONFIG_FB_SYS_COPYAREA=m +CONFIG_FB_SYS_IMAGEBLIT=m +# CONFIG_FB_FOREIGN_ENDIAN is not set +CONFIG_FB_SYS_FOPS=m +CONFIG_FB_DEFERRED_IO=y +CONFIG_FB_HECUBA=m +CONFIG_FB_SVGALIB=m +# CONFIG_FB_MACMODES is not set +CONFIG_FB_BACKLIGHT=y +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_TILEBLITTING=y + +# +# Frame buffer hardware drivers +# +CONFIG_FB_CIRRUS=m +CONFIG_FB_PM2=m +CONFIG_FB_PM2_FIFO_DISCONNECT=y +CONFIG_FB_CYBER2000=m +CONFIG_FB_ARC=m +CONFIG_FB_VGA16=m +CONFIG_FB_UVESA=m +CONFIG_FB_N411=m +CONFIG_FB_HGA=m +# CONFIG_FB_HGA_ACCEL is not set +CONFIG_FB_S1D13XXX=m +CONFIG_FB_NVIDIA=m +CONFIG_FB_NVIDIA_I2C=y +# CONFIG_FB_NVIDIA_DEBUG is not set +CONFIG_FB_NVIDIA_BACKLIGHT=y +CONFIG_FB_RIVA=m +CONFIG_FB_RIVA_I2C=y +# CONFIG_FB_RIVA_DEBUG is not set +CONFIG_FB_RIVA_BACKLIGHT=y +CONFIG_FB_LE80578=m +CONFIG_FB_CARILLO_RANCH=m +CONFIG_FB_INTEL=m +# CONFIG_FB_INTEL_DEBUG is not set +CONFIG_FB_INTEL_I2C=y +CONFIG_FB_MATROX=m +CONFIG_FB_MATROX_MILLENIUM=y +CONFIG_FB_MATROX_MYSTIQUE=y +CONFIG_FB_MATROX_G=y +CONFIG_FB_MATROX_I2C=m +CONFIG_FB_MATROX_MAVEN=m +CONFIG_FB_RADEON=m +CONFIG_FB_RADEON_I2C=y +CONFIG_FB_RADEON_BACKLIGHT=y +# CONFIG_FB_RADEON_DEBUG is not set +CONFIG_FB_ATY128=m +CONFIG_FB_ATY128_BACKLIGHT=y +CONFIG_FB_ATY=m +CONFIG_FB_ATY_CT=y +CONFIG_FB_ATY_GENERIC_LCD=y +CONFIG_FB_ATY_GX=y +CONFIG_FB_ATY_BACKLIGHT=y +CONFIG_FB_S3=m +CONFIG_FB_SAVAGE=m +CONFIG_FB_SAVAGE_I2C=y +CONFIG_FB_SAVAGE_ACCEL=y +CONFIG_FB_SIS=m +CONFIG_FB_SIS_300=y +CONFIG_FB_SIS_315=y +CONFIG_FB_VIA=m +CONFIG_FB_NEOMAGIC=m +CONFIG_FB_KYRO=m +CONFIG_FB_3DFX=m +CONFIG_FB_3DFX_ACCEL=y +CONFIG_FB_3DFX_I2C=y +CONFIG_FB_VOODOO1=m +CONFIG_FB_VT8623=m +CONFIG_FB_TRIDENT=m +CONFIG_FB_ARK=m +CONFIG_FB_PM3=m +CONFIG_FB_CARMINE=m +CONFIG_FB_CARMINE_DRAM_EVAL=y +# CONFIG_CARMINE_DRAM_CUSTOM is not set 
+CONFIG_FB_GEODE=y +CONFIG_FB_GEODE_LX=m +CONFIG_FB_GEODE_GX=m +CONFIG_FB_GEODE_GX1=m +CONFIG_FB_TMIO=m +CONFIG_FB_TMIO_ACCELL=y +CONFIG_FB_SM501=m +# CONFIG_FB_VIRTUAL is not set +CONFIG_XEN_FBDEV_FRONTEND=m +CONFIG_FB_METRONOME=m +CONFIG_FB_MB862XX=m +# CONFIG_FB_MB862XX_PCI_GDC is not set +CONFIG_FB_BROADSHEET=m +CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_LCD_CLASS_DEVICE=m +CONFIG_LCD_LMS283GF05=m +CONFIG_LCD_LTV350QV=m +CONFIG_LCD_ILI9320=m +CONFIG_LCD_TDO24M=m +CONFIG_LCD_VGG2432A4=m +CONFIG_LCD_PLATFORM=m +CONFIG_BACKLIGHT_CLASS_DEVICE=m +CONFIG_BACKLIGHT_GENERIC=m +CONFIG_BACKLIGHT_PROGEAR=m +CONFIG_BACKLIGHT_CARILLO_RANCH=m +CONFIG_BACKLIGHT_MBP_NVIDIA=m +CONFIG_BACKLIGHT_SAHARA=m + +# +# Display device support +# +CONFIG_DISPLAY_SUPPORT=m + +# +# Display hardware drivers +# + +# +# Console display driver support +# +CONFIG_VGA_CONSOLE=y +# CONFIG_VGACON_SOFT_SCROLLBACK is not set +CONFIG_DUMMY_CONSOLE=y +CONFIG_FRAMEBUFFER_CONSOLE=m +CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y +CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y +# CONFIG_FONTS is not set +CONFIG_FONT_8x8=y +CONFIG_FONT_8x16=y +# CONFIG_LOGO is not set +CONFIG_SOUND=m +CONFIG_SOUND_OSS_CORE=y +CONFIG_SOUND_OSS_CORE_PRECLAIM=y +CONFIG_SND=m +CONFIG_SND_TIMER=m +CONFIG_SND_PCM=m +CONFIG_SND_HWDEP=m +CONFIG_SND_RAWMIDI=m +CONFIG_SND_JACK=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_OSSEMUL=y +CONFIG_SND_MIXER_OSS=m +CONFIG_SND_PCM_OSS=m +CONFIG_SND_PCM_OSS_PLUGINS=y +CONFIG_SND_SEQUENCER_OSS=y +CONFIG_SND_HRTIMER=m +CONFIG_SND_SEQ_HRTIMER_DEFAULT=y +CONFIG_SND_DYNAMIC_MINORS=y +# CONFIG_SND_SUPPORT_OLD_API is not set +# CONFIG_SND_VERBOSE_PROCFS is not set +# CONFIG_SND_VERBOSE_PRINTK is not set +# CONFIG_SND_DEBUG is not set +CONFIG_SND_VMASTER=y +CONFIG_SND_DMA_SGBUF=y +CONFIG_SND_RAWMIDI_SEQ=m +CONFIG_SND_OPL3_LIB_SEQ=m +# CONFIG_SND_OPL4_LIB_SEQ is not set +# CONFIG_SND_SBAWE_SEQ is not set +CONFIG_SND_EMU10K1_SEQ=m +CONFIG_SND_MPU401_UART=m +CONFIG_SND_OPL3_LIB=m +CONFIG_SND_VX_LIB=m +CONFIG_SND_AC97_CODEC=m +CONFIG_SND_DRIVERS=y +CONFIG_SND_PCSP=m +CONFIG_SND_DUMMY=m +CONFIG_SND_VIRMIDI=m +CONFIG_SND_MTPAV=m +CONFIG_SND_MTS64=m +CONFIG_SND_SERIAL_U16550=m +CONFIG_SND_MPU401=m +CONFIG_SND_PORTMAN2X4=m +CONFIG_SND_AC97_POWER_SAVE=y +CONFIG_SND_AC97_POWER_SAVE_DEFAULT=0 +CONFIG_SND_SB_COMMON=m +CONFIG_SND_SB16_DSP=m +CONFIG_SND_PCI=y +CONFIG_SND_AD1889=m +CONFIG_SND_ALS300=m +CONFIG_SND_ALS4000=m +CONFIG_SND_ALI5451=m +CONFIG_SND_ATIIXP=m +CONFIG_SND_ATIIXP_MODEM=m +CONFIG_SND_AU8810=m +CONFIG_SND_AU8820=m +CONFIG_SND_AU8830=m +CONFIG_SND_AW2=m +CONFIG_SND_AZT3328=m +CONFIG_SND_BT87X=m +# CONFIG_SND_BT87X_OVERCLOCK is not set +CONFIG_SND_CA0106=m +CONFIG_SND_CMIPCI=m +CONFIG_SND_OXYGEN_LIB=m +CONFIG_SND_OXYGEN=m +CONFIG_SND_CS4281=m +CONFIG_SND_CS46XX=m +CONFIG_SND_CS46XX_NEW_DSP=y +CONFIG_SND_CS5530=m +CONFIG_SND_CS5535AUDIO=m +CONFIG_SND_CTXFI=m +CONFIG_SND_DARLA20=m +CONFIG_SND_GINA20=m +CONFIG_SND_LAYLA20=m +CONFIG_SND_DARLA24=m +CONFIG_SND_GINA24=m +CONFIG_SND_LAYLA24=m +CONFIG_SND_MONA=m +CONFIG_SND_MIA=m +CONFIG_SND_ECHO3G=m +CONFIG_SND_INDIGO=m +CONFIG_SND_INDIGOIO=m +CONFIG_SND_INDIGODJ=m +CONFIG_SND_INDIGOIOX=m +CONFIG_SND_INDIGODJX=m +CONFIG_SND_EMU10K1=m +CONFIG_SND_EMU10K1X=m +CONFIG_SND_ENS1370=m +CONFIG_SND_ENS1371=m +CONFIG_SND_ES1938=m +CONFIG_SND_ES1968=m +CONFIG_SND_FM801=m +# CONFIG_SND_FM801_TEA575X_BOOL is not set +CONFIG_SND_HDA_INTEL=m +CONFIG_SND_HDA_HWDEP=y +# CONFIG_SND_HDA_RECONFIG is not set +CONFIG_SND_HDA_INPUT_BEEP=y +CONFIG_SND_HDA_INPUT_JACK=y +# CONFIG_SND_HDA_PATCH_LOADER is not 
set +CONFIG_SND_HDA_CODEC_REALTEK=y +CONFIG_SND_HDA_CODEC_ANALOG=y +CONFIG_SND_HDA_CODEC_SIGMATEL=y +CONFIG_SND_HDA_CODEC_VIA=y +CONFIG_SND_HDA_CODEC_ATIHDMI=y +CONFIG_SND_HDA_CODEC_NVHDMI=y +CONFIG_SND_HDA_CODEC_INTELHDMI=y +CONFIG_SND_HDA_ELD=y +CONFIG_SND_HDA_CODEC_CIRRUS=y +CONFIG_SND_HDA_CODEC_CONEXANT=y +CONFIG_SND_HDA_CODEC_CA0110=y +CONFIG_SND_HDA_CODEC_CMEDIA=y +CONFIG_SND_HDA_CODEC_SI3054=y +CONFIG_SND_HDA_GENERIC=y +# CONFIG_SND_HDA_POWER_SAVE is not set +CONFIG_SND_HDSP=m +CONFIG_SND_HDSPM=m +CONFIG_SND_HIFIER=m +CONFIG_SND_ICE1712=m +CONFIG_SND_ICE1724=m +CONFIG_SND_INTEL8X0=m +CONFIG_SND_INTEL8X0M=m +CONFIG_SND_KORG1212=m +CONFIG_SND_LX6464ES=m +CONFIG_SND_MAESTRO3=m +CONFIG_SND_MIXART=m +CONFIG_SND_NM256=m +CONFIG_SND_PCXHR=m +CONFIG_SND_RIPTIDE=m +CONFIG_SND_RME32=m +CONFIG_SND_RME96=m +CONFIG_SND_RME9652=m +CONFIG_SND_SONICVIBES=m +CONFIG_SND_TRIDENT=m +CONFIG_SND_VIA82XX=m +CONFIG_SND_VIA82XX_MODEM=m +CONFIG_SND_VIRTUOSO=m +CONFIG_SND_VX222=m +CONFIG_SND_YMFPCI=m +CONFIG_SND_SPI=y +CONFIG_SND_USB=y +CONFIG_SND_USB_AUDIO=m +CONFIG_SND_USB_USX2Y=m +CONFIG_SND_USB_CAIAQ=m +# CONFIG_SND_USB_CAIAQ_INPUT is not set +CONFIG_SND_USB_US122L=m +CONFIG_SND_PCMCIA=y +CONFIG_SND_VXPOCKET=m +CONFIG_SND_PDAUDIOCF=m +CONFIG_SND_SOC=m +CONFIG_SND_SOC_I2C_AND_SPI=m +CONFIG_SND_SOC_ALL_CODECS=m +CONFIG_SND_SOC_WM_HUBS=m +CONFIG_SND_SOC_AD1836=m +CONFIG_SND_SOC_AD1938=m +CONFIG_SND_SOC_AD73311=m +CONFIG_SND_SOC_AK4104=m +CONFIG_SND_SOC_AK4535=m +CONFIG_SND_SOC_AK4642=m +CONFIG_SND_SOC_CS4270=m +CONFIG_SND_SOC_L3=m +CONFIG_SND_SOC_PCM3008=m +CONFIG_SND_SOC_SPDIF=m +CONFIG_SND_SOC_SSM2602=m +CONFIG_SND_SOC_TLV320AIC23=m +CONFIG_SND_SOC_TLV320AIC26=m +CONFIG_SND_SOC_TLV320AIC3X=m +CONFIG_SND_SOC_UDA134X=m +CONFIG_SND_SOC_UDA1380=m +CONFIG_SND_SOC_WM8400=m +CONFIG_SND_SOC_WM8510=m +CONFIG_SND_SOC_WM8523=m +CONFIG_SND_SOC_WM8580=m +CONFIG_SND_SOC_WM8728=m +CONFIG_SND_SOC_WM8731=m +CONFIG_SND_SOC_WM8750=m +CONFIG_SND_SOC_WM8753=m +CONFIG_SND_SOC_WM8776=m +CONFIG_SND_SOC_WM8900=m +CONFIG_SND_SOC_WM8903=m +CONFIG_SND_SOC_WM8940=m +CONFIG_SND_SOC_WM8960=m +CONFIG_SND_SOC_WM8961=m +CONFIG_SND_SOC_WM8971=m +CONFIG_SND_SOC_WM8974=m +CONFIG_SND_SOC_WM8988=m +CONFIG_SND_SOC_WM8990=m +CONFIG_SND_SOC_WM8993=m +CONFIG_SND_SOC_WM9081=m +CONFIG_SND_SOC_MAX9877=m +# CONFIG_SOUND_PRIME is not set +CONFIG_AC97_BUS=m +CONFIG_HID_SUPPORT=y +CONFIG_HID=m +CONFIG_HIDRAW=y + +# +# USB Input Devices +# +CONFIG_USB_HID=m +# CONFIG_HID_PID is not set +# CONFIG_USB_HIDDEV is not set + +# +# USB HID Boot Protocol drivers +# +CONFIG_USB_KBD=m +CONFIG_USB_MOUSE=m + +# +# Special HID drivers +# +# CONFIG_HID_A4TECH is not set +# CONFIG_HID_APPLE is not set +# CONFIG_HID_BELKIN is not set +# CONFIG_HID_CHERRY is not set +# CONFIG_HID_CHICONY is not set +# CONFIG_HID_CYPRESS is not set +# CONFIG_HID_DRAGONRISE is not set +# CONFIG_HID_EZKEY is not set +# CONFIG_HID_KYE is not set +# CONFIG_HID_GYRATION is not set +# CONFIG_HID_TWINHAN is not set +# CONFIG_HID_KENSINGTON is not set +# CONFIG_HID_LOGITECH is not set +# CONFIG_HID_MICROSOFT is not set +# CONFIG_HID_MONTEREY is not set +# CONFIG_HID_NTRIG is not set +# CONFIG_HID_PANTHERLORD is not set +# CONFIG_HID_PETALYNX is not set +# CONFIG_HID_SAMSUNG is not set +# CONFIG_HID_SONY is not set +# CONFIG_HID_SUNPLUS is not set +# CONFIG_HID_GREENASIA is not set +# CONFIG_HID_SMARTJOYPLUS is not set +# CONFIG_HID_TOPSEED is not set +# CONFIG_HID_THRUSTMASTER is not set +# CONFIG_HID_WACOM is not set +# CONFIG_HID_ZEROPLUS is not set +CONFIG_USB_SUPPORT=y 
+CONFIG_USB_ARCH_HAS_HCD=y +CONFIG_USB_ARCH_HAS_OHCI=y +CONFIG_USB_ARCH_HAS_EHCI=y +CONFIG_USB=m +# CONFIG_USB_DEBUG is not set +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y + +# +# Miscellaneous USB options +# +CONFIG_USB_DEVICEFS=y +CONFIG_USB_DEVICE_CLASS=y +# CONFIG_USB_DYNAMIC_MINORS is not set +# CONFIG_USB_SUSPEND is not set +# CONFIG_USB_OTG is not set +# CONFIG_USB_OTG_WHITELIST is not set +# CONFIG_USB_OTG_BLACKLIST_HUB is not set +CONFIG_USB_MON=m +CONFIG_USB_WUSB=m +CONFIG_USB_WUSB_CBAF=m +# CONFIG_USB_WUSB_CBAF_DEBUG is not set + +# +# USB Host Controller Drivers +# +CONFIG_USB_C67X00_HCD=m +CONFIG_USB_XHCI_HCD=m +# CONFIG_USB_XHCI_HCD_DEBUGGING is not set +CONFIG_USB_EHCI_HCD=m +# CONFIG_USB_EHCI_ROOT_HUB_TT is not set +# CONFIG_USB_EHCI_TT_NEWSCHED is not set +CONFIG_USB_OXU210HP_HCD=m +CONFIG_USB_ISP116X_HCD=m +CONFIG_USB_ISP1760_HCD=m +CONFIG_USB_ISP1362_HCD=m +CONFIG_USB_OHCI_HCD=m +CONFIG_USB_OHCI_HCD_SSB=y +# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set +# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set +CONFIG_USB_OHCI_LITTLE_ENDIAN=y +CONFIG_USB_UHCI_HCD=m +CONFIG_USB_U132_HCD=m +CONFIG_USB_SL811_HCD=m +CONFIG_USB_SL811_CS=m +CONFIG_USB_R8A66597_HCD=m +CONFIG_USB_WHCI_HCD=m +CONFIG_USB_HWA_HCD=m + +# +# Enable Host or Gadget support to see Inventra options +# + +# +# USB Device Class drivers +# +CONFIG_USB_ACM=m +CONFIG_USB_PRINTER=m +CONFIG_USB_WDM=m +CONFIG_USB_TMC=m + +# +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may +# + +# +# also be needed; see USB_STORAGE Help for more info +# +CONFIG_USB_STORAGE=m +# CONFIG_USB_STORAGE_DEBUG is not set +CONFIG_USB_STORAGE_DATAFAB=m +CONFIG_USB_STORAGE_FREECOM=m +CONFIG_USB_STORAGE_ISD200=m +CONFIG_USB_STORAGE_USBAT=m +CONFIG_USB_STORAGE_SDDR09=m +CONFIG_USB_STORAGE_SDDR55=m +CONFIG_USB_STORAGE_JUMPSHOT=m +CONFIG_USB_STORAGE_ALAUDA=m +CONFIG_USB_STORAGE_ONETOUCH=m +CONFIG_USB_STORAGE_KARMA=m +CONFIG_USB_STORAGE_CYPRESS_ATACB=m +CONFIG_USB_LIBUSUAL=y + +# +# USB Imaging devices +# +# CONFIG_USB_MDC800 is not set +# CONFIG_USB_MICROTEK is not set + +# +# USB port drivers +# +CONFIG_USB_USS720=m +CONFIG_USB_SERIAL=m +CONFIG_USB_EZUSB=y +CONFIG_USB_SERIAL_GENERIC=y +CONFIG_USB_SERIAL_AIRCABLE=m +CONFIG_USB_SERIAL_ARK3116=m +CONFIG_USB_SERIAL_BELKIN=m +CONFIG_USB_SERIAL_CH341=m +CONFIG_USB_SERIAL_WHITEHEAT=m +CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m +CONFIG_USB_SERIAL_CP210X=m +CONFIG_USB_SERIAL_CYPRESS_M8=m +CONFIG_USB_SERIAL_EMPEG=m +CONFIG_USB_SERIAL_FTDI_SIO=m +CONFIG_USB_SERIAL_FUNSOFT=m +CONFIG_USB_SERIAL_VISOR=m +CONFIG_USB_SERIAL_IPAQ=m +CONFIG_USB_SERIAL_IR=m +CONFIG_USB_SERIAL_EDGEPORT=m +CONFIG_USB_SERIAL_EDGEPORT_TI=m +CONFIG_USB_SERIAL_GARMIN=m +CONFIG_USB_SERIAL_IPW=m +CONFIG_USB_SERIAL_IUU=m +CONFIG_USB_SERIAL_KEYSPAN_PDA=m +CONFIG_USB_SERIAL_KEYSPAN=m +CONFIG_USB_SERIAL_KLSI=m +CONFIG_USB_SERIAL_KOBIL_SCT=m +CONFIG_USB_SERIAL_MCT_U232=m +CONFIG_USB_SERIAL_MOS7720=m +CONFIG_USB_SERIAL_MOS7840=m +CONFIG_USB_SERIAL_MOTOROLA=m +CONFIG_USB_SERIAL_NAVMAN=m +CONFIG_USB_SERIAL_PL2303=m +CONFIG_USB_SERIAL_OTI6858=m +CONFIG_USB_SERIAL_QUALCOMM=m +CONFIG_USB_SERIAL_SPCP8X5=m +CONFIG_USB_SERIAL_HP4X=m +CONFIG_USB_SERIAL_SAFE=m +CONFIG_USB_SERIAL_SAFE_PADDED=y +CONFIG_USB_SERIAL_SIEMENS_MPI=m +CONFIG_USB_SERIAL_SIERRAWIRELESS=m +CONFIG_USB_SERIAL_SYMBOL=m +CONFIG_USB_SERIAL_TI=m +CONFIG_USB_SERIAL_CYBERJACK=m +CONFIG_USB_SERIAL_XIRCOM=m +CONFIG_USB_SERIAL_OPTION=m +CONFIG_USB_SERIAL_OMNINET=m +CONFIG_USB_SERIAL_OPTICON=m +CONFIG_USB_SERIAL_DEBUG=m + +# +# USB Miscellaneous drivers +# +CONFIG_USB_EMI62=m +CONFIG_USB_EMI26=m +CONFIG_USB_ADUTUX=m 
+CONFIG_USB_SEVSEG=m +CONFIG_USB_RIO500=m +# CONFIG_USB_LEGOTOWER is not set +CONFIG_USB_LCD=m +# CONFIG_USB_BERRY_CHARGE is not set +CONFIG_USB_LED=m +CONFIG_USB_CYPRESS_CY7C63=m +CONFIG_USB_CYTHERM=m +CONFIG_USB_IDMOUSE=m +CONFIG_USB_FTDI_ELAN=m +# CONFIG_USB_APPLEDISPLAY is not set +CONFIG_USB_SISUSBVGA=m +CONFIG_USB_SISUSBVGA_CON=y +CONFIG_USB_LD=m +# CONFIG_USB_TRANCEVIBRATOR is not set +CONFIG_USB_IOWARRIOR=m +CONFIG_USB_TEST=m +CONFIG_USB_ISIGHTFW=m +# CONFIG_USB_VST is not set +CONFIG_USB_ATM=m +CONFIG_USB_SPEEDTOUCH=m +CONFIG_USB_CXACRU=m +CONFIG_USB_UEAGLEATM=m +CONFIG_USB_XUSBATM=m +# CONFIG_USB_GADGET is not set + +# +# OTG and related infrastructure +# +CONFIG_USB_OTG_UTILS=y +CONFIG_USB_GPIO_VBUS=m +CONFIG_NOP_USB_XCEIV=m +CONFIG_UWB=m +CONFIG_UWB_HWA=m +CONFIG_UWB_WHCI=m +CONFIG_UWB_WLP=m +CONFIG_UWB_I1480U=m +CONFIG_UWB_I1480U_WLP=m +CONFIG_MMC=m +# CONFIG_MMC_DEBUG is not set +# CONFIG_MMC_UNSAFE_RESUME is not set + +# +# MMC/SD/SDIO Card Drivers +# +CONFIG_MMC_BLOCK=m +CONFIG_MMC_BLOCK_BOUNCE=y +CONFIG_SDIO_UART=m +CONFIG_MMC_TEST=m + +# +# MMC/SD/SDIO Host Controller Drivers +# +CONFIG_MMC_SDHCI=m +CONFIG_MMC_SDHCI_PCI=m +CONFIG_MMC_RICOH_MMC=m +CONFIG_MMC_SDHCI_PLTFM=m +CONFIG_MMC_WBSD=m +# CONFIG_MMC_AT91 is not set +# CONFIG_MMC_ATMELMCI is not set +CONFIG_MMC_TIFM_SD=m +# CONFIG_MMC_SPI is not set +CONFIG_MMC_SDRICOH_CS=m +CONFIG_MMC_CB710=m +CONFIG_MMC_VIA_SDMMC=m +CONFIG_MEMSTICK=m +# CONFIG_MEMSTICK_DEBUG is not set + +# +# MemoryStick drivers +# +# CONFIG_MEMSTICK_UNSAFE_RESUME is not set +CONFIG_MSPRO_BLOCK=m + +# +# MemoryStick Host Controller Drivers +# +CONFIG_MEMSTICK_TIFM_MS=m +CONFIG_MEMSTICK_JMICRON_38X=m +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=m + +# +# LED drivers +# +CONFIG_LEDS_ALIX2=m +CONFIG_LEDS_PCA9532=m +CONFIG_LEDS_GPIO=m +CONFIG_LEDS_GPIO_PLATFORM=y +CONFIG_LEDS_LP3944=m +CONFIG_LEDS_CLEVO_MAIL=m +CONFIG_LEDS_PCA955X=m +CONFIG_LEDS_DAC124S085=m +CONFIG_LEDS_BD2802=m + +# +# LED Triggers +# +CONFIG_LEDS_TRIGGERS=y +CONFIG_LEDS_TRIGGER_TIMER=m +CONFIG_LEDS_TRIGGER_HEARTBEAT=m +CONFIG_LEDS_TRIGGER_BACKLIGHT=m +CONFIG_LEDS_TRIGGER_GPIO=m +CONFIG_LEDS_TRIGGER_DEFAULT_ON=m + +# +# iptables trigger is under Netfilter config (LED target) +# +CONFIG_ACCESSIBILITY=y +# CONFIG_A11Y_BRAILLE_CONSOLE is not set +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_USER_MAD=m +CONFIG_INFINIBAND_USER_ACCESS=m +CONFIG_INFINIBAND_USER_MEM=y +CONFIG_INFINIBAND_ADDR_TRANS=y +CONFIG_INFINIBAND_MTHCA=m +# CONFIG_INFINIBAND_MTHCA_DEBUG is not set +# CONFIG_INFINIBAND_IPATH is not set +CONFIG_INFINIBAND_AMSO1100=m +# CONFIG_INFINIBAND_AMSO1100_DEBUG is not set +CONFIG_INFINIBAND_CXGB3=m +# CONFIG_INFINIBAND_CXGB3_DEBUG is not set +CONFIG_MLX4_INFINIBAND=m +CONFIG_INFINIBAND_NES=m +# CONFIG_INFINIBAND_NES_DEBUG is not set +CONFIG_INFINIBAND_IPOIB=m +# CONFIG_INFINIBAND_IPOIB_CM is not set +# CONFIG_INFINIBAND_IPOIB_DEBUG is not set +CONFIG_INFINIBAND_SRP=m +CONFIG_INFINIBAND_ISER=m +# CONFIG_EDAC is not set +CONFIG_RTC_LIB=m +CONFIG_RTC_CLASS=m + +# +# RTC interfaces +# +CONFIG_RTC_INTF_SYSFS=y +CONFIG_RTC_INTF_PROC=y +CONFIG_RTC_INTF_DEV=y +CONFIG_RTC_INTF_DEV_UIE_EMUL=y +CONFIG_RTC_DRV_TEST=m + +# +# I2C RTC drivers +# +CONFIG_RTC_DRV_DS1307=m +CONFIG_RTC_DRV_DS1374=m +CONFIG_RTC_DRV_DS1672=m +CONFIG_RTC_DRV_MAX6900=m +CONFIG_RTC_DRV_RS5C372=m +CONFIG_RTC_DRV_ISL1208=m +CONFIG_RTC_DRV_X1205=m +CONFIG_RTC_DRV_PCF8563=m +CONFIG_RTC_DRV_PCF8583=m +CONFIG_RTC_DRV_M41T80=m +CONFIG_RTC_DRV_M41T80_WDT=y +CONFIG_RTC_DRV_S35390A=m +CONFIG_RTC_DRV_FM3130=m +CONFIG_RTC_DRV_RX8581=m 
+CONFIG_RTC_DRV_RX8025=m + +# +# SPI RTC drivers +# +CONFIG_RTC_DRV_M41T94=m +CONFIG_RTC_DRV_DS1305=m +CONFIG_RTC_DRV_DS1390=m +CONFIG_RTC_DRV_MAX6902=m +CONFIG_RTC_DRV_R9701=m +CONFIG_RTC_DRV_RS5C348=m +CONFIG_RTC_DRV_DS3234=m +CONFIG_RTC_DRV_PCF2123=m + +# +# Platform RTC drivers +# +CONFIG_RTC_DRV_CMOS=m +CONFIG_RTC_DRV_DS1286=m +CONFIG_RTC_DRV_DS1511=m +CONFIG_RTC_DRV_DS1553=m +CONFIG_RTC_DRV_DS1742=m +CONFIG_RTC_DRV_STK17TA8=m +CONFIG_RTC_DRV_M48T86=m +CONFIG_RTC_DRV_M48T35=m +CONFIG_RTC_DRV_M48T59=m +CONFIG_RTC_DRV_BQ4802=m +CONFIG_RTC_DRV_V3020=m +CONFIG_RTC_DRV_PCF50633=m + +# +# on-CPU RTC drivers +# +CONFIG_DMADEVICES=y + +# +# DMA Devices +# +CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH=y +CONFIG_INTEL_IOATDMA=m +CONFIG_DMA_ENGINE=y + +# +# DMA Clients +# +CONFIG_NET_DMA=y +# CONFIG_ASYNC_TX_DMA is not set +CONFIG_DMATEST=m +CONFIG_DCA=m +CONFIG_AUXDISPLAY=y +CONFIG_KS0108=m +CONFIG_KS0108_PORT=0x378 +CONFIG_KS0108_DELAY=2 +CONFIG_CFAG12864B=m +CONFIG_CFAG12864B_RATE=20 +CONFIG_UIO=m +CONFIG_UIO_CIF=m +CONFIG_UIO_PDRV=m +CONFIG_UIO_PDRV_GENIRQ=m +# CONFIG_UIO_SMX is not set +CONFIG_UIO_AEC=m +CONFIG_UIO_SERCOS3=m +# CONFIG_UIO_PCI_GENERIC is not set + +# +# TI VLYNQ +# +CONFIG_XEN_BALLOON=y +CONFIG_XEN_SCRUB_PAGES=y +CONFIG_XEN_DEV_EVTCHN=y +CONFIG_XEN_BACKEND=y +# CONFIG_XEN_NETDEV_BACKEND is not set +# CONFIG_XEN_BLKDEV_BACKEND is not set +# CONFIG_XEN_BLKDEV_TAP is not set +CONFIG_XEN_PCIDEV_BACKEND=y +CONFIG_XEN_PCIDEV_BACKEND_VPCI=y +# CONFIG_XEN_PCIDEV_BACKEND_PASS is not set +# CONFIG_XEN_PCIDEV_BACKEND_SLOT is not set +# CONFIG_XEN_PCIDEV_BACKEND_CONTROLLER is not set +# CONFIG_XEN_PCIDEV_BE_DEBUG is not set +CONFIG_XENFS=y +CONFIG_XEN_COMPAT_XENFS=y +CONFIG_XEN_SYS_HYPERVISOR=y +CONFIG_XEN_XENBUS_FRONTEND=y +# CONFIG_XEN_GNTDEV is not set +CONFIG_XEN_S3=y +CONFIG_ACPI_PROCESSOR_XEN=m +CONFIG_XEN_PLATFORM_PCI=m +CONFIG_STAGING=y +# CONFIG_STAGING_EXCLUDE_BUILD is not set +# CONFIG_ET131X is not set +# CONFIG_SLICOSS is not set +# CONFIG_VIDEO_GO7007 is not set +# CONFIG_VIDEO_CX25821 is not set +# CONFIG_USB_IP_COMMON is not set +# CONFIG_W35UND is not set +# CONFIG_PRISM2_USB is not set +# CONFIG_ECHO is not set +# CONFIG_POCH is not set +# CONFIG_OTUS is not set +# CONFIG_RT2860 is not set +# CONFIG_RT2870 is not set +# CONFIG_RT3090 is not set +# CONFIG_COMEDI is not set +# CONFIG_ASUS_OLED is not set +# CONFIG_PANEL is not set +# CONFIG_ALTERA_PCIE_CHDMA is not set +# CONFIG_RTL8187SE is not set +# CONFIG_RTL8192SU is not set +# CONFIG_RTL8192E is not set +# CONFIG_TRANZPORT is not set + +# +# Android +# + +# +# Qualcomm MSM Camera And Video +# + +# +# Camera Sensor Selection +# +# CONFIG_INPUT_GPIO is not set +# CONFIG_DST is not set +# CONFIG_POHMELFS is not set +# CONFIG_B3DFG is not set +# CONFIG_IDE_PHISON is not set +# CONFIG_PLAN9AUTH is not set +# CONFIG_LINE6_USB is not set +# CONFIG_DRM_RADEON_KMS is not set +# CONFIG_USB_SERIAL_QUATECH2 is not set +# CONFIG_USB_SERIAL_QUATECH_USB2 is not set +# CONFIG_VT6655 is not set +# CONFIG_VT6656 is not set +# CONFIG_FB_UDL is not set +CONFIG_HYPERV=m +CONFIG_HYPERV_STORAGE=m +CONFIG_HYPERV_BLOCK=m +CONFIG_HYPERV_NET=m +# CONFIG_VME_BUS is not set + +# +# RAR Register Driver +# +# CONFIG_RAR_REGISTER is not set +# CONFIG_IIO is not set +CONFIG_X86_PLATFORM_DEVICES=y +CONFIG_ACER_WMI=m +CONFIG_ASUS_LAPTOP=m +CONFIG_DELL_WMI=m +CONFIG_FUJITSU_LAPTOP=m +# CONFIG_FUJITSU_LAPTOP_DEBUG is not set +CONFIG_HP_WMI=m +CONFIG_MSI_LAPTOP=m +CONFIG_PANASONIC_LAPTOP=m +CONFIG_COMPAL_LAPTOP=m +CONFIG_SONY_LAPTOP=m +# CONFIG_SONYPI_COMPAT 
is not set +CONFIG_THINKPAD_ACPI=m +# CONFIG_THINKPAD_ACPI_DEBUGFACILITIES is not set +# CONFIG_THINKPAD_ACPI_DEBUG is not set +# CONFIG_THINKPAD_ACPI_UNSAFE_LEDS is not set +CONFIG_THINKPAD_ACPI_VIDEO=y +CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y +CONFIG_INTEL_MENLOW=m +CONFIG_EEEPC_LAPTOP=m +CONFIG_ACPI_WMI=m +CONFIG_ACPI_ASUS=m +# CONFIG_TOPSTAR_LAPTOP is not set +CONFIG_ACPI_TOSHIBA=m + +# +# Firmware Drivers +# +CONFIG_EDD=m +# CONFIG_EDD_OFF is not set +CONFIG_FIRMWARE_MEMMAP=y +CONFIG_DELL_RBU=m +CONFIG_DCDBAS=m +CONFIG_DMIID=y +# CONFIG_ISCSI_IBFT_FIND is not set + +# +# File systems +# +CONFIG_EXT2_FS=m +CONFIG_EXT2_FS_XATTR=y +CONFIG_EXT2_FS_POSIX_ACL=y +CONFIG_EXT2_FS_SECURITY=y +CONFIG_EXT2_FS_XIP=y +CONFIG_EXT3_FS=m +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set +CONFIG_EXT3_FS_XATTR=y +CONFIG_EXT3_FS_POSIX_ACL=y +CONFIG_EXT3_FS_SECURITY=y +CONFIG_EXT4_FS=m +CONFIG_EXT4_FS_XATTR=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +# CONFIG_EXT4_DEBUG is not set +CONFIG_FS_XIP=y +CONFIG_JBD=m +# CONFIG_JBD_DEBUG is not set +CONFIG_JBD2=m +# CONFIG_JBD2_DEBUG is not set +CONFIG_FS_MBCACHE=m +CONFIG_REISERFS_FS=m +# CONFIG_REISERFS_CHECK is not set +CONFIG_REISERFS_PROC_INFO=y +CONFIG_REISERFS_FS_XATTR=y +CONFIG_REISERFS_FS_POSIX_ACL=y +# CONFIG_REISERFS_FS_SECURITY is not set +CONFIG_JFS_FS=m +CONFIG_JFS_POSIX_ACL=y +CONFIG_JFS_SECURITY=y +# CONFIG_JFS_DEBUG is not set +CONFIG_JFS_STATISTICS=y +CONFIG_FS_POSIX_ACL=y +CONFIG_XFS_FS=m +CONFIG_XFS_QUOTA=y +CONFIG_XFS_POSIX_ACL=y +CONFIG_XFS_RT=y +# CONFIG_XFS_DEBUG is not set +CONFIG_GFS2_FS=m +CONFIG_GFS2_FS_LOCKING_DLM=y +CONFIG_OCFS2_FS=m +CONFIG_OCFS2_FS_O2CB=m +CONFIG_OCFS2_FS_USERSPACE_CLUSTER=m +CONFIG_OCFS2_FS_STATS=y +CONFIG_OCFS2_DEBUG_MASKLOG=y +# CONFIG_OCFS2_DEBUG_FS is not set +# CONFIG_OCFS2_FS_POSIX_ACL is not set +CONFIG_BTRFS_FS=m +CONFIG_BTRFS_FS_POSIX_ACL=y +CONFIG_NILFS2_FS=m +CONFIG_FILE_LOCKING=y +CONFIG_FSNOTIFY=y +# CONFIG_DNOTIFY is not set +CONFIG_INOTIFY=y +CONFIG_INOTIFY_USER=y +CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +CONFIG_QUOTA_TREE=m +CONFIG_QFMT_V1=m +CONFIG_QFMT_V2=m +CONFIG_QUOTACTL=y +CONFIG_AUTOFS_FS=m +CONFIG_AUTOFS4_FS=m +CONFIG_FUSE_FS=m +# CONFIG_CUSE is not set + +# +# Caches +# +CONFIG_FSCACHE=m +CONFIG_FSCACHE_STATS=y +CONFIG_FSCACHE_HISTOGRAM=y +# CONFIG_FSCACHE_DEBUG is not set +# CONFIG_FSCACHE_OBJECT_LIST is not set +CONFIG_CACHEFILES=m +# CONFIG_CACHEFILES_DEBUG is not set +# CONFIG_CACHEFILES_HISTOGRAM is not set + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=m +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=m +CONFIG_UDF_NLS=y + +# +# DOS/FAT/NT Filesystems +# +CONFIG_FAT_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=437 +CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" +CONFIG_NTFS_FS=m +# CONFIG_NTFS_DEBUG is not set +CONFIG_NTFS_RW=y + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +# CONFIG_PROC_KCORE is not set +CONFIG_PROC_SYSCTL=y +CONFIG_PROC_PAGE_MONITOR=y +CONFIG_SYSFS=y +CONFIG_TMPFS=y +# CONFIG_TMPFS_POSIX_ACL is not set +# CONFIG_HUGETLBFS is not set +# CONFIG_HUGETLB_PAGE is not set +CONFIG_CONFIGFS_FS=m +CONFIG_MISC_FILESYSTEMS=y +# CONFIG_ADFS_FS is not set +# CONFIG_AFFS_FS is not set +CONFIG_ECRYPT_FS=m +CONFIG_HFS_FS=m +CONFIG_HFSPLUS_FS=m +# CONFIG_BEFS_FS is not set +# CONFIG_BFS_FS is not set +CONFIG_EFS_FS=m +CONFIG_JFFS2_FS=m +CONFIG_JFFS2_FS_DEBUG=0 +CONFIG_JFFS2_FS_WRITEBUFFER=y +# CONFIG_JFFS2_FS_WBUF_VERIFY is not set +CONFIG_JFFS2_SUMMARY=y +CONFIG_JFFS2_FS_XATTR=y 
+CONFIG_JFFS2_FS_POSIX_ACL=y +CONFIG_JFFS2_FS_SECURITY=y +CONFIG_JFFS2_COMPRESSION_OPTIONS=y +CONFIG_JFFS2_ZLIB=y +CONFIG_JFFS2_LZO=y +CONFIG_JFFS2_RTIME=y +CONFIG_JFFS2_RUBIN=y +# CONFIG_JFFS2_CMODE_NONE is not set +CONFIG_JFFS2_CMODE_PRIORITY=y +# CONFIG_JFFS2_CMODE_SIZE is not set +# CONFIG_JFFS2_CMODE_FAVOURLZO is not set +CONFIG_UBIFS_FS=m +# CONFIG_UBIFS_FS_XATTR is not set +# CONFIG_UBIFS_FS_ADVANCED_COMPR is not set +CONFIG_UBIFS_FS_LZO=y +CONFIG_UBIFS_FS_ZLIB=y +# CONFIG_UBIFS_FS_DEBUG is not set +CONFIG_CRAMFS=m +CONFIG_SQUASHFS=m +# CONFIG_SQUASHFS_EMBEDDED is not set +CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 +# CONFIG_VXFS_FS is not set +CONFIG_MINIX_FS=m +CONFIG_OMFS_FS=m +CONFIG_HPFS_FS=m +# CONFIG_QNX4FS_FS is not set +CONFIG_ROMFS_FS=m +CONFIG_ROMFS_BACKED_BY_BLOCK=y +# CONFIG_ROMFS_BACKED_BY_MTD is not set +# CONFIG_ROMFS_BACKED_BY_BOTH is not set +CONFIG_ROMFS_ON_BLOCK=y +CONFIG_SYSV_FS=m +CONFIG_UFS_FS=m +# CONFIG_UFS_FS_WRITE is not set +# CONFIG_UFS_DEBUG is not set +CONFIG_EXOFS_FS=m +# CONFIG_EXOFS_DEBUG is not set +CONFIG_NETWORK_FILESYSTEMS=y +CONFIG_NFS_FS=m +CONFIG_NFS_V3=y +# CONFIG_NFS_V3_ACL is not set +CONFIG_NFS_V4=y +# CONFIG_NFS_V4_1 is not set +# CONFIG_NFS_FSCACHE is not set +CONFIG_NFSD=m +CONFIG_NFSD_V3=y +# CONFIG_NFSD_V3_ACL is not set +CONFIG_NFSD_V4=y +CONFIG_LOCKD=m +CONFIG_LOCKD_V4=y +CONFIG_EXPORTFS=m +CONFIG_NFS_COMMON=y +CONFIG_SUNRPC=m +CONFIG_SUNRPC_GSS=m +CONFIG_SUNRPC_XPRT_RDMA=m +CONFIG_RPCSEC_GSS_KRB5=m +# CONFIG_RPCSEC_GSS_SPKM3 is not set +# CONFIG_SMB_FS is not set +CONFIG_CIFS=m +# CONFIG_CIFS_STATS is not set +# CONFIG_CIFS_WEAK_PW_HASH is not set +# CONFIG_CIFS_UPCALL is not set +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_POSIX=y +# CONFIG_CIFS_DEBUG2 is not set +CONFIG_CIFS_DFS_UPCALL=y +CONFIG_CIFS_EXPERIMENTAL=y +# CONFIG_NCP_FS is not set +# CONFIG_CODA_FS is not set +# CONFIG_AFS_FS is not set +# CONFIG_9P_FS is not set + +# +# Partition Types +# +CONFIG_PARTITION_ADVANCED=y +# CONFIG_ACORN_PARTITION is not set +# CONFIG_OSF_PARTITION is not set +# CONFIG_AMIGA_PARTITION is not set +# CONFIG_ATARI_PARTITION is not set +# CONFIG_MAC_PARTITION is not set +CONFIG_MSDOS_PARTITION=y +# CONFIG_BSD_DISKLABEL is not set +# CONFIG_MINIX_SUBPARTITION is not set +# CONFIG_SOLARIS_X86_PARTITION is not set +# CONFIG_UNIXWARE_DISKLABEL is not set +# CONFIG_LDM_PARTITION is not set +# CONFIG_SGI_PARTITION is not set +# CONFIG_ULTRIX_PARTITION is not set +# CONFIG_SUN_PARTITION is not set +# CONFIG_KARMA_PARTITION is not set +CONFIG_EFI_PARTITION=y +# CONFIG_SYSV68_PARTITION is not set +CONFIG_NLS=m +CONFIG_NLS_DEFAULT="iso8859-1" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_737=m +CONFIG_NLS_CODEPAGE_775=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_CODEPAGE_852=m +CONFIG_NLS_CODEPAGE_855=m +CONFIG_NLS_CODEPAGE_857=m +CONFIG_NLS_CODEPAGE_860=m +CONFIG_NLS_CODEPAGE_861=m +CONFIG_NLS_CODEPAGE_862=m +CONFIG_NLS_CODEPAGE_863=m +CONFIG_NLS_CODEPAGE_864=m +CONFIG_NLS_CODEPAGE_865=m +CONFIG_NLS_CODEPAGE_866=m +CONFIG_NLS_CODEPAGE_869=m +CONFIG_NLS_CODEPAGE_936=m +CONFIG_NLS_CODEPAGE_950=m +CONFIG_NLS_CODEPAGE_932=m +CONFIG_NLS_CODEPAGE_949=m +CONFIG_NLS_CODEPAGE_874=m +CONFIG_NLS_ISO8859_8=m +CONFIG_NLS_CODEPAGE_1250=m +CONFIG_NLS_CODEPAGE_1251=m +CONFIG_NLS_ASCII=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_2=m +CONFIG_NLS_ISO8859_3=m +CONFIG_NLS_ISO8859_4=m +CONFIG_NLS_ISO8859_5=m +CONFIG_NLS_ISO8859_6=m +CONFIG_NLS_ISO8859_7=m +CONFIG_NLS_ISO8859_9=m +CONFIG_NLS_ISO8859_13=m +CONFIG_NLS_ISO8859_14=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_KOI8_R=m 
+CONFIG_NLS_KOI8_U=m +CONFIG_NLS_UTF8=m +CONFIG_DLM=m +# CONFIG_DLM_DEBUG is not set + +# +# Kernel hacking +# +CONFIG_TRACE_IRQFLAGS_SUPPORT=y +# CONFIG_PRINTK_TIME is not set +CONFIG_ENABLE_WARN_DEPRECATED=y +# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_FRAME_WARN=1024 +# CONFIG_MAGIC_SYSRQ is not set +# CONFIG_STRIP_ASM_SYMS is not set +# CONFIG_UNUSED_SYMBOLS is not set +CONFIG_DEBUG_FS=y +# CONFIG_HEADERS_CHECK is not set +# CONFIG_DEBUG_KERNEL is not set +# CONFIG_DEBUG_BUGVERBOSE is not set +# CONFIG_DEBUG_MEMORY_INIT is not set +CONFIG_ARCH_WANT_FRAME_POINTERS=y +CONFIG_FRAME_POINTER=y +# CONFIG_RCU_CPU_STALL_DETECTOR is not set +# CONFIG_LATENCYTOP is not set +CONFIG_SYSCTL_SYSCALL_CHECK=y +CONFIG_USER_STACKTRACE_SUPPORT=y +CONFIG_HAVE_FUNCTION_TRACER=y +CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y +CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST=y +CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y +CONFIG_RING_BUFFER=y +CONFIG_RING_BUFFER_ALLOW_SWAP=y +CONFIG_TRACING_SUPPORT=y +# CONFIG_FTRACE is not set +# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set +# CONFIG_FIREWIRE_OHCI_REMOTE_DMA is not set +# CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set +# CONFIG_SAMPLES is not set +CONFIG_HAVE_ARCH_KGDB=y +CONFIG_HAVE_ARCH_KMEMCHECK=y +CONFIG_STRICT_DEVMEM=y +# CONFIG_X86_VERBOSE_BOOTUP is not set +# CONFIG_EARLY_PRINTK is not set +# CONFIG_IOMMU_STRESS is not set +CONFIG_HAVE_MMIOTRACE_SUPPORT=y +CONFIG_IO_DELAY_TYPE_0X80=0 +CONFIG_IO_DELAY_TYPE_0XED=1 +CONFIG_IO_DELAY_TYPE_UDELAY=2 +CONFIG_IO_DELAY_TYPE_NONE=3 +CONFIG_IO_DELAY_0X80=y +# CONFIG_IO_DELAY_0XED is not set +# CONFIG_IO_DELAY_UDELAY is not set +# CONFIG_IO_DELAY_NONE is not set +CONFIG_DEFAULT_IO_DELAY_TYPE=0 +# CONFIG_OPTIMIZE_INLINING is not set + +# +# Security options +# +CONFIG_KEYS=y +# CONFIG_KEYS_DEBUG_PROC_KEYS is not set +CONFIG_SECURITY=y +CONFIG_SECURITYFS=y +# CONFIG_SECURITY_NETWORK is not set +# CONFIG_SECURITY_PATH is not set +# CONFIG_SECURITY_FILE_CAPABILITIES is not set +# CONFIG_SECURITY_TOMOYO is not set +# CONFIG_IMA is not set +CONFIG_XOR_BLOCKS=m +CONFIG_ASYNC_CORE=m +CONFIG_ASYNC_MEMCPY=m +CONFIG_ASYNC_XOR=m +CONFIG_ASYNC_PQ=m +CONFIG_ASYNC_RAID6_RECOV=m +CONFIG_ASYNC_TX_DISABLE_PQ_VAL_DMA=y +CONFIG_ASYNC_TX_DISABLE_XOR_VAL_DMA=y +CONFIG_CRYPTO=y + +# +# Crypto core or helper +# +CONFIG_CRYPTO_FIPS=y +CONFIG_CRYPTO_ALGAPI=y +CONFIG_CRYPTO_ALGAPI2=y +CONFIG_CRYPTO_AEAD=m +CONFIG_CRYPTO_AEAD2=y +CONFIG_CRYPTO_BLKCIPHER=m +CONFIG_CRYPTO_BLKCIPHER2=y +CONFIG_CRYPTO_HASH=y +CONFIG_CRYPTO_HASH2=y +CONFIG_CRYPTO_RNG=m +CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_PCOMP=y +CONFIG_CRYPTO_MANAGER=m +CONFIG_CRYPTO_MANAGER2=y +CONFIG_CRYPTO_GF128MUL=m +CONFIG_CRYPTO_NULL=m +CONFIG_CRYPTO_WORKQUEUE=y +CONFIG_CRYPTO_CRYPTD=m +CONFIG_CRYPTO_AUTHENC=m +CONFIG_CRYPTO_TEST=m + +# +# Authenticated Encryption with Associated Data +# +CONFIG_CRYPTO_CCM=m +CONFIG_CRYPTO_GCM=m +CONFIG_CRYPTO_SEQIV=m + +# +# Block modes +# +CONFIG_CRYPTO_CBC=m +CONFIG_CRYPTO_CTR=m +CONFIG_CRYPTO_CTS=m +CONFIG_CRYPTO_ECB=m +CONFIG_CRYPTO_LRW=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m + +# +# Hash modes +# +CONFIG_CRYPTO_HMAC=m +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_VMAC=m + +# +# Digest +# +CONFIG_CRYPTO_CRC32C=m +CONFIG_CRYPTO_CRC32C_INTEL=m +CONFIG_CRYPTO_GHASH=m +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD128=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_RMD256=m +CONFIG_CRYPTO_RMD320=m +CONFIG_CRYPTO_SHA1=m 
+CONFIG_CRYPTO_SHA256=y +CONFIG_CRYPTO_SHA512=m +CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_WP512=m + +# +# Ciphers +# +CONFIG_CRYPTO_AES=m +# CONFIG_CRYPTO_AES_X86_64 is not set +# CONFIG_CRYPTO_AES_NI_INTEL is not set +CONFIG_CRYPTO_ANUBIS=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_BLOWFISH=m +CONFIG_CRYPTO_CAMELLIA=m +CONFIG_CRYPTO_CAST5=m +CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_DES=m +CONFIG_CRYPTO_FCRYPT=m +CONFIG_CRYPTO_KHAZAD=m +CONFIG_CRYPTO_SALSA20=m +# CONFIG_CRYPTO_SALSA20_X86_64 is not set +CONFIG_CRYPTO_SEED=m +CONFIG_CRYPTO_SERPENT=m +CONFIG_CRYPTO_TEA=m +CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_TWOFISH_COMMON=m +# CONFIG_CRYPTO_TWOFISH_X86_64 is not set + +# +# Compression +# +CONFIG_CRYPTO_DEFLATE=m +CONFIG_CRYPTO_ZLIB=m +CONFIG_CRYPTO_LZO=m + +# +# Random Number Generation +# +CONFIG_CRYPTO_ANSI_CPRNG=m +CONFIG_CRYPTO_HW=y +CONFIG_CRYPTO_DEV_PADLOCK=m +CONFIG_CRYPTO_DEV_PADLOCK_AES=m +CONFIG_CRYPTO_DEV_PADLOCK_SHA=m +CONFIG_CRYPTO_DEV_HIFN_795X=m +CONFIG_CRYPTO_DEV_HIFN_795X_RNG=y +CONFIG_HAVE_KVM=y +CONFIG_HAVE_KVM_IRQCHIP=y +CONFIG_HAVE_KVM_EVENTFD=y +CONFIG_KVM_APIC_ARCHITECTURE=y +CONFIG_VIRTUALIZATION=y +CONFIG_KVM=m +CONFIG_KVM_INTEL=m +CONFIG_KVM_AMD=m +CONFIG_VIRTIO=m +CONFIG_VIRTIO_RING=m +CONFIG_VIRTIO_PCI=m +CONFIG_VIRTIO_BALLOON=m +# CONFIG_BINARY_PRINTF is not set + +# +# Library routines +# +CONFIG_BITREVERSE=y +CONFIG_GENERIC_FIND_FIRST_BIT=y +CONFIG_GENERIC_FIND_NEXT_BIT=y +CONFIG_GENERIC_FIND_LAST_BIT=y +CONFIG_CRC_CCITT=m +CONFIG_CRC16=m +CONFIG_CRC_T10DIF=m +CONFIG_CRC_ITU_T=m +CONFIG_CRC32=y +CONFIG_CRC7=m +CONFIG_LIBCRC32C=m +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=m +CONFIG_LZO_COMPRESS=m +CONFIG_LZO_DECOMPRESS=m +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y +CONFIG_GENERIC_ALLOCATOR=y +CONFIG_REED_SOLOMON=m +CONFIG_REED_SOLOMON_DEC16=y +CONFIG_TEXTSEARCH=y +CONFIG_TEXTSEARCH_KMP=m +CONFIG_TEXTSEARCH_BM=m +CONFIG_TEXTSEARCH_FSM=m +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT=y +CONFIG_HAS_DMA=y +CONFIG_CHECK_SIGNATURE=y +CONFIG_NLATTR=y diff --git a/testing/linux-xen0/pvops.patch b/testing/linux-xen0/pvops.patch new file mode 100644 index 000000000..49969705b --- /dev/null +++ b/testing/linux-xen0/pvops.patch @@ -0,0 +1,37837 @@ +diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt +index 5f6aa11..9ec8558 100644 +--- a/Documentation/kernel-parameters.txt ++++ b/Documentation/kernel-parameters.txt +@@ -113,6 +113,7 @@ parameter is applicable: + More X86-64 boot options can be found in + Documentation/x86/x86_64/boot-options.txt . + X86 Either 32bit or 64bit x86 (same as X86-32+X86-64) ++ XEN Xen support is enabled + + In addition, the following text indicates that the option: + +@@ -2760,6 +2761,18 @@ and is between 256 and 4096 characters. It is defined in the file + xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks. + xd_geo= See header of drivers/block/xd.c. 
+ ++ xen_emul_unplug= [HW,X86,XEN] ++ Unplug Xen emulated devices ++ Format: [unplug0,][unplug1] ++ ide-disks -- unplug primary master IDE devices ++ aux-ide-disks -- unplug non-primary-master IDE devices ++ nics -- unplug network devices ++ all -- unplug all emulated devices (NICs and IDE disks) ++ unnecessary -- unplugging emulated devices is ++ unnecessary even if the host did not respond to ++ the unplug protocol ++ never -- do not unplug even if version check succeeds ++ + xirc2ps_cs= [NET,PCMCIA] + Format: + <irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]] +diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt +index 29a6ff8..81f9b94 100644 +--- a/Documentation/x86/x86_64/boot-options.txt ++++ b/Documentation/x86/x86_64/boot-options.txt +@@ -267,10 +267,14 @@ IOMMU (input/output memory management unit) + + iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU + implementation: +- swiotlb=<pages>[,force] ++ swiotlb=[npages=<pages>] ++ swiotlb=[force] ++ swiotlb=[overflow=<size>] ++ + <pages> Prereserve that many 128K pages for the software IO + bounce buffering. + force Force all IO through the software TLB. ++ <size> Size in bytes of the overflow buffer. + + Settings for the IBM Calgary hardware IOMMU currently found in IBM + pSeries and xSeries machines: +diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h +index 8d3c79c..7d09a09 100644 +--- a/arch/ia64/include/asm/dma-mapping.h ++++ b/arch/ia64/include/asm/dma-mapping.h +@@ -73,7 +73,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) + if (!dev->dma_mask) + return 0; + +- return addr + size <= *dev->dma_mask; ++ return addr + size - 1 <= *dev->dma_mask; + } + + static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +diff --git a/arch/ia64/include/asm/swiotlb.h b/arch/ia64/include/asm/swiotlb.h +index dcbaea7..f0acde6 100644 +--- a/arch/ia64/include/asm/swiotlb.h ++++ b/arch/ia64/include/asm/swiotlb.h +@@ -4,8 +4,6 @@ + #include <linux/dma-mapping.h> + #include <linux/swiotlb.h> + +-extern int swiotlb_force; +- + #ifdef CONFIG_SWIOTLB + extern int swiotlb; + extern void pci_swiotlb_init(void); +diff --git a/arch/ia64/include/asm/xen/events.h b/arch/ia64/include/asm/xen/events.h +index b8370c8..baa74c8 100644 +--- a/arch/ia64/include/asm/xen/events.h ++++ b/arch/ia64/include/asm/xen/events.h +@@ -36,10 +36,6 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) + return !(ia64_psr(regs)->i); + } + +-static inline void handle_irq(int irq, struct pt_regs *regs) +-{ +- __do_IRQ(irq); +-} + #define irq_ctx_init(cpu) do { } while (0) + + #endif /* _ASM_IA64_XEN_EVENTS_H */ +diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c +index 285aae8..53292ab 100644 +--- a/arch/ia64/kernel/pci-swiotlb.c ++++ b/arch/ia64/kernel/pci-swiotlb.c +@@ -41,7 +41,7 @@ struct dma_map_ops swiotlb_dma_ops = { + void __init swiotlb_dma_init(void) + { + dma_ops = &swiotlb_dma_ops; +- swiotlb_init(); ++ swiotlb_init(1); + } + + void __init pci_swiotlb_init(void) +@@ -51,7 +51,7 @@ void __init pci_swiotlb_init(void) + swiotlb = 1; + printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n"); + machvec_init("dig"); +- swiotlb_init(); ++ swiotlb_init(1); + dma_ops = &swiotlb_dma_ops; + #else + panic("Unable to find Intel IOMMU"); +diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h +index 
e281dae..80a973b 100644 +--- a/arch/powerpc/include/asm/dma-mapping.h ++++ b/arch/powerpc/include/asm/dma-mapping.h +@@ -197,7 +197,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) + if (!dev->dma_mask) + return 0; + +- return addr + size <= *dev->dma_mask; ++ return addr + size - 1 <= *dev->dma_mask; + } + + static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c +index 53bcf3d..b152de3 100644 +--- a/arch/powerpc/kernel/setup_32.c ++++ b/arch/powerpc/kernel/setup_32.c +@@ -345,7 +345,7 @@ void __init setup_arch(char **cmdline_p) + + #ifdef CONFIG_SWIOTLB + if (ppc_swiotlb_enable) +- swiotlb_init(); ++ swiotlb_init(1); + #endif + + paging_init(); +diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c +index 04f638d..df2c9e9 100644 +--- a/arch/powerpc/kernel/setup_64.c ++++ b/arch/powerpc/kernel/setup_64.c +@@ -550,7 +550,7 @@ void __init setup_arch(char **cmdline_p) + + #ifdef CONFIG_SWIOTLB + if (ppc_swiotlb_enable) +- swiotlb_init(); ++ swiotlb_init(1); + #endif + + paging_init(); +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index cb5a57c..a3b7475 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1885,6 +1885,10 @@ config PCI_OLPC + def_bool y + depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY) + ++config PCI_XEN ++ bool ++ select SWIOTLB ++ + config PCI_DOMAINS + def_bool y + depends on PCI +diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h +index 18aa3f8..4413ba4 100644 +--- a/arch/x86/include/asm/amd_iommu.h ++++ b/arch/x86/include/asm/amd_iommu.h +@@ -23,20 +23,16 @@ + #include <linux/irqreturn.h> + + #ifdef CONFIG_AMD_IOMMU +-extern int amd_iommu_init(void); + extern int amd_iommu_init_dma_ops(void); + extern int amd_iommu_init_passthrough(void); + extern void amd_iommu_detect(void); + extern irqreturn_t amd_iommu_int_handler(int irq, void *data); + extern void amd_iommu_flush_all_domains(void); + extern void amd_iommu_flush_all_devices(void); +-extern void amd_iommu_shutdown(void); + extern void amd_iommu_apply_erratum_63(u16 devid); + extern void amd_iommu_init_api(void); + #else +-static inline int amd_iommu_init(void) { return -ENODEV; } + static inline void amd_iommu_detect(void) { } +-static inline void amd_iommu_shutdown(void) { } + #endif + + #endif /* _ASM_X86_AMD_IOMMU_H */ +diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h +index b03bedb..0918654 100644 +--- a/arch/x86/include/asm/calgary.h ++++ b/arch/x86/include/asm/calgary.h +@@ -62,10 +62,8 @@ struct cal_chipset_ops { + extern int use_calgary; + + #ifdef CONFIG_CALGARY_IOMMU +-extern int calgary_iommu_init(void); + extern void detect_calgary(void); + #else +-static inline int calgary_iommu_init(void) { return 1; } + static inline void detect_calgary(void) { return; } + #endif + +diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h +index 6a25d5d..ac91eed 100644 +--- a/arch/x86/include/asm/dma-mapping.h ++++ b/arch/x86/include/asm/dma-mapping.h +@@ -20,7 +20,8 @@ + # define ISA_DMA_BIT_MASK DMA_BIT_MASK(32) + #endif + +-extern dma_addr_t bad_dma_address; ++#define DMA_ERROR_CODE 0 ++ + extern int iommu_merge; + extern struct device x86_dma_fallback_dev; + extern int panic_on_overflow; +@@ -48,7 +49,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) + if (ops->mapping_error) + return ops->mapping_error(dev, dma_addr); + +- 
return (dma_addr == bad_dma_address); ++ return (dma_addr == DMA_ERROR_CODE); + } + + #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) +@@ -66,7 +67,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) + if (!dev->dma_mask) + return 0; + +- return addr + size <= *dev->dma_mask; ++ return addr + size - 1 <= *dev->dma_mask; + } + + static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h +index 40b4e61..fa3fd43 100644 +--- a/arch/x86/include/asm/e820.h ++++ b/arch/x86/include/asm/e820.h +@@ -109,6 +109,8 @@ extern void reserve_early(u64 start, u64 end, char *name); + extern void reserve_early_overlap_ok(u64 start, u64 end, char *name); + extern void free_early(u64 start, u64 end); + extern void early_res_to_bootmem(u64 start, u64 end); ++extern u64 early_res_next_free(u64 start); ++extern u64 early_res_next_reserved(u64 addr, u64 max); + extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); + + extern unsigned long e820_end_of_ram_pfn(void); +diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h +index 6cfdafa..4ac5b0f 100644 +--- a/arch/x86/include/asm/gart.h ++++ b/arch/x86/include/asm/gart.h +@@ -35,8 +35,7 @@ extern int gart_iommu_aperture_allowed; + extern int gart_iommu_aperture_disabled; + + extern void early_gart_iommu_check(void); +-extern void gart_iommu_init(void); +-extern void gart_iommu_shutdown(void); ++extern int gart_iommu_init(void); + extern void __init gart_parse_options(char *); + extern void gart_iommu_hole_init(void); + +@@ -48,12 +47,6 @@ extern void gart_iommu_hole_init(void); + static inline void early_gart_iommu_check(void) + { + } +-static inline void gart_iommu_init(void) +-{ +-} +-static inline void gart_iommu_shutdown(void) +-{ +-} + static inline void gart_parse_options(char *options) + { + } +diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h +index 3251e23..fa152cb 100644 +--- a/arch/x86/include/asm/hpet.h ++++ b/arch/x86/include/asm/hpet.h +@@ -68,6 +68,7 @@ extern unsigned long force_hpet_address; + extern int hpet_force_user; + extern u8 hpet_msi_disable; + extern int is_hpet_enabled(void); ++extern int disable_hpet(char *); + extern int hpet_enable(void); + extern void hpet_disable(void); + extern unsigned long hpet_readl(unsigned long a); +@@ -108,6 +109,7 @@ extern void hpet_unregister_irq_handler(rtc_irq_handler handler); + #else /* CONFIG_HPET_TIMER */ + + static inline int hpet_enable(void) { return 0; } ++static inline int disable_hpet(char *s) { return 0; } + static inline int is_hpet_enabled(void) { return 0; } + #define hpet_readl(a) 0 + +diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h +index 439a9ac..bf88684 100644 +--- a/arch/x86/include/asm/hugetlb.h ++++ b/arch/x86/include/asm/hugetlb.h +@@ -36,16 +36,28 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, + free_pgd_range(tlb, addr, end, floor, ceiling); + } + ++static inline pte_t huge_ptep_get(pte_t *ptep) ++{ ++ return *ptep; ++} ++ + static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) + { +- set_pte_at(mm, addr, ptep, pte); ++#if PAGETABLE_LEVELS >= 3 ++ set_pmd((pmd_t *)ptep, native_make_pmd(native_pte_val(pte))); ++#else ++ set_pgd((pgd_t *)ptep, native_make_pgd(native_pte_val(pte))); ++#endif + } + + static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t 
*ptep) + { +- return ptep_get_and_clear(mm, addr, ptep); ++ pte_t pte = huge_ptep_get(ptep); ++ ++ set_huge_pte_at(mm, addr, ptep, __pte(0)); ++ return pte; + } + + static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, +@@ -66,19 +78,25 @@ static inline pte_t huge_pte_wrprotect(pte_t pte) + static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) + { +- ptep_set_wrprotect(mm, addr, ptep); ++ pte_t pte = huge_ptep_get(ptep); ++ ++ pte = pte_wrprotect(pte); ++ set_huge_pte_at(mm, addr, ptep, pte); + } + + static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t pte, int dirty) + { +- return ptep_set_access_flags(vma, addr, ptep, pte, dirty); +-} ++ pte_t oldpte = huge_ptep_get(ptep); ++ int changed = !pte_same(oldpte, pte); + +-static inline pte_t huge_ptep_get(pte_t *ptep) +-{ +- return *ptep; ++ if (changed && dirty) { ++ set_huge_pte_at(vma->vm_mm, addr, ptep, pte); ++ flush_tlb_page(vma, addr); ++ } ++ ++ return changed; + } + + static inline int arch_prepare_hugepage(struct page *page) +diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h +index 6a63b86..9ad387e 100644 +--- a/arch/x86/include/asm/io.h ++++ b/arch/x86/include/asm/io.h +@@ -7,6 +7,10 @@ + #include <asm-generic/int-ll64.h> + #include <asm/page.h> + ++#include <xen/xen.h> ++ ++extern int isapnp_disable; ++ + #define build_mmio_read(name, size, type, reg, barrier) \ + static inline type name(const volatile void __iomem *addr) \ + { type ret; asm volatile("mov" size " %1,%0":reg (ret) \ +@@ -199,6 +203,18 @@ extern void __iomem *early_ioremap(resource_size_t phys_addr, + extern void __iomem *early_memremap(resource_size_t phys_addr, + unsigned long size); + extern void early_iounmap(void __iomem *addr, unsigned long size); ++extern bool is_early_ioremap_ptep(pte_t *ptep); ++ ++#ifdef CONFIG_XEN ++struct bio_vec; ++ ++extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, ++ const struct bio_vec *vec2); ++ ++#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ ++ (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \ ++ (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2))) ++#endif /* CONFIG_XEN */ + + #define IO_SPACE_LIMIT 0xffff + +diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h +index 5f61f6e..b852da9 100644 +--- a/arch/x86/include/asm/io_apic.h ++++ b/arch/x86/include/asm/io_apic.h +@@ -172,6 +172,7 @@ extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); + extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); + + extern void probe_nr_irqs_gsi(void); ++extern int get_nr_irqs_gsi(void); + + extern int setup_ioapic_entry(int apic, int irq, + struct IO_APIC_route_entry *entry, +@@ -201,4 +202,6 @@ static inline void probe_nr_irqs_gsi(void) { } + + #endif + ++void xen_io_apic_init(void); ++ + #endif /* _ASM_X86_IO_APIC_H */ +diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h +index fd6d21b..345c99c 100644 +--- a/arch/x86/include/asm/iommu.h ++++ b/arch/x86/include/asm/iommu.h +@@ -1,8 +1,6 @@ + #ifndef _ASM_X86_IOMMU_H + #define _ASM_X86_IOMMU_H + +-extern void pci_iommu_shutdown(void); +-extern void no_iommu_init(void); + extern struct dma_map_ops nommu_dma_ops; + extern int force_iommu, no_iommu; + extern int iommu_detected; +diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h +index 6e90a04..ba4dc7b 100644 +--- a/arch/x86/include/asm/irq_vectors.h ++++ 
b/arch/x86/include/asm/irq_vectors.h +@@ -120,6 +120,12 @@ + */ + #define MCE_SELF_VECTOR 0xeb + ++#ifdef CONFIG_XEN ++/* Xen vector callback to receive events in a HVM domain */ ++#define XEN_HVM_EVTCHN_CALLBACK 0xe9 ++#endif ++ ++ + /* + * First APIC vector available to drivers: (vectors 0x30-0xee) we + * start at 0x31(0x41) to spread out vectors evenly between priority +@@ -157,6 +163,14 @@ static inline int invalid_vm86_irq(int irq) + #define CPU_VECTOR_LIMIT ( 8 * NR_CPUS ) + #define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS ) + ++#ifndef __ASSEMBLY__ ++# if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SPARSE_IRQ) ++extern int nr_dynamic_irqs; ++# else ++# define NR_DYNAMIC_IRQS 256 ++# endif ++#endif ++ + #ifdef CONFIG_X86_IO_APIC + # ifdef CONFIG_SPARSE_IRQ + # define NR_IRQS \ +@@ -165,13 +179,13 @@ static inline int invalid_vm86_irq(int irq) + (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) + # else + # if NR_CPUS < MAX_IO_APICS +-# define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT) ++# define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT) + NR_DYNAMIC_IRQS + # else +-# define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) ++# define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) + NR_DYNAMIC_IRQS + # endif + # endif + #else /* !CONFIG_X86_IO_APIC: */ +-# define NR_IRQS NR_IRQS_LEGACY ++# define NR_IRQS NR_IRQS_LEGACY + NR_DYNAMIC_IRQS + #endif + + #endif /* _ASM_X86_IRQ_VECTORS_H */ +diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h +index ef51b50..e15fca1 100644 +--- a/arch/x86/include/asm/microcode.h ++++ b/arch/x86/include/asm/microcode.h +@@ -55,4 +55,13 @@ static inline struct microcode_ops * __init init_amd_microcode(void) + } + #endif + ++#ifdef CONFIG_MICROCODE_XEN ++extern struct microcode_ops * __init init_xen_microcode(void); ++#else ++static inline struct microcode_ops * __init init_xen_microcode(void) ++{ ++ return NULL; ++} ++#endif ++ + #endif /* _ASM_X86_MICROCODE_H */ +diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h +index 80a1dee..67eaa91 100644 +--- a/arch/x86/include/asm/mmu.h ++++ b/arch/x86/include/asm/mmu.h +@@ -13,6 +13,9 @@ typedef struct { + int size; + struct mutex lock; + void *vdso; ++#ifdef CONFIG_XEN ++ int has_foreign_mappings; ++#endif + } mm_context_t; + + #ifdef CONFIG_SMP +diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h +index efb3899..e571db4 100644 +--- a/arch/x86/include/asm/paravirt.h ++++ b/arch/x86/include/asm/paravirt.h +@@ -330,11 +330,18 @@ static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) + { + PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g); + } ++ + static inline void set_iopl_mask(unsigned mask) + { + PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask); + } + ++static inline void set_io_bitmap(struct thread_struct *thread, ++ unsigned long bytes_updated) ++{ ++ PVOP_VCALL2(pv_cpu_ops.set_io_bitmap, thread, bytes_updated); ++} ++ + /* The paravirtualized I/O functions */ + static inline void slow_down_io(void) + { +@@ -770,15 +777,28 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) + #define PV_RESTORE_REGS "popl %edx; popl %ecx;" + + /* save and restore all caller-save registers, except return value */ +-#define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;" +-#define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;" ++#define __PV_SAVE_ALL_CALLER_REGS "pushl %ecx;" ++#define __PV_RESTORE_ALL_CALLER_REGS "popl %ecx;" ++ ++#ifdef CONFIG_FRAME_POINTER ++#define PV_SAVE_ALL_CALLER_REGS \ ++ "push %ebp;" \ ++ "mov %esp, %ebp;" \ ++ 
__PV_SAVE_ALL_CALLER_REGS ++#define PV_RESTORE_ALL_CALLER_REGS \ ++ __PV_RESTORE_ALL_CALLER_REGS \ ++ "leave;" ++#else ++#define PV_SAVE_ALL_CALLER_REGS __PV_SAVE_ALL_CALLER_REGS ++#define PV_RESTORE_ALL_CALLER_REGS __PV_RESTORE_ALL_CALLER_REGS ++#endif + + #define PV_FLAGS_ARG "0" + #define PV_EXTRA_CLOBBERS + #define PV_VEXTRA_CLOBBERS + #else + /* save and restore all caller-save registers, except return value */ +-#define PV_SAVE_ALL_CALLER_REGS \ ++#define __PV_SAVE_ALL_CALLER_REGS \ + "push %rcx;" \ + "push %rdx;" \ + "push %rsi;" \ +@@ -787,7 +807,7 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) + "push %r9;" \ + "push %r10;" \ + "push %r11;" +-#define PV_RESTORE_ALL_CALLER_REGS \ ++#define __PV_RESTORE_ALL_CALLER_REGS \ + "pop %r11;" \ + "pop %r10;" \ + "pop %r9;" \ +@@ -797,6 +817,19 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) + "pop %rdx;" \ + "pop %rcx;" + ++#ifdef CONFIG_FRAME_POINTER ++#define PV_SAVE_ALL_CALLER_REGS \ ++ "push %rbp;" \ ++ "mov %rsp, %rbp;" \ ++ __PV_SAVE_ALL_CALLER_REGS ++#define PV_RESTORE_ALL_CALLER_REGS \ ++ __PV_RESTORE_ALL_CALLER_REGS \ ++ "leaveq;" ++#else ++#define PV_SAVE_ALL_CALLER_REGS __PV_SAVE_ALL_CALLER_REGS ++#define PV_RESTORE_ALL_CALLER_REGS __PV_RESTORE_ALL_CALLER_REGS ++#endif ++ + /* We save some registers, but all of them, that's too much. We clobber all + * caller saved registers but the argument parameter */ + #define PV_SAVE_REGS "pushq %%rdi;" +diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h +index 9357473..3202dcc 100644 +--- a/arch/x86/include/asm/paravirt_types.h ++++ b/arch/x86/include/asm/paravirt_types.h +@@ -135,6 +135,8 @@ struct pv_cpu_ops { + void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); + + void (*set_iopl_mask)(unsigned mask); ++ void (*set_io_bitmap)(struct thread_struct *thread, ++ unsigned long bytes_updated); + + void (*wbinvd)(void); + void (*io_delay)(void); +diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h +index ada8c20..faa0af1 100644 +--- a/arch/x86/include/asm/pci.h ++++ b/arch/x86/include/asm/pci.h +@@ -21,6 +21,7 @@ struct pci_sysdata { + extern int pci_routeirq; + extern int noioapicquirk; + extern int noioapicreroute; ++extern int pci_scan_all_fns; + + /* scan a bus after allocating a pci_sysdata for it */ + extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, +@@ -49,6 +50,11 @@ extern unsigned int pcibios_assign_all_busses(void); + #define pcibios_assign_all_busses() 0 + #endif + ++static inline int pcibios_scan_all_fns(struct pci_bus *bus, int devfn) ++{ ++ return pci_scan_all_fns; ++} ++ + extern unsigned long pci_mem_start; + #define PCIBIOS_MIN_IO 0x1000 + #define PCIBIOS_MIN_MEM (pci_mem_start) +@@ -87,6 +93,7 @@ extern void pci_iommu_alloc(void); + + /* MSI arch hook */ + #define arch_setup_msi_irqs arch_setup_msi_irqs ++#define arch_teardown_msi_irqs arch_teardown_msi_irqs + + #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) + +@@ -128,6 +135,7 @@ extern void pci_iommu_alloc(void); + #include <asm-generic/pci-dma-compat.h> + + /* generic pci stuff */ ++#define HAVE_ARCH_PCIBIOS_SCAN_ALL_FNS + #include <asm-generic/pci.h> + #define PCIBIOS_MAX_MEM_32 0xffffffff + +diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h +index b399988..30cbf49 100644 +--- a/arch/x86/include/asm/pci_x86.h ++++ b/arch/x86/include/asm/pci_x86.h +@@ -45,6 +45,7 @@ enum pci_bf_sort_state { + extern unsigned int pcibios_max_latency; + + void 
pcibios_resource_survey(void); ++void pcibios_set_cache_line_size(void); + + /* pci-pc.c */ + +@@ -106,6 +107,7 @@ extern int pci_direct_probe(void); + extern void pci_direct_init(int type); + extern void pci_pcbios_init(void); + extern int pci_olpc_init(void); ++extern int pci_xen_init(void); + extern void __init dmi_check_pciprobe(void); + extern void __init dmi_check_skip_isa_align(void); + +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index af6fd36..430e3cc 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -15,7 +15,6 @@ + : (prot)) + + #ifndef __ASSEMBLY__ +- + /* + * ZERO_PAGE is a global shared page that is always zero: used + * for zero-mapped memory areas etc.. +@@ -26,6 +25,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; + extern spinlock_t pgd_lock; + extern struct list_head pgd_list; + ++extern struct mm_struct *pgd_page_get_mm(struct page *page); ++ + #ifdef CONFIG_PARAVIRT + #include <asm/paravirt.h> + #else /* !CONFIG_PARAVIRT */ +@@ -76,6 +77,11 @@ extern struct list_head pgd_list; + + #endif /* CONFIG_PARAVIRT */ + ++static inline pteval_t pte_flags(pte_t pte) ++{ ++ return pte_val(pte) & PTE_FLAGS_MASK; ++} ++ + /* + * The following only work if pte_present() is true. + * Undefined behaviour if not.. +@@ -397,6 +403,9 @@ static inline unsigned long pages_to_mb(unsigned long npg) + #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ + remap_pfn_range(vma, vaddr, pfn, size, prot) + ++#define arch_vm_get_page_prot arch_vm_get_page_prot ++extern pgprot_t arch_vm_get_page_prot(unsigned vm_flags); ++ + #if PAGETABLE_LEVELS > 2 + static inline int pud_none(pud_t pud) + { +@@ -616,6 +625,9 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) + memcpy(dst, src, count * sizeof(pgd_t)); + } + ++int create_lookup_pte_addr(struct mm_struct *mm, ++ unsigned long address, ++ uint64_t *ptep); + + #include <asm-generic/pgtable.h> + #endif /* __ASSEMBLY__ */ +diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h +index c57a301..4e46931 100644 +--- a/arch/x86/include/asm/pgtable_64.h ++++ b/arch/x86/include/asm/pgtable_64.h +@@ -160,7 +160,7 @@ extern void cleanup_highmap(void); + #define pgtable_cache_init() do { } while (0) + #define check_pgt_cache() do { } while (0) + +-#define PAGE_AGP PAGE_KERNEL_NOCACHE ++#define PAGE_AGP PAGE_KERNEL_IO_NOCACHE + #define HAVE_PAGE_AGP 1 + + /* fs/proc/kcore.c */ +diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h +index d1f4a76..a81b0ed 100644 +--- a/arch/x86/include/asm/pgtable_types.h ++++ b/arch/x86/include/asm/pgtable_types.h +@@ -265,11 +265,6 @@ static inline pteval_t native_pte_val(pte_t pte) + return pte.pte; + } + +-static inline pteval_t pte_flags(pte_t pte) +-{ +- return native_pte_val(pte) & PTE_FLAGS_MASK; +-} +- + #define pgprot_val(x) ((x).pgprot) + #define __pgprot(x) ((pgprot_t) { (x) } ) + +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 78bb4d7..2232bd2 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -551,6 +551,9 @@ static inline void native_set_iopl_mask(unsigned mask) + #endif + } + ++extern void native_set_io_bitmap(struct thread_struct *thread, ++ unsigned long updated_bytes); ++ + static inline void + native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) + { +@@ -592,6 +595,7 @@ static inline void load_sp0(struct tss_struct *tss, + } + + 
#define set_iopl_mask native_set_iopl_mask ++#define set_io_bitmap native_set_io_bitmap + #endif /* CONFIG_PARAVIRT */ + + /* +diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h +index 53235fd..daaacab 100644 +--- a/arch/x86/include/asm/pvclock.h ++++ b/arch/x86/include/asm/pvclock.h +@@ -10,5 +10,6 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); + void pvclock_read_wallclock(struct pvclock_wall_clock *wall, + struct pvclock_vcpu_time_info *vcpu, + struct timespec *ts); ++void pvclock_resume(void); + + #endif /* _ASM_X86_PVCLOCK_H */ +diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h +index 18e496c..154a5f1 100644 +--- a/arch/x86/include/asm/setup.h ++++ b/arch/x86/include/asm/setup.h +@@ -95,6 +95,11 @@ void *extend_brk(size_t size, size_t align); + : : "i" (sz)); \ + } + ++/* Helper for reserving space for arrays of things */ ++#define RESERVE_BRK_ARRAY(type, name, entries) \ ++ type *name; \ ++ RESERVE_BRK(name, sizeof(type) * entries) ++ + #ifdef __i386__ + + void __init i386_start_kernel(void); +diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h +index b9e4e20..8085277 100644 +--- a/arch/x86/include/asm/swiotlb.h ++++ b/arch/x86/include/asm/swiotlb.h +@@ -3,15 +3,16 @@ + + #include <linux/swiotlb.h> + +-/* SWIOTLB interface */ +- +-extern int swiotlb_force; +- + #ifdef CONFIG_SWIOTLB + extern int swiotlb; +-extern void pci_swiotlb_init(void); ++extern int __init pci_swiotlb_detect(void); ++extern void __init pci_swiotlb_init(void); + #else + #define swiotlb 0 ++static inline int pci_swiotlb_detect(void) ++{ ++ return 0; ++} + static inline void pci_swiotlb_init(void) + { + } +diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h +index 1bb6e39..ef0fa4d 100644 +--- a/arch/x86/include/asm/syscalls.h ++++ b/arch/x86/include/asm/syscalls.h +@@ -33,11 +33,11 @@ long sys_rt_sigreturn(struct pt_regs *); + asmlinkage int sys_set_thread_area(struct user_desc __user *); + asmlinkage int sys_get_thread_area(struct user_desc __user *); + +-/* X86_32 only */ +-#ifdef CONFIG_X86_32 + /* kernel/ioport.c */ +-long sys_iopl(struct pt_regs *); ++asmlinkage long sys_iopl(unsigned int); + ++/* X86_32 only */ ++#ifdef CONFIG_X86_32 + /* kernel/process_32.c */ + int sys_clone(struct pt_regs *); + int sys_execve(struct pt_regs *); +@@ -68,8 +68,6 @@ int sys_vm86(struct pt_regs *); + #else /* CONFIG_X86_32 */ + + /* X86_64 only */ +-/* kernel/ioport.c */ +-asmlinkage long sys_iopl(unsigned int, struct pt_regs *); + + /* kernel/process_64.c */ + asmlinkage long sys_clone(unsigned long, unsigned long, +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 7f3eba0..e4fc8ea 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -89,6 +89,10 @@ static inline void __flush_tlb_one(unsigned long addr) + + #ifndef CONFIG_SMP + ++static inline void __init init_smp_flush(void) ++{ ++} ++ + #define flush_tlb() __flush_tlb() + #define flush_tlb_all() __flush_tlb_all() + #define local_flush_tlb() __flush_tlb() +@@ -129,6 +133,8 @@ static inline void reset_lazy_tlbstate(void) + + #define local_flush_tlb() __flush_tlb() + ++extern void init_smp_flush(void); ++ + extern void flush_tlb_all(void); + extern void flush_tlb_current_task(void); + extern void flush_tlb_mm(struct mm_struct *); +diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h +index 2c756fd..d8e7145 100644 +--- a/arch/x86/include/asm/x86_init.h ++++ 
b/arch/x86/include/asm/x86_init.h +@@ -91,6 +91,14 @@ struct x86_init_timers { + }; + + /** ++ * struct x86_init_iommu - platform specific iommu setup ++ * @iommu_init: platform specific iommu setup ++ */ ++struct x86_init_iommu { ++ int (*iommu_init)(void); ++}; ++ ++/** + * struct x86_init_ops - functions for platform specific setup + * + */ +@@ -101,6 +109,7 @@ struct x86_init_ops { + struct x86_init_oem oem; + struct x86_init_paging paging; + struct x86_init_timers timers; ++ struct x86_init_iommu iommu; + }; + + /** +@@ -121,6 +130,7 @@ struct x86_platform_ops { + unsigned long (*calibrate_tsc)(void); + unsigned long (*get_wallclock)(void); + int (*set_wallclock)(unsigned long nowtime); ++ void (*iommu_shutdown)(void); + }; + + extern struct x86_init_ops x86_init; +diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h +index 9c371e4..41c4be0 100644 +--- a/arch/x86/include/asm/xen/hypercall.h ++++ b/arch/x86/include/asm/xen/hypercall.h +@@ -45,6 +45,8 @@ + #include <xen/interface/xen.h> + #include <xen/interface/sched.h> + #include <xen/interface/physdev.h> ++#include <xen/interface/platform.h> ++#include <xen/interface/xen-mca.h> + + /* + * The hypercall asms have to meet several constraints: +@@ -200,6 +202,23 @@ extern struct { char _entry[32]; } hypercall_page[]; + (type)__res; \ + }) + ++static inline long ++privcmd_call(unsigned call, ++ unsigned long a1, unsigned long a2, ++ unsigned long a3, unsigned long a4, ++ unsigned long a5) ++{ ++ __HYPERCALL_DECLS; ++ __HYPERCALL_5ARG(a1, a2, a3, a4, a5); ++ ++ asm volatile("call *%[call]" ++ : __HYPERCALL_5PARAM ++ : [call] "a" (&hypercall_page[call]) ++ : __HYPERCALL_CLOBBER5); ++ ++ return (long)__res; ++} ++ + static inline int + HYPERVISOR_set_trap_table(struct trap_info *table) + { +@@ -282,6 +301,20 @@ HYPERVISOR_set_timer_op(u64 timeout) + } + + static inline int ++HYPERVISOR_mca(struct xen_mc *mc_op) ++{ ++ mc_op->interface_version = XEN_MCA_INTERFACE_VERSION; ++ return _hypercall1(int, mca, mc_op); ++} ++ ++static inline int ++HYPERVISOR_dom0_op(struct xen_platform_op *platform_op) ++{ ++ platform_op->interface_version = XENPF_INTERFACE_VERSION; ++ return _hypercall1(int, dom0_op, platform_op); ++} ++ ++static inline int + HYPERVISOR_set_debugreg(int reg, unsigned long value) + { + return _hypercall2(int, set_debugreg, reg, value); +@@ -417,6 +450,12 @@ HYPERVISOR_nmi_op(unsigned long op, unsigned long arg) + return _hypercall2(int, nmi_op, op, arg); + } + ++static inline unsigned long __must_check ++HYPERVISOR_hvm_op(int op, void *arg) ++{ ++ return _hypercall2(unsigned long, hvm_op, op, arg); ++} ++ + static inline void + MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) + { +@@ -424,6 +463,14 @@ MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) + mcl->args[0] = set; + } + ++#if defined(CONFIG_X86_64) ++#define MULTI_UVMFLAGS_INDEX 2 ++#define MULTI_UVMDOMID_INDEX 3 ++#else ++#define MULTI_UVMFLAGS_INDEX 3 ++#define MULTI_UVMDOMID_INDEX 4 ++#endif ++ + static inline void + MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, + pte_t new_val, unsigned long flags) +@@ -432,12 +479,11 @@ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, + mcl->args[0] = va; + if (sizeof(new_val) == sizeof(long)) { + mcl->args[1] = new_val.pte; +- mcl->args[2] = flags; + } else { + mcl->args[1] = new_val.pte; + mcl->args[2] = new_val.pte >> 32; +- mcl->args[3] = flags; + } ++ mcl->args[MULTI_UVMFLAGS_INDEX] = flags; + } + + static inline void +diff --git 
a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h +index d5b7e90..396ff4c 100644 +--- a/arch/x86/include/asm/xen/hypervisor.h ++++ b/arch/x86/include/asm/xen/hypervisor.h +@@ -37,31 +37,4 @@ + extern struct shared_info *HYPERVISOR_shared_info; + extern struct start_info *xen_start_info; + +-enum xen_domain_type { +- XEN_NATIVE, /* running on bare hardware */ +- XEN_PV_DOMAIN, /* running in a PV domain */ +- XEN_HVM_DOMAIN, /* running in a Xen hvm domain */ +-}; +- +-#ifdef CONFIG_XEN +-extern enum xen_domain_type xen_domain_type; +-#else +-#define xen_domain_type XEN_NATIVE +-#endif +- +-#define xen_domain() (xen_domain_type != XEN_NATIVE) +-#define xen_pv_domain() (xen_domain() && \ +- xen_domain_type == XEN_PV_DOMAIN) +-#define xen_hvm_domain() (xen_domain() && \ +- xen_domain_type == XEN_HVM_DOMAIN) +- +-#ifdef CONFIG_XEN_DOM0 +-#include <xen/interface/xen.h> +- +-#define xen_initial_domain() (xen_pv_domain() && \ +- xen_start_info->flags & SIF_INITDOMAIN) +-#else /* !CONFIG_XEN_DOM0 */ +-#define xen_initial_domain() (0) +-#endif /* CONFIG_XEN_DOM0 */ +- + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ +diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h +index e8506c1..9539998 100644 +--- a/arch/x86/include/asm/xen/interface.h ++++ b/arch/x86/include/asm/xen/interface.h +@@ -61,9 +61,9 @@ DEFINE_GUEST_HANDLE(void); + #define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) + #endif + +-#ifndef machine_to_phys_mapping +-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) +-#endif ++#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START) ++#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END) ++#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT) + + /* Maximum number of virtual CPUs in multi-processor guests. */ + #define MAX_VIRT_CPUS 32 +@@ -97,6 +97,8 @@ DEFINE_GUEST_HANDLE(void); + #define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2)) + + #ifndef __ASSEMBLY__ ++#include <linux/types.h> ++ + struct trap_info { + uint8_t vector; /* exception vector */ + uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */ +diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h +index 42a7e00..8413688 100644 +--- a/arch/x86/include/asm/xen/interface_32.h ++++ b/arch/x86/include/asm/xen/interface_32.h +@@ -32,6 +32,11 @@ + /* And the trap vector is... */ + #define TRAP_INSTR "int $0x82" + ++#define __MACH2PHYS_VIRT_START 0xF5800000 ++#define __MACH2PHYS_VIRT_END 0xF6800000 ++ ++#define __MACH2PHYS_SHIFT 2 ++ + /* + * Virtual addresses beyond this are not modifiable by guest OSes. The + * machine->physical mapping table starts at this address, read-only. 
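The interface.h and interface_32.h hunks above replace the old hard-coded machine_to_phys_mapping definition with a window described by __MACH2PHYS_VIRT_START, __MACH2PHYS_VIRT_END and a per-arch __MACH2PHYS_SHIFT, with MACH2PHYS_NR_ENTRIES derived from all three. As a rough sanity check of what the 32-bit constants imply, the standalone program below recomputes the table geometry; the constants are copied from the hunk, but the program itself is only an illustration and is not part of the patch.

#include <stdio.h>

/* 32-bit constants copied from the interface_32.h hunk above. */
#define __MACH2PHYS_VIRT_START 0xF5800000UL
#define __MACH2PHYS_VIRT_END   0xF6800000UL
#define __MACH2PHYS_SHIFT      2  /* log2(sizeof(unsigned long)) on x86-32 */

int main(void)
{
	/* Same formula as MACH2PHYS_NR_ENTRIES in the interface.h hunk. */
	unsigned long entries =
		(__MACH2PHYS_VIRT_END - __MACH2PHYS_VIRT_START) >> __MACH2PHYS_SHIFT;

	/* One table entry per 4 KiB machine frame, so the table can
	 * describe entries * 4 KiB of machine memory. */
	printf("m2p window: %lu MiB of table, %lu entries, covers %lu GiB\n",
	       (__MACH2PHYS_VIRT_END - __MACH2PHYS_VIRT_START) >> 20,
	       entries,
	       entries >> 18);
	return 0;
}

With __MACH2PHYS_SHIFT of 3 and the larger window in the interface_64.h hunk below, the same MACH2PHYS_NR_ENTRIES formula covers the 64-bit layout as well, which is why the per-arch copies of that macro can be deleted.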
+diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h +index 100d266..839a481 100644 +--- a/arch/x86/include/asm/xen/interface_64.h ++++ b/arch/x86/include/asm/xen/interface_64.h +@@ -39,18 +39,7 @@ + #define __HYPERVISOR_VIRT_END 0xFFFF880000000000 + #define __MACH2PHYS_VIRT_START 0xFFFF800000000000 + #define __MACH2PHYS_VIRT_END 0xFFFF804000000000 +- +-#ifndef HYPERVISOR_VIRT_START +-#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) +-#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END) +-#endif +- +-#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START) +-#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END) +-#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3) +-#ifndef machine_to_phys_mapping +-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) +-#endif ++#define __MACH2PHYS_SHIFT 3 + + /* + * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base) +diff --git a/arch/x86/include/asm/xen/iommu.h b/arch/x86/include/asm/xen/iommu.h +new file mode 100644 +index 0000000..75df312 +--- /dev/null ++++ b/arch/x86/include/asm/xen/iommu.h +@@ -0,0 +1,12 @@ ++#ifndef ASM_X86__XEN_IOMMU_H ++ ++#ifdef CONFIG_PCI_XEN ++extern void xen_iommu_init(void); ++#else ++static inline void xen_iommu_init(void) ++{ ++} ++#endif ++ ++#endif ++ +diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h +index 018a0a4..05c5cf5 100644 +--- a/arch/x86/include/asm/xen/page.h ++++ b/arch/x86/include/asm/xen/page.h +@@ -5,6 +5,7 @@ + #include <linux/types.h> + #include <linux/spinlock.h> + #include <linux/pfn.h> ++#include <linux/mm.h> + + #include <asm/uaccess.h> + #include <asm/page.h> +@@ -28,23 +29,32 @@ typedef struct xpaddr { + + /**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ + #define INVALID_P2M_ENTRY (~0UL) +-#define FOREIGN_FRAME_BIT (1UL<<31) ++#define FOREIGN_FRAME_BIT (1UL << (sizeof(unsigned long) * 8 - 1)) + #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) + + /* Maximum amount of memory we can handle in a domain in pages */ + #define MAX_DOMAIN_PAGES \ + ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) + ++extern unsigned long *machine_to_phys_mapping; ++extern unsigned int machine_to_phys_order; + + extern unsigned long get_phys_to_machine(unsigned long pfn); +-extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn); ++extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); + + static inline unsigned long pfn_to_mfn(unsigned long pfn) + { ++ unsigned long mfn; ++ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return pfn; + +- return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT; ++ mfn = get_phys_to_machine(pfn); ++ ++ if (mfn != INVALID_P2M_ENTRY) ++ mfn &= ~FOREIGN_FRAME_BIT; ++ ++ return mfn; + } + + static inline int phys_to_machine_mapping_valid(unsigned long pfn) +@@ -62,10 +72,8 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) + if (xen_feature(XENFEAT_auto_translated_physmap)) + return mfn; + +-#if 0 + if (unlikely((mfn >> machine_to_phys_order) != 0)) +- return max_mapnr; +-#endif ++ return ~0; + + pfn = 0; + /* +@@ -112,13 +120,9 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) + */ + static inline unsigned long mfn_to_local_pfn(unsigned long mfn) + { +- extern unsigned long max_mapnr; + unsigned long pfn = mfn_to_pfn(mfn); +- if ((pfn < max_mapnr) +- && !xen_feature(XENFEAT_auto_translated_physmap) +- && 
(get_phys_to_machine(pfn) != mfn)) +- return max_mapnr; /* force !pfn_valid() */ +- /* XXX fixme; not true with sparsemem */ ++ if (get_phys_to_machine(pfn) != mfn) ++ return -1; /* force !pfn_valid() */ + return pfn; + } + +@@ -163,6 +167,7 @@ static inline pte_t __pte_ma(pteval_t x) + + #define pgd_val_ma(x) ((x).pgd) + ++void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); + + xmaddr_t arbitrary_virt_to_machine(void *address); + unsigned long arbitrary_virt_to_mfn(void *vaddr); +diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h +new file mode 100644 +index 0000000..6683196 +--- /dev/null ++++ b/arch/x86/include/asm/xen/pci.h +@@ -0,0 +1,104 @@ ++#ifndef _ASM_X86_XEN_PCI_H ++#define _ASM_X86_XEN_PCI_H ++ ++#if defined(CONFIG_PCI_MSI) ++#if defined(CONFIG_PCI_XEN) ++int xen_register_pirq(u32 gsi, int triggering); ++int xen_register_gsi(u32 gsi, int triggering, int polarity); ++int xen_create_msi_irq(struct pci_dev *dev, ++ struct msi_desc *msidesc, ++ int type); ++void xen_pci_teardown_msi_dev(struct pci_dev *dev); ++void xen_pci_teardown_msi_irq(int irq); ++int xen_pci_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); ++ ++/* The drivers/pci/xen-pcifront.c sets this structure to ++ * its own functions. ++ */ ++struct xen_pci_frontend_ops { ++ int (*enable_msi)(struct pci_dev *dev, int **vectors); ++ void (*disable_msi)(struct pci_dev *dev); ++ int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec); ++ void (*disable_msix)(struct pci_dev *dev); ++}; ++ ++extern struct xen_pci_frontend_ops *xen_pci_frontend; ++ ++static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev, ++ int **vectors) ++{ ++ if (xen_pci_frontend && xen_pci_frontend->enable_msi) ++ return xen_pci_frontend->enable_msi(dev, vectors); ++ return -ENODEV; ++} ++static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev) ++{ ++ if (xen_pci_frontend && xen_pci_frontend->disable_msi) ++ xen_pci_frontend->disable_msi(dev); ++} ++static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev, ++ int **vectors, int nvec) ++{ ++ if (xen_pci_frontend && xen_pci_frontend->enable_msix) ++ return xen_pci_frontend->enable_msix(dev, vectors, nvec); ++ return -ENODEV; ++} ++static inline void xen_pci_frontend_disable_msix(struct pci_dev *dev) ++{ ++ if (xen_pci_frontend && xen_pci_frontend->disable_msix) ++ xen_pci_frontend->disable_msix(dev); ++} ++#else ++static inline int xen_create_msi_irq(struct pci_dev *dev, ++ struct msi_desc *msidesc, ++ int type) ++{ ++ return -1; ++} ++static inline void xen_pci_teardown_msi_dev(struct pci_dev *dev) { } ++static inline void xen_pci_teardown_msi_irq(int irq) { } ++static inline int xen_pci_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) ++{ ++ return -ENODEV; ++} ++#endif /* CONFIG_PCI_XEN */ ++ ++#endif /* CONFIG_PCI_MSI */ ++ ++#ifdef CONFIG_XEN_DOM0_PCI ++int xen_register_gsi(u32 gsi, int triggering, int polarity); ++int xen_find_device_domain_owner(struct pci_dev *dev); ++int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain); ++int xen_unregister_device_domain_owner(struct pci_dev *dev); ++ ++#else ++static inline int xen_register_gsi(u32 gsi, int triggering, int polarity) ++{ ++ return -1; ++} ++ ++static inline int xen_find_device_domain_owner(struct pci_dev *dev) ++{ ++ return -1; ++} ++static inline int xen_register_device_domain_owner(struct pci_dev *dev, ++ uint16_t domain) ++{ ++ return -1; ++} ++static inline int xen_unregister_device_domain_owner(struct pci_dev *dev) ++{ ++ 
return -1; ++} ++#endif ++ ++#if defined(CONFIG_PCI_MSI) && defined(CONFIG_XEN_DOM0_PCI) ++int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); ++#else ++static inline int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) ++{ ++ return -1; ++} ++#endif ++ ++#endif /* _ASM_X86_XEN_PCI_H */ +diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h +new file mode 100644 +index 0000000..e4fe299 +--- /dev/null ++++ b/arch/x86/include/asm/xen/swiotlb-xen.h +@@ -0,0 +1,14 @@ ++#ifndef _ASM_X86_SWIOTLB_XEN_H ++#define _ASM_X86_SWIOTLB_XEN_H ++ ++#ifdef CONFIG_PCI_XEN ++extern int xen_swiotlb; ++extern int __init pci_xen_swiotlb_detect(void); ++extern void __init pci_xen_swiotlb_init(void); ++#else ++#define xen_swiotlb 0 ++static inline int __init pci_xen_swiotlb_detect(void) { return 0; } ++static inline void __init pci_xen_swiotlb_init(void) { } ++#endif ++ ++#endif /* _ASM_X86_SWIOTLB_XEN_H */ +diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile +index d1911ab..cfe00bc 100644 +--- a/arch/x86/kernel/Makefile ++++ b/arch/x86/kernel/Makefile +@@ -113,6 +113,7 @@ obj-$(CONFIG_X86_MRST) += mrst.o + microcode-y := microcode_core.o + microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o + microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o ++microcode-$(CONFIG_MICROCODE_XEN) += microcode_xen.o + obj-$(CONFIG_MICROCODE) += microcode.o + + obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o +diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c +index 23c2da8..a2a5125 100644 +--- a/arch/x86/kernel/acpi/boot.c ++++ b/arch/x86/kernel/acpi/boot.c +@@ -42,6 +42,10 @@ + #include <asm/mpspec.h> + #include <asm/smp.h> + ++#include <asm/xen/pci.h> ++ ++#include <asm/xen/hypervisor.h> ++ + static int __initdata acpi_force = 0; + u32 acpi_rsdt_forced; + int acpi_disabled; +@@ -149,6 +153,10 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled) + { + unsigned int ver = 0; + ++ /* We don't want to register lapics when in Xen dom0 */ ++ if (xen_initial_domain()) ++ return; ++ + if (!enabled) { + ++disabled_cpus; + return; +@@ -461,9 +469,13 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) + */ + int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) + { +- unsigned int irq; ++ int irq; + unsigned int plat_gsi = gsi; + ++ irq = xen_register_gsi(gsi, trigger, polarity); ++ if (irq >= 0) ++ return irq; ++ + #ifdef CONFIG_PCI + /* + * Make sure all (legacy) PCI IRQs are set as level-triggered. +@@ -740,6 +752,10 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) + + static void __init acpi_register_lapic_address(unsigned long address) + { ++ /* Xen dom0 doesn't have usable lapics */ ++ if (xen_initial_domain()) ++ return; ++ + mp_lapic_addr = address; + + set_fixmap_nocache(FIX_APIC_BASE, address); +@@ -860,6 +876,9 @@ int __init acpi_probe_gsi(void) + max_gsi = gsi; + } + ++ if (xen_initial_domain()) ++ max_gsi += 255; /* Plus maximum entries of an ioapic. 
*/ ++ + return max_gsi + 1; + } + +diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c +index d85d1b2..8aabedd 100644 +--- a/arch/x86/kernel/acpi/processor.c ++++ b/arch/x86/kernel/acpi/processor.c +@@ -12,6 +12,8 @@ + #include <acpi/processor.h> + #include <asm/acpi.h> + ++#include <xen/xen.h> ++ + static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) + { + struct acpi_object_list *obj_list; +@@ -59,7 +61,7 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) + /* + * If mwait/monitor is unsupported, C2/C3_FFH will be disabled + */ +- if (!cpu_has(c, X86_FEATURE_MWAIT)) ++ if (!cpu_has(c, X86_FEATURE_MWAIT) && !xen_initial_domain()) + buf[2] &= ~(ACPI_PDC_C_C2C3_FFH); + + obj->type = ACPI_TYPE_BUFFER; +@@ -88,6 +90,19 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr) + + EXPORT_SYMBOL(arch_acpi_processor_init_pdc); + ++/* Initialize _PDC data based on the CPU vendor */ ++void xen_arch_acpi_processor_init_pdc(struct acpi_processor *pr) ++{ ++ struct cpuinfo_x86 *c = &cpu_data(0); ++ ++ pr->pdc = NULL; ++ if (c->x86_vendor == X86_VENDOR_INTEL) ++ init_intel_pdc(pr, c); ++ ++ return; ++} ++EXPORT_SYMBOL(xen_arch_acpi_processor_init_pdc); ++ + void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr) + { + if (pr->pdc) { +diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c +index ca93638..9eff23c 100644 +--- a/arch/x86/kernel/acpi/sleep.c ++++ b/arch/x86/kernel/acpi/sleep.c +@@ -12,6 +12,8 @@ + #include <asm/segment.h> + #include <asm/desc.h> + ++#include <xen/acpi.h> ++ + #include "realmode/wakeup.h" + #include "sleep.h" + +diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c +index 7cd33f7..b8497c6 100644 +--- a/arch/x86/kernel/amd_iommu.c ++++ b/arch/x86/kernel/amd_iommu.c +@@ -928,7 +928,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, + } + + if (unlikely(address == -1)) +- address = bad_dma_address; ++ address = DMA_ERROR_CODE; + + WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); + +@@ -1545,7 +1545,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, + + pte = dma_ops_get_pte(dom, address); + if (!pte) +- return bad_dma_address; ++ return DMA_ERROR_CODE; + + __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; + +@@ -1626,7 +1626,7 @@ static dma_addr_t __map_single(struct device *dev, + retry: + address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, + dma_mask); +- if (unlikely(address == bad_dma_address)) { ++ if (unlikely(address == DMA_ERROR_CODE)) { + /* + * setting next_address here will let the address + * allocator only scan the new allocated range in the +@@ -1647,7 +1647,7 @@ retry: + start = address; + for (i = 0; i < pages; ++i) { + ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); +- if (ret == bad_dma_address) ++ if (ret == DMA_ERROR_CODE) + goto out_unmap; + + paddr += PAGE_SIZE; +@@ -1675,7 +1675,7 @@ out_unmap: + + dma_ops_free_addresses(dma_dom, address, pages); + +- return bad_dma_address; ++ return DMA_ERROR_CODE; + } + + /* +@@ -1692,7 +1692,7 @@ static void __unmap_single(struct amd_iommu *iommu, + dma_addr_t i, start; + unsigned int pages; + +- if ((dma_addr == bad_dma_address) || ++ if ((dma_addr == DMA_ERROR_CODE) || + (dma_addr + size > dma_dom->aperture_size)) + return; + +@@ -1735,7 +1735,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page, + INC_STATS_COUNTER(cnt_map_single); + + if (!check_device(dev)) +- return bad_dma_address; ++ return 
DMA_ERROR_CODE; + + dma_mask = *dev->dma_mask; + +@@ -1746,12 +1746,12 @@ static dma_addr_t map_page(struct device *dev, struct page *page, + return (dma_addr_t)paddr; + + if (!dma_ops_domain(domain)) +- return bad_dma_address; ++ return DMA_ERROR_CODE; + + spin_lock_irqsave(&domain->lock, flags); + addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, + dma_mask); +- if (addr == bad_dma_address) ++ if (addr == DMA_ERROR_CODE) + goto out; + + iommu_completion_wait(iommu); +@@ -1960,7 +1960,7 @@ static void *alloc_coherent(struct device *dev, size_t size, + *dma_addr = __map_single(dev, iommu, domain->priv, paddr, + size, DMA_BIDIRECTIONAL, true, dma_mask); + +- if (*dma_addr == bad_dma_address) { ++ if (*dma_addr == DMA_ERROR_CODE) { + spin_unlock_irqrestore(&domain->lock, flags); + goto out_free; + } +@@ -2122,8 +2122,7 @@ int __init amd_iommu_init_dma_ops(void) + prealloc_protection_domains(); + + iommu_detected = 1; +- force_iommu = 1; +- bad_dma_address = 0; ++ swiotlb = 0; + #ifdef CONFIG_GART_IOMMU + gart_iommu_aperture_disabled = 1; + gart_iommu_aperture = 0; +diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c +index 400be99..0069df5 100644 +--- a/arch/x86/kernel/amd_iommu_init.c ++++ b/arch/x86/kernel/amd_iommu_init.c +@@ -29,6 +29,7 @@ + #include <asm/amd_iommu.h> + #include <asm/iommu.h> + #include <asm/gart.h> ++#include <asm/x86_init.h> + + /* + * definitions for the ACPI scanning code +@@ -1206,19 +1207,10 @@ static struct sys_device device_amd_iommu = { + * functions. Finally it prints some information about AMD IOMMUs and + * the driver state and enables the hardware. + */ +-int __init amd_iommu_init(void) ++static int __init amd_iommu_init(void) + { + int i, ret = 0; + +- +- if (no_iommu) { +- printk(KERN_INFO "AMD-Vi disabled by kernel command line\n"); +- return 0; +- } +- +- if (!amd_iommu_detected) +- return -ENODEV; +- + /* + * First parse ACPI tables to find the largest Bus/Dev/Func + * we need to handle. Upon this information the shared data +@@ -1333,6 +1325,7 @@ int __init amd_iommu_init(void) + else + printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); + ++ x86_platform.iommu_shutdown = disable_iommus; + out: + return ret; + +@@ -1361,11 +1354,6 @@ free: + goto out; + } + +-void amd_iommu_shutdown(void) +-{ +- disable_iommus(); +-} +- + /**************************************************************************** + * + * Early detect code. 
This code runs at IOMMU detection time in the DMA +@@ -1380,16 +1368,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table) + + void __init amd_iommu_detect(void) + { +- if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) ++ if (no_iommu || (iommu_detected && !gart_iommu_aperture)) + return; + + if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { + iommu_detected = 1; + amd_iommu_detected = 1; +-#ifdef CONFIG_GART_IOMMU +- gart_iommu_aperture_disabled = 1; +- gart_iommu_aperture = 0; +-#endif ++ x86_init.iommu.iommu_init = amd_iommu_init; + } + } + +diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c +index 082089e..8d34362 100644 +--- a/arch/x86/kernel/aperture_64.c ++++ b/arch/x86/kernel/aperture_64.c +@@ -28,6 +28,7 @@ + #include <asm/pci-direct.h> + #include <asm/dma.h> + #include <asm/k8.h> ++#include <asm/x86_init.h> + + int gart_iommu_aperture; + int gart_iommu_aperture_disabled __initdata; +@@ -401,6 +402,7 @@ void __init gart_iommu_hole_init(void) + + iommu_detected = 1; + gart_iommu_aperture = 1; ++ x86_init.iommu.iommu_init = gart_iommu_init; + + ctl = read_pci_config(bus, slot, 3, + AMD64_GARTAPERTURECTL); +@@ -469,7 +471,7 @@ out: + + if (aper_alloc) { + /* Got the aperture from the AGP bridge */ +- } else if (swiotlb && !valid_agp) { ++ } else if (!valid_agp) { + /* Do nothing */ + } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || + force_iommu || +diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c +index 8928d97..4848d5d 100644 +--- a/arch/x86/kernel/apic/io_apic.c ++++ b/arch/x86/kernel/apic/io_apic.c +@@ -63,7 +63,12 @@ + #include <asm/uv/uv_hub.h> + #include <asm/uv/uv_irq.h> + ++#include <asm/xen/hypervisor.h> + #include <asm/apic.h> ++#include <asm/xen/hypervisor.h> ++#include <asm/xen/pci.h> ++ ++#include <asm/xen/pci.h> + + #define __apicdebuginit(type) static type __init + #define for_each_irq_pin(entry, head) \ +@@ -395,14 +400,18 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector) + + static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) + { +- struct io_apic __iomem *io_apic = io_apic_base(apic); ++ struct io_apic __iomem *io_apic; ++ ++ io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + return readl(&io_apic->data); + } + + static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) + { +- struct io_apic __iomem *io_apic = io_apic_base(apic); ++ struct io_apic __iomem *io_apic; ++ ++ io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + writel(value, &io_apic->data); + } +@@ -415,7 +424,9 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i + */ + static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) + { +- struct io_apic __iomem *io_apic = io_apic_base(apic); ++ struct io_apic __iomem *io_apic; ++ ++ io_apic = io_apic_base(apic); + + if (sis_apic_bug) + writel(reg, &io_apic->index); +@@ -3494,6 +3505,9 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + ++ if (xen_pv_domain()) ++ return xen_pci_setup_msi_irqs(dev, nvec, type); ++ + node = dev_to_node(&dev->dev); + irq_want = nr_irqs_gsi; + sub_handle = 0; +@@ -3543,7 +3557,29 @@ error: + + void arch_teardown_msi_irq(unsigned int irq) + { +- destroy_irq(irq); ++ if (xen_domain()) ++ xen_pci_teardown_msi_irq(irq); ++ else ++ destroy_irq(irq); ++} ++ ++void 
arch_teardown_msi_irqs(struct pci_dev *dev)
++{
++ struct msi_desc *entry;
++
++ /* If we are a non-privileged PV domain, we have to
++ * call xen_pci_teardown_msi_dev() first. */
++ if (xen_domain())
++ xen_pci_teardown_msi_dev(dev);
++
++ list_for_each_entry(entry, &dev->msi_list, list) {
++ int i, nvec;
++ if (entry->irq == 0)
++ continue;
++ nvec = 1 << entry->msi_attrib.multiple;
++ for (i = 0; i < nvec; i++)
++ arch_teardown_msi_irq(entry->irq + i);
++ }
+ }
+
+ #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
+@@ -3860,7 +3896,14 @@ void __init probe_nr_irqs_gsi(void)
+ printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
+ }
+
++int get_nr_irqs_gsi(void)
++{
++ return nr_irqs_gsi;
++}
++
+ #ifdef CONFIG_SPARSE_IRQ
++int nr_dynamic_irqs;
++
+ int __init arch_probe_nr_irqs(void)
+ {
+ int nr;
+@@ -3878,6 +3921,8 @@ int __init arch_probe_nr_irqs(void)
+ if (nr < nr_irqs)
+ nr_irqs = nr;
+
++ nr_irqs += nr_dynamic_irqs;
++
+ return 0;
+ }
+ #endif
+diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
+index 7ff61d6..d1e6e60 100644
+--- a/arch/x86/kernel/apic/nmi.c
++++ b/arch/x86/kernel/apic/nmi.c
+@@ -558,6 +558,9 @@ void arch_trigger_all_cpu_backtrace(void)
+ {
+ int i;
+
++ if (!cpu_has_apic)
++ return;
++
+ cpumask_copy(&backtrace_mask, cpu_online_mask);
+
+ printk(KERN_INFO "sending NMI to all CPUs:\n");
+diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
+index f4361b5..404e458 100644
+--- a/arch/x86/kernel/cpu/mtrr/Makefile
++++ b/arch/x86/kernel/cpu/mtrr/Makefile
+@@ -1,3 +1,4 @@
+ obj-y := main.o if.o generic.o state.o cleanup.o
+ obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
++obj-$(CONFIG_XEN_DOM0) += xen.o
+
+diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
+index 33af141..378f8dc 100644
+--- a/arch/x86/kernel/cpu/mtrr/amd.c
++++ b/arch/x86/kernel/cpu/mtrr/amd.c
+@@ -108,6 +108,11 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
+ return 0;
+ }
+
++static int amd_num_var_ranges(void)
++{
++ return 2;
++}
++
+ static struct mtrr_ops amd_mtrr_ops = {
+ .vendor = X86_VENDOR_AMD,
+ .set = amd_set_mtrr,
+@@ -115,6 +120,7 @@ static struct mtrr_ops amd_mtrr_ops = {
+ .get_free_region = generic_get_free_region,
+ .validate_add_page = amd_validate_add_page,
+ .have_wrcomb = positive_have_wrcomb,
++ .num_var_ranges = amd_num_var_ranges,
+ };
+
+ int __init amd_init_mtrr(void)
+diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
+index de89f14..7c686a0 100644
+--- a/arch/x86/kernel/cpu/mtrr/centaur.c
++++ b/arch/x86/kernel/cpu/mtrr/centaur.c
+@@ -110,6 +110,11 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
+ return 0;
+ }
+
++static int centaur_num_var_ranges(void)
++{
++ return 8;
++}
++
+ static struct mtrr_ops centaur_mtrr_ops = {
+ .vendor = X86_VENDOR_CENTAUR,
+ .set = centaur_set_mcr,
+@@ -117,6 +122,7 @@ static struct mtrr_ops centaur_mtrr_ops = {
+ .get_free_region = centaur_get_free_region,
+ .validate_add_page = centaur_validate_add_page,
+ .have_wrcomb = positive_have_wrcomb,
++ .num_var_ranges = centaur_num_var_ranges,
+ };
+
+ int __init centaur_init_mtrr(void)
+diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
+index 228d982..fd6edcc 100644
+--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
++++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
+@@ -265,6 +265,11 @@ static void cyrix_set_all(void)
+ post_set();
+ }
+
++static int cyrix_num_var_ranges(void)
++{
++ return 8;
++}
++
+ 
static struct mtrr_ops cyrix_mtrr_ops = { + .vendor = X86_VENDOR_CYRIX, + .set_all = cyrix_set_all, +@@ -273,6 +278,7 @@ static struct mtrr_ops cyrix_mtrr_ops = { + .get_free_region = cyrix_get_free_region, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = positive_have_wrcomb, ++ .num_var_ranges = cyrix_num_var_ranges, + }; + + int __init cyrix_init_mtrr(void) +diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c +index 55da0c5..42f30cd 100644 +--- a/arch/x86/kernel/cpu/mtrr/generic.c ++++ b/arch/x86/kernel/cpu/mtrr/generic.c +@@ -749,8 +749,16 @@ int positive_have_wrcomb(void) + return 1; + } + +-/* +- * Generic structure... ++static int generic_num_var_ranges(void) ++{ ++ unsigned long config = 0, dummy; ++ ++ rdmsr(MSR_MTRRcap, config, dummy); ++ ++ return config & 0xff; ++} ++ ++/* generic structure... + */ + struct mtrr_ops generic_mtrr_ops = { + .use_intel_if = 1, +@@ -760,4 +768,5 @@ struct mtrr_ops generic_mtrr_ops = { + .set = generic_set_mtrr, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = generic_have_wrcomb, ++ .num_var_ranges = generic_num_var_ranges, + }; +diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c +index 84e83de..c8cb9ed 100644 +--- a/arch/x86/kernel/cpu/mtrr/main.c ++++ b/arch/x86/kernel/cpu/mtrr/main.c +@@ -110,21 +110,6 @@ static int have_wrcomb(void) + return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0; + } + +-/* This function returns the number of variable MTRRs */ +-static void __init set_num_var_ranges(void) +-{ +- unsigned long config = 0, dummy; +- +- if (use_intel()) +- rdmsr(MSR_MTRRcap, config, dummy); +- else if (is_cpu(AMD)) +- config = 2; +- else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) +- config = 8; +- +- num_var_ranges = config & 0xff; +-} +- + static void __init init_table(void) + { + int i, max; +@@ -711,8 +696,11 @@ void __init mtrr_bp_init(void) + } + } + ++ /* Let Xen code override the above if it wants */ ++ xen_init_mtrr(); ++ + if (mtrr_if) { +- set_num_var_ranges(); ++ num_var_ranges = mtrr_if->num_var_ranges(); + init_table(); + if (use_intel()) { + get_mtrr_state(); +diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h +index a501dee..98569c3 100644 +--- a/arch/x86/kernel/cpu/mtrr/mtrr.h ++++ b/arch/x86/kernel/cpu/mtrr/mtrr.h +@@ -5,6 +5,8 @@ + #include <linux/types.h> + #include <linux/stddef.h> + ++#include <asm/mtrr.h> ++ + #define MTRR_CHANGE_MASK_FIXED 0x01 + #define MTRR_CHANGE_MASK_VARIABLE 0x02 + #define MTRR_CHANGE_MASK_DEFTYPE 0x04 +@@ -25,6 +27,8 @@ struct mtrr_ops { + int (*validate_add_page)(unsigned long base, unsigned long size, + unsigned int type); + int (*have_wrcomb)(void); ++ ++ int (*num_var_ranges)(void); + }; + + extern int generic_get_free_region(unsigned long base, unsigned long size, +@@ -73,6 +77,13 @@ void mtrr_wrmsr(unsigned, unsigned, unsigned); + int amd_init_mtrr(void); + int cyrix_init_mtrr(void); + int centaur_init_mtrr(void); ++#ifdef CONFIG_XEN_DOM0 ++void xen_init_mtrr(void); ++#else ++static inline void xen_init_mtrr(void) ++{ ++} ++#endif + + extern int changed_by_mtrr_cleanup; + extern int mtrr_cleanup(unsigned address_bits); +diff --git a/arch/x86/kernel/cpu/mtrr/xen.c b/arch/x86/kernel/cpu/mtrr/xen.c +new file mode 100644 +index 0000000..852018b +--- /dev/null ++++ b/arch/x86/kernel/cpu/mtrr/xen.c +@@ -0,0 +1,109 @@ ++#include <linux/init.h> ++#include <linux/mm.h> ++ ++#include <asm/pat.h> ++ ++#include "mtrr.h" ++ ++#include <xen/xen.h> ++#include 
<xen/interface/platform.h> ++#include <asm/xen/hypervisor.h> ++#include <asm/xen/hypercall.h> ++ ++static void xen_set_mtrr(unsigned int reg, unsigned long base, ++ unsigned long size, mtrr_type type) ++{ ++ struct xen_platform_op op; ++ int error; ++ ++ /* mtrr_ops->set() is called once per CPU, ++ * but Xen's ops apply to all CPUs. ++ */ ++ if (smp_processor_id()) ++ return; ++ ++ if (size == 0) { ++ op.cmd = XENPF_del_memtype; ++ op.u.del_memtype.handle = 0; ++ op.u.del_memtype.reg = reg; ++ } else { ++ op.cmd = XENPF_add_memtype; ++ op.u.add_memtype.mfn = base; ++ op.u.add_memtype.nr_mfns = size; ++ op.u.add_memtype.type = type; ++ } ++ ++ error = HYPERVISOR_dom0_op(&op); ++ BUG_ON(error != 0); ++} ++ ++static void xen_get_mtrr(unsigned int reg, unsigned long *base, ++ unsigned long *size, mtrr_type *type) ++{ ++ struct xen_platform_op op; ++ ++ op.cmd = XENPF_read_memtype; ++ op.u.read_memtype.reg = reg; ++ if (HYPERVISOR_dom0_op(&op) != 0) { ++ *base = 0; ++ *size = 0; ++ *type = 0; ++ return; ++ } ++ ++ *size = op.u.read_memtype.nr_mfns; ++ *base = op.u.read_memtype.mfn; ++ *type = op.u.read_memtype.type; ++} ++ ++static int __init xen_num_var_ranges(void) ++{ ++ int ranges; ++ struct xen_platform_op op; ++ ++ op.cmd = XENPF_read_memtype; ++ ++ for (ranges = 0; ; ranges++) { ++ op.u.read_memtype.reg = ranges; ++ if (HYPERVISOR_dom0_op(&op) != 0) ++ break; ++ } ++ return ranges; ++} ++ ++/* ++ * DOM0 TODO: Need to fill in the remaining mtrr methods to have full ++ * working userland mtrr support. ++ */ ++static struct mtrr_ops xen_mtrr_ops = { ++ .vendor = X86_VENDOR_UNKNOWN, ++ .get_free_region = generic_get_free_region, ++ .set = xen_set_mtrr, ++ .get = xen_get_mtrr, ++ .have_wrcomb = positive_have_wrcomb, ++ .validate_add_page = generic_validate_add_page, ++ .use_intel_if = 0, ++ .num_var_ranges = xen_num_var_ranges, ++}; ++ ++void __init xen_init_mtrr(void) ++{ ++ /* ++ * Check that we're running under Xen, and privileged enough ++ * to play with MTRRs. ++ */ ++ if (!xen_initial_domain()) ++ return; ++ ++ /* ++ * Check that the CPU has an MTRR implementation we can ++ * support. 
++ */ ++ if (cpu_has_mtrr || ++ cpu_has_k6_mtrr || ++ cpu_has_cyrix_arr || ++ cpu_has_centaur_mcr) { ++ mtrr_if = &xen_mtrr_ops; ++ pat_init(); ++ } ++} +diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c +index ff95824..ebd4c51 100644 +--- a/arch/x86/kernel/crash.c ++++ b/arch/x86/kernel/crash.c +@@ -28,7 +28,6 @@ + #include <asm/reboot.h> + #include <asm/virtext.h> + +- + #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) + + static void kdump_nmi_callback(int cpu, struct die_args *args) +diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c +index d17d482..4d0aded 100644 +--- a/arch/x86/kernel/e820.c ++++ b/arch/x86/kernel/e820.c +@@ -750,6 +750,36 @@ static int __init find_overlapped_early(u64 start, u64 end) + return i; + } + ++u64 __init early_res_next_free(u64 addr) ++{ ++ int i; ++ u64 end = addr; ++ struct early_res *r; ++ ++ for (i = 0; i < MAX_EARLY_RES; i++) { ++ r = &early_res[i]; ++ if (addr >= r->start && addr < r->end) { ++ end = r->end; ++ break; ++ } ++ } ++ return end; ++} ++ ++u64 __init early_res_next_reserved(u64 addr, u64 max) ++{ ++ int i; ++ struct early_res *r; ++ u64 next_res = max; ++ ++ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { ++ r = &early_res[i]; ++ if ((r->start >= addr) && (r->start < next_res)) ++ next_res = r->start; ++ } ++ return next_res; ++} ++ + /* + * Drop the i-th range from the early reservation map, + * by copying any higher ranges down one over it, and +diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S +index c097e7d..7764118 100644 +--- a/arch/x86/kernel/entry_32.S ++++ b/arch/x86/kernel/entry_32.S +@@ -1088,6 +1088,9 @@ ENTRY(xen_failsafe_callback) + .previous + ENDPROC(xen_failsafe_callback) + ++BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, ++ xen_evtchn_do_upcall) ++ + #endif /* CONFIG_XEN */ + + #ifdef CONFIG_FUNCTION_TRACER +diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S +index b5c061f..a626344 100644 +--- a/arch/x86/kernel/entry_64.S ++++ b/arch/x86/kernel/entry_64.S +@@ -1364,6 +1364,9 @@ ENTRY(xen_failsafe_callback) + CFI_ENDPROC + END(xen_failsafe_callback) + ++apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ ++ xen_hvm_callback_vector xen_evtchn_do_upcall ++ + #endif /* CONFIG_XEN */ + + /* +diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c +index 0b06cd7..f59b07a 100644 +--- a/arch/x86/kernel/head64.c ++++ b/arch/x86/kernel/head64.c +@@ -79,6 +79,8 @@ void __init x86_64_start_kernel(char * real_mode_data) + /* Cleanup the over mapped high alias */ + cleanup_highmap(); + ++ max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; ++ + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { + #ifdef CONFIG_EARLY_PRINTK + set_intr_gate(i, &early_idt_handlers[i]); +diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c +index c771e1a..8b970b8 100644 +--- a/arch/x86/kernel/hpet.c ++++ b/arch/x86/kernel/hpet.c +@@ -98,7 +98,7 @@ static int __init hpet_setup(char *str) + } + __setup("hpet=", hpet_setup); + +-static int __init disable_hpet(char *str) ++int __init disable_hpet(char *str) + { + boot_hpet_disable = 1; + return 1; +diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c +index 99c4d30..919c1a8 100644 +--- a/arch/x86/kernel/ioport.c ++++ b/arch/x86/kernel/ioport.c +@@ -30,13 +30,29 @@ static void set_bitmap(unsigned long *bitmap, unsigned int base, + } + } + ++void native_set_io_bitmap(struct thread_struct *t, ++ unsigned long bytes_updated) ++{ ++ struct tss_struct *tss; ++ ++ if (!bytes_updated) ++ return; ++ ++ tss = 
&__get_cpu_var(init_tss);
++
++ /* Update the TSS: */
++ if (t->io_bitmap_ptr)
++ memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
++ else
++ memset(tss->io_bitmap, 0xff, bytes_updated);
++}
++
+ /*
+ * this changes the io permissions bitmap in the current task.
+ */
+ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+ {
+ struct thread_struct *t = &current->thread;
+- struct tss_struct *tss;
+ unsigned int i, max_long, bytes, bytes_updated;
+
+ if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
+@@ -61,13 +77,13 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+ }
+
+ /*
+- * do it in the per-thread copy and in the TSS ...
++ * do it in the per-thread copy
+ *
+- * Disable preemption via get_cpu() - we must not switch away
++ * Disable preemption - we must not switch away
+ * because the ->io_bitmap_max value must match the bitmap
+ * contents:
+ */
+- tss = &per_cpu(init_tss, get_cpu());
++ preempt_disable();
+
+ set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
+
+@@ -85,10 +101,9 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+
+ t->io_bitmap_max = bytes;
+
+- /* Update the TSS: */
+- memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
++ set_io_bitmap(t, bytes_updated);
+
+- put_cpu();
++ preempt_enable();
+
+ return 0;
+ }
+@@ -119,11 +134,10 @@ static int do_iopl(unsigned int level, struct pt_regs *regs)
+ return 0;
+ }
+
+-#ifdef CONFIG_X86_32
+-long sys_iopl(struct pt_regs *regs)
++asmlinkage long sys_iopl(unsigned int level)
+ {
+- unsigned int level = regs->bx;
+ struct thread_struct *t = &current->thread;
++ struct pt_regs *regs = task_pt_regs(current);
+ int rc;
+
+ rc = do_iopl(level, regs);
+@@ -135,9 +149,3 @@ long sys_iopl(struct pt_regs *regs)
+ out:
+ return rc;
+ }
+-#else
+-asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
+-{
+- return do_iopl(level, regs);
+-}
+-#endif
+diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
+index ec6ef60..fa5b061 100644
+--- a/arch/x86/kernel/ldt.c
++++ b/arch/x86/kernel/ldt.c
+@@ -109,6 +109,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+
+ mutex_init(&mm->context.lock);
+ mm->context.size = 0;
++#ifdef CONFIG_XEN
++ mm->context.has_foreign_mappings = 0;
++#endif
+ old_mm = current->mm;
+ if (old_mm && old_mm->context.size > 0) {
+ mutex_lock(&old_mm->context.lock);
+diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
+index 378e9a8..86ca771 100644
+--- a/arch/x86/kernel/microcode_core.c
++++ b/arch/x86/kernel/microcode_core.c
+@@ -81,6 +81,8 @@
+ #include <linux/fs.h>
+ #include <linux/mm.h>
+
++#include <xen/xen.h>
++#include <asm/xen/hypervisor.h>
+ #include <asm/microcode.h>
+ #include <asm/processor.h>
+
+@@ -503,7 +505,9 @@ static int __init microcode_init(void)
+ struct cpuinfo_x86 *c = &cpu_data(0);
+ int error;
+
+- if (c->x86_vendor == X86_VENDOR_INTEL)
++ if (xen_pv_domain())
++ microcode_ops = init_xen_microcode();
++ else if (c->x86_vendor == X86_VENDOR_INTEL)
+ microcode_ops = init_intel_microcode();
+ else if (c->x86_vendor == X86_VENDOR_AMD)
+ microcode_ops = init_amd_microcode();
+diff --git a/arch/x86/kernel/microcode_xen.c b/arch/x86/kernel/microcode_xen.c
+new file mode 100644
+index 0000000..16c742e
+--- /dev/null
++++ b/arch/x86/kernel/microcode_xen.c
+@@ -0,0 +1,201 @@
++/*
++ * Xen microcode update driver
++ *
++ * Xen does most of the work here. 
We just pass the whole blob into ++ * Xen, and it will apply it to all CPUs as appropriate. Xen will ++ * worry about how different CPU models are actually updated. ++ */ ++#include <linux/sched.h> ++#include <linux/module.h> ++#include <linux/firmware.h> ++#include <linux/vmalloc.h> ++#include <linux/uaccess.h> ++ ++#include <asm/microcode.h> ++ ++#include <xen/xen.h> ++#include <xen/interface/platform.h> ++#include <xen/interface/xen.h> ++ ++#include <asm/xen/hypercall.h> ++#include <asm/xen/hypervisor.h> ++ ++MODULE_DESCRIPTION("Xen microcode update driver"); ++MODULE_LICENSE("GPL"); ++ ++struct xen_microcode { ++ size_t len; ++ char data[0]; ++}; ++ ++static int xen_microcode_update(int cpu) ++{ ++ int err; ++ struct xen_platform_op op; ++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu; ++ struct xen_microcode *uc = uci->mc; ++ ++ if (uc == NULL || uc->len == 0) { ++ /* ++ * We do all cpus at once, so we don't need to do ++ * other cpus explicitly (besides, these vcpu numbers ++ * have no relationship to underlying physical cpus). ++ */ ++ return 0; ++ } ++ ++ op.cmd = XENPF_microcode_update; ++ set_xen_guest_handle(op.u.microcode.data, uc->data); ++ op.u.microcode.length = uc->len; ++ ++ err = HYPERVISOR_dom0_op(&op); ++ ++ if (err != 0) ++ printk(KERN_WARNING "microcode_xen: microcode update failed: %d\n", err); ++ ++ return err; ++} ++ ++static enum ucode_state xen_request_microcode_fw(int cpu, struct device *device) ++{ ++ char name[30]; ++ struct cpuinfo_x86 *c = &cpu_data(cpu); ++ const struct firmware *firmware; ++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu; ++ enum ucode_state ret; ++ struct xen_microcode *uc; ++ size_t size; ++ int err; ++ ++ switch (c->x86_vendor) { ++ case X86_VENDOR_INTEL: ++ snprintf(name, sizeof(name), "intel-ucode/%02x-%02x-%02x", ++ c->x86, c->x86_model, c->x86_mask); ++ break; ++ ++ case X86_VENDOR_AMD: ++ snprintf(name, sizeof(name), "amd-ucode/microcode_amd.bin"); ++ break; ++ ++ default: ++ return UCODE_NFOUND; ++ } ++ ++ err = request_firmware(&firmware, name, device); ++ if (err) { ++ pr_debug("microcode: data file %s load failed\n", name); ++ return UCODE_NFOUND; ++ } ++ ++ /* ++ * Only bother getting real firmware for cpu 0; the others get ++ * dummy placeholders. ++ */ ++ if (cpu == 0) ++ size = firmware->size; ++ else ++ size = 0; ++ ++ if (uci->mc != NULL) { ++ vfree(uci->mc); ++ uci->mc = NULL; ++ } ++ ++ ret = UCODE_ERROR; ++ uc = vmalloc(sizeof(*uc) + size); ++ if (uc == NULL) ++ goto out; ++ ++ ret = UCODE_OK; ++ uc->len = size; ++ memcpy(uc->data, firmware->data, uc->len); ++ ++ uci->mc = uc; ++ ++out: ++ release_firmware(firmware); ++ ++ return ret; ++} ++ ++static enum ucode_state xen_request_microcode_user(int cpu, ++ const void __user *buf, size_t size) ++{ ++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu; ++ struct xen_microcode *uc; ++ enum ucode_state ret; ++ size_t unread; ++ ++ if (cpu != 0) { ++ /* No real firmware for non-zero cpus; just store a ++ placeholder */ ++ size = 0; ++ } ++ ++ if (uci->mc != NULL) { ++ vfree(uci->mc); ++ uci->mc = NULL; ++ } ++ ++ ret = UCODE_ERROR; ++ uc = vmalloc(sizeof(*uc) + size); ++ if (uc == NULL) ++ goto out; ++ ++ uc->len = size; ++ ++ ret = UCODE_NFOUND; ++ ++ /* XXX This sporadically returns uncopied bytes, so we return ++ EFAULT. As far as I can see, the usermode code ++ (microcode_ctl) isn't doing anything wrong... 
*/
++ unread = copy_from_user(uc->data, buf, size);
++
++ if (unread != 0) {
++ printk(KERN_WARNING "failed to read %zd of %zd bytes at %p -> %p\n",
++ unread, size, buf, uc->data);
++ goto out;
++ }
++
++ ret = UCODE_OK;
++
++out:
++ if (ret == 0)
++ uci->mc = uc;
++ else
++ vfree(uc);
++
++ return ret;
++}
++
++static void xen_microcode_fini_cpu(int cpu)
++{
++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
++
++ vfree(uci->mc);
++ uci->mc = NULL;
++}
++
++static int xen_collect_cpu_info(int cpu, struct cpu_signature *sig)
++{
++ sig->sig = 0;
++ sig->pf = 0;
++ sig->rev = 0;
++
++ return 0;
++}
++
++static struct microcode_ops microcode_xen_ops = {
++ .request_microcode_user = xen_request_microcode_user,
++ .request_microcode_fw = xen_request_microcode_fw,
++ .collect_cpu_info = xen_collect_cpu_info,
++ .apply_microcode = xen_microcode_update,
++ .microcode_fini_cpu = xen_microcode_fini_cpu,
++};
++
++struct microcode_ops * __init init_xen_microcode(void)
++{
++ if (!xen_initial_domain())
++ return NULL;
++ return &microcode_xen_ops;
++}
+diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
+index 1b1739d..f7e115c 100644
+--- a/arch/x86/kernel/paravirt.c
++++ b/arch/x86/kernel/paravirt.c
+@@ -376,6 +376,7 @@ struct pv_cpu_ops pv_cpu_ops = {
+ .swapgs = native_swapgs,
+
+ .set_iopl_mask = native_set_iopl_mask,
++ .set_io_bitmap = native_set_io_bitmap,
+ .io_delay = native_io_delay,
+
+ .start_context_switch = paravirt_nop,
+diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
+index 1a2d4b1..2f158a5 100644
+--- a/arch/x86/kernel/pci-calgary_64.c
++++ b/arch/x86/kernel/pci-calgary_64.c
+@@ -46,6 +46,7 @@
+ #include <asm/dma.h>
+ #include <asm/rio.h>
+ #include <asm/bios_ebda.h>
++#include <asm/x86_init.h>
+
+ #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
+ int use_calgary __read_mostly = 1;
+@@ -249,7 +250,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
+ if (panic_on_overflow)
+ panic("Calgary: fix the allocator.\n");
+ else
+- return bad_dma_address;
++ return DMA_ERROR_CODE;
+ }
+ }
+
+@@ -265,11 +266,11 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
+ void *vaddr, unsigned int npages, int direction)
+ {
+ unsigned long entry;
+- dma_addr_t ret = bad_dma_address;
++ dma_addr_t ret = DMA_ERROR_CODE;
+
+ entry = iommu_range_alloc(dev, tbl, npages);
+
+- if (unlikely(entry == bad_dma_address))
++ if (unlikely(entry == DMA_ERROR_CODE))
+ goto error;
+
+ /* set the return dma address */
+@@ -284,7 +285,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
+ error:
+ printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
+ "iommu %p\n", npages, tbl);
+- return bad_dma_address;
++ return DMA_ERROR_CODE;
+ }
+
+ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+@@ -295,8 +296,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+ unsigned long flags;
+
+ /* were we called with bad_dma_address?
*/ +- badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); +- if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { ++ badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE); ++ if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) { + WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " + "address 0x%Lx\n", dma_addr); + return; +@@ -380,7 +381,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, + npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); + + entry = iommu_range_alloc(dev, tbl, npages); +- if (entry == bad_dma_address) { ++ if (entry == DMA_ERROR_CODE) { + /* makes sure unmap knows to stop */ + s->dma_length = 0; + goto error; +@@ -398,7 +399,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, + error: + calgary_unmap_sg(dev, sg, nelems, dir, NULL); + for_each_sg(sg, s, nelems, i) { +- sg->dma_address = bad_dma_address; ++ sg->dma_address = DMA_ERROR_CODE; + sg->dma_length = 0; + } + return 0; +@@ -453,7 +454,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, + + /* set up tces to cover the allocated range */ + mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); +- if (mapping == bad_dma_address) ++ if (mapping == DMA_ERROR_CODE) + goto free; + *dma_handle = mapping; + return ret; +@@ -734,7 +735,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev) + struct iommu_table *tbl = pci_iommu(dev->bus); + + /* reserve EMERGENCY_PAGES from bad_dma_address and up */ +- iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); ++ iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES); + + /* avoid the BIOS/VGA first 640KB-1MB region */ + /* for CalIOC2 - avoid the entire first MB */ +@@ -1349,6 +1350,23 @@ static void __init get_tce_space_from_tar(void) + return; + } + ++static int __init calgary_iommu_init(void) ++{ ++ int ret; ++ ++ /* ok, we're trying to use Calgary - let's roll */ ++ printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); ++ ++ ret = calgary_init(); ++ if (ret) { ++ printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " ++ "falling back to no_iommu\n", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ + void __init detect_calgary(void) + { + int bus; +@@ -1362,7 +1380,7 @@ void __init detect_calgary(void) + * if the user specified iommu=off or iommu=soft or we found + * another HW IOMMU already, bail out. + */ +- if (swiotlb || no_iommu || iommu_detected) ++ if (no_iommu || iommu_detected) + return; + + if (!use_calgary) +@@ -1447,9 +1465,7 @@ void __init detect_calgary(void) + printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", + specified_table_size); + +- /* swiotlb for devices that aren't behind the Calgary. 
*/ +- if (max_pfn > MAX_DMA32_PFN) +- swiotlb = 1; ++ x86_init.iommu.iommu_init = calgary_iommu_init; + } + return; + +@@ -1462,35 +1478,6 @@ cleanup: + } + } + +-int __init calgary_iommu_init(void) +-{ +- int ret; +- +- if (no_iommu || (swiotlb && !calgary_detected)) +- return -ENODEV; +- +- if (!calgary_detected) +- return -ENODEV; +- +- /* ok, we're trying to use Calgary - let's roll */ +- printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); +- +- ret = calgary_init(); +- if (ret) { +- printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " +- "falling back to no_iommu\n", ret); +- return ret; +- } +- +- force_iommu = 1; +- bad_dma_address = 0x0; +- /* dma_ops is set to swiotlb or nommu */ +- if (!dma_ops) +- dma_ops = &nommu_dma_ops; +- +- return 0; +-} +- + static int __init calgary_parse_options(char *p) + { + unsigned int bridge; +diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c +index 6ac3931..3e57c58 100644 +--- a/arch/x86/kernel/pci-dma.c ++++ b/arch/x86/kernel/pci-dma.c +@@ -11,10 +11,12 @@ + #include <asm/gart.h> + #include <asm/calgary.h> + #include <asm/amd_iommu.h> ++#include <asm/x86_init.h> ++#include <asm/xen/swiotlb-xen.h> + + static int forbid_dac __read_mostly; + +-struct dma_map_ops *dma_ops; ++struct dma_map_ops *dma_ops = &nommu_dma_ops; + EXPORT_SYMBOL(dma_ops); + + static int iommu_sac_force __read_mostly; +@@ -42,9 +44,6 @@ int iommu_detected __read_mostly = 0; + */ + int iommu_pass_through __read_mostly; + +-dma_addr_t bad_dma_address __read_mostly = 0; +-EXPORT_SYMBOL(bad_dma_address); +- + /* Dummy device used for NULL arguments (normally ISA). */ + struct device x86_dma_fallback_dev = { + .init_name = "fallback device", +@@ -126,18 +125,19 @@ void __init pci_iommu_alloc(void) + /* free the range so iommu could get some range less than 4G */ + dma32_free_bootmem(); + #endif ++ if (pci_xen_swiotlb_detect() || pci_swiotlb_detect()) ++ goto out; + +- /* +- * The order of these functions is important for +- * fall-back/fail-over reasons +- */ + gart_iommu_hole_init(); + + detect_calgary(); + + detect_intel_iommu(); + ++ /* needs to be called after gart_iommu_hole_init */ + amd_iommu_detect(); ++out: ++ pci_xen_swiotlb_init(); + + pci_swiotlb_init(); + } +@@ -289,25 +289,17 @@ static int __init pci_iommu_init(void) + #ifdef CONFIG_PCI + dma_debug_add_bus(&pci_bus_type); + #endif ++ x86_init.iommu.iommu_init(); + +- calgary_iommu_init(); +- +- intel_iommu_init(); +- +- amd_iommu_init(); ++ if (swiotlb || xen_swiotlb) { ++ printk(KERN_INFO "PCI-DMA: " ++ "Using software bounce buffering for IO (SWIOTLB)\n"); ++ swiotlb_print_info(); ++ } else ++ swiotlb_free(); + +- gart_iommu_init(); +- +- no_iommu_init(); + return 0; + } +- +-void pci_iommu_shutdown(void) +-{ +- gart_iommu_shutdown(); +- +- amd_iommu_shutdown(); +-} + /* Must execute after PCI subsystem */ + rootfs_initcall(pci_iommu_init); + +diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c +index 1c76691..8c9dd05 100644 +--- a/arch/x86/kernel/pci-gart_64.c ++++ b/arch/x86/kernel/pci-gart_64.c +@@ -39,6 +39,7 @@ + #include <asm/swiotlb.h> + #include <asm/dma.h> + #include <asm/k8.h> ++#include <asm/x86_init.h> + + static unsigned long iommu_bus_base; /* GART remapping area (physical) */ + static unsigned long iommu_size; /* size of remapping area bytes */ +@@ -46,6 +47,8 @@ static unsigned long iommu_pages; /* .. 
and in pages */
+
+ static u32 *iommu_gatt_base; /* Remapping table */
+
++static dma_addr_t bad_dma_addr;
++
+ /*
+ * If this is disabled the IOMMU will use an optimized flushing strategy
+ * of only flushing when a mapping is reused. With it true the GART is
+@@ -216,7 +219,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
+ if (panic_on_overflow)
+ panic("dma_map_area overflow %lu bytes\n", size);
+ iommu_full(dev, size, dir);
+- return bad_dma_address;
++ return bad_dma_addr;
+ }
+
+ for (i = 0; i < npages; i++) {
+@@ -302,7 +305,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
+
+ if (nonforced_iommu(dev, addr, s->length)) {
+ addr = dma_map_area(dev, addr, s->length, dir, 0);
+- if (addr == bad_dma_address) {
++ if (addr == bad_dma_addr) {
+ if (i > 0)
+ gart_unmap_sg(dev, sg, i, dir, NULL);
+ nents = 0;
+@@ -455,7 +458,7 @@ error:
+
+ iommu_full(dev, pages << PAGE_SHIFT, dir);
+ for_each_sg(sg, s, nents, i)
+- s->dma_address = bad_dma_address;
++ s->dma_address = bad_dma_addr;
+ return 0;
+ }
+
+@@ -479,7 +482,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
+ DMA_BIDIRECTIONAL, align_mask);
+
+ flush_gart();
+- if (paddr != bad_dma_address) {
++ if (paddr != bad_dma_addr) {
+ *dma_addr = paddr;
+ return page_address(page);
+ }
+@@ -499,6 +502,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr,
+ free_pages((unsigned long)vaddr, get_order(size));
+ }
+
++static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
++{
++ return (dma_addr == bad_dma_addr);
++}
++
+ static int no_agp;
+
+ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
+@@ -689,14 +697,15 @@ static struct dma_map_ops gart_dma_ops = {
+ .unmap_page = gart_unmap_page,
+ .alloc_coherent = gart_alloc_coherent,
+ .free_coherent = gart_free_coherent,
++ .mapping_error = gart_mapping_error,
+ };
+
+-void gart_iommu_shutdown(void)
++static void gart_iommu_shutdown(void)
+ {
+ struct pci_dev *dev;
+ int i;
+
+- if (no_agp && (dma_ops != &gart_dma_ops))
++ if (no_agp)
+ return;
+
+ for (i = 0; i < num_k8_northbridges; i++) {
+@@ -711,7 +720,7 @@ void gart_iommu_shutdown(void)
+ }
+ }
+
+-void __init gart_iommu_init(void)
++int __init gart_iommu_init(void)
+ {
+ struct agp_kern_info info;
+ unsigned long iommu_start;
+@@ -721,7 +730,7 @@ void __init gart_iommu_init(void)
+ long i;
+
+ if (num_k8_northbridges == 0)
+- return;
++ return 0;
+
+ #ifndef CONFIG_AGP_AMD64
+ no_agp = 1;
+@@ -733,13 +742,6 @@ void __init gart_iommu_init(void)
+ (agp_copy_info(agp_bridge, &info) < 0);
+ #endif
+
+- if (swiotlb)
+- return;
+-
+- /* Did we detect a different HW IOMMU?
*/ +- if (iommu_detected && !gart_iommu_aperture) +- return; +- + if (no_iommu || + (!force_iommu && max_pfn <= MAX_DMA32_PFN) || + !gart_iommu_aperture || +@@ -749,7 +751,7 @@ void __init gart_iommu_init(void) + "but GART IOMMU not available.\n"); + printk(KERN_WARNING "falling back to iommu=soft.\n"); + } +- return; ++ return 0; + } + + /* need to map that range */ +@@ -794,7 +796,7 @@ void __init gart_iommu_init(void) + + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; +- bad_dma_address = iommu_bus_base; ++ bad_dma_addr = iommu_bus_base; + iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); + + /* +@@ -841,6 +843,10 @@ void __init gart_iommu_init(void) + + flush_gart(); + dma_ops = &gart_dma_ops; ++ x86_platform.iommu_shutdown = gart_iommu_shutdown; ++ swiotlb = 0; ++ ++ return 0; + } + + void __init gart_parse_options(char *p) +diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c +index a3933d4..22be12b 100644 +--- a/arch/x86/kernel/pci-nommu.c ++++ b/arch/x86/kernel/pci-nommu.c +@@ -33,7 +33,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, + dma_addr_t bus = page_to_phys(page) + offset; + WARN_ON(size == 0); + if (!check_addr("map_single", dev, bus, size)) +- return bad_dma_address; ++ return DMA_ERROR_CODE; + flush_write_buffers(); + return bus; + } +@@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = { + .sync_sg_for_device = nommu_sync_sg_for_device, + .is_phys = 1, + }; +- +-void __init no_iommu_init(void) +-{ +- if (dma_ops) +- return; +- +- force_iommu = 0; /* no HW IOMMU */ +- dma_ops = &nommu_dma_ops; +-} +diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c +index aaa6b78..7d2829d 100644 +--- a/arch/x86/kernel/pci-swiotlb.c ++++ b/arch/x86/kernel/pci-swiotlb.c +@@ -42,18 +42,31 @@ static struct dma_map_ops swiotlb_dma_ops = { + .dma_supported = NULL, + }; + +-void __init pci_swiotlb_init(void) ++/* ++ * pci_swiotlb_detect - set swiotlb to 1 if necessary ++ * ++ * This returns non-zero if we are forced to use swiotlb (by the boot ++ * option). 
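++ * Note that use_swiotlb samples swiotlb before the MAX_DMA32_PFN
++ * check below can set it, so a swiotlb enabled only as a >4GB
++ * fallback does not count as "forced" here and the hardware IOMMU
++ * detectors in pci_iommu_alloc() still get a chance to run.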
++ */ ++int __init pci_swiotlb_detect(void) + { ++ int use_swiotlb = swiotlb | swiotlb_force; ++ + /* don't initialize swiotlb if iommu=off (no_iommu=1) */ + #ifdef CONFIG_X86_64 +- if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) ++ if (!no_iommu && max_pfn > MAX_DMA32_PFN) + swiotlb = 1; + #endif + if (swiotlb_force) + swiotlb = 1; ++ ++ return use_swiotlb; ++} ++ ++void __init pci_swiotlb_init(void) ++{ + if (swiotlb) { +- printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); +- swiotlb_init(); ++ swiotlb_init(0); + dma_ops = &swiotlb_dma_ops; + } + } +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index 5fd5b07..11d8667 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -73,16 +73,12 @@ void exit_thread(void) + unsigned long *bp = t->io_bitmap_ptr; + + if (bp) { +- struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); +- ++ preempt_disable(); + t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); +- /* +- * Careful, clear this in the TSS too: +- */ +- memset(tss->io_bitmap, 0xff, t->io_bitmap_max); ++ set_io_bitmap(t, t->io_bitmap_max); + t->io_bitmap_max = 0; +- put_cpu(); ++ preempt_enable(); + kfree(bp); + } + } +@@ -199,19 +195,10 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + hard_enable_TSC(); + } + +- if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { +- /* +- * Copy the relevant range of the IO bitmap. +- * Normally this is 128 bytes or less: +- */ +- memcpy(tss->io_bitmap, next->io_bitmap_ptr, +- max(prev->io_bitmap_max, next->io_bitmap_max)); +- } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { +- /* +- * Clear any possible leftover bits: +- */ +- memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); +- } ++ if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP) || ++ test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) ++ set_io_bitmap(next, ++ max(prev->io_bitmap_max, next->io_bitmap_max)); + } + + int sys_fork(struct pt_regs *regs) +diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c +index dfdfe46..b12fe8d 100644 +--- a/arch/x86/kernel/pvclock.c ++++ b/arch/x86/kernel/pvclock.c +@@ -111,6 +111,11 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) + + static atomic64_t last_value = ATOMIC64_INIT(0); + ++void pvclock_resume(void) ++{ ++ atomic64_set(&last_value, 0); ++} ++ + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) + { + struct pvclock_shadow_time shadow; +diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c +index 200fcde..ff8cc40 100644 +--- a/arch/x86/kernel/reboot.c ++++ b/arch/x86/kernel/reboot.c +@@ -23,7 +23,7 @@ + # include <linux/ctype.h> + # include <linux/mc146818rtc.h> + #else +-# include <asm/iommu.h> ++# include <asm/x86_init.h> + #endif + + /* +@@ -647,7 +647,7 @@ void native_machine_shutdown(void) + #endif + + #ifdef CONFIG_X86_64 +- pci_iommu_shutdown(); ++ x86_platform.iommu_shutdown(); + #endif + } + +diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c +index 5449a26..56b4707 100644 +--- a/arch/x86/kernel/setup.c ++++ b/arch/x86/kernel/setup.c +@@ -70,6 +70,7 @@ + #include <linux/tboot.h> + + #include <video/edid.h> ++#include <xen/xen.h> + + #include <asm/mtrr.h> + #include <asm/apic.h> +@@ -89,6 +90,7 @@ + #include <asm/cacheflush.h> + #include <asm/processor.h> + #include <asm/bugs.h> ++#include <asm/tlbflush.h> + + #include <asm/system.h> + #include <asm/vsyscall.h> +@@ -909,7 +911,6 @@ void __init setup_arch(char **cmdline_p) + max_low_pfn = 
max_pfn; + + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; +- max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; + #endif + + #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION +@@ -967,6 +968,9 @@ void __init setup_arch(char **cmdline_p) + + initmem_init(0, max_pfn); + ++ /* Initialize cross-cpu tlb flushes */ ++ init_smp_flush(); ++ + #ifdef CONFIG_ACPI_SLEEP + /* + * Reserve low memory region for sleep support. +@@ -1037,6 +1041,7 @@ void __init setup_arch(char **cmdline_p) + probe_nr_irqs_gsi(); + + kvm_guest_init(); ++ xen_hvm_guest_init(); + + e820_reserve_resources(); + e820_mark_nosave_regions(max_low_pfn); +diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c +index 4449a4a..d11c5ff 100644 +--- a/arch/x86/kernel/x86_init.c ++++ b/arch/x86/kernel/x86_init.c +@@ -14,10 +14,13 @@ + #include <asm/time.h> + #include <asm/irq.h> + #include <asm/tsc.h> ++#include <asm/iommu.h> + + void __cpuinit x86_init_noop(void) { } + void __init x86_init_uint_noop(unsigned int unused) { } + void __init x86_init_pgd_noop(pgd_t *unused) { } ++int __init iommu_init_noop(void) { return 0; } ++void iommu_shutdown_noop(void) { } + + /* + * The platform setup functions are preset with the default functions +@@ -62,6 +65,10 @@ struct x86_init_ops x86_init __initdata = { + .tsc_pre_init = x86_init_noop, + .timer_init = hpet_time_init, + }, ++ ++ .iommu = { ++ .iommu_init = iommu_init_noop, ++ }, + }; + + struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { +@@ -72,4 +79,5 @@ struct x86_platform_ops x86_platform = { + .calibrate_tsc = native_calibrate_tsc, + .get_wallclock = mach_get_cmos_time, + .set_wallclock = mach_set_rtc_mmss, ++ .iommu_shutdown = iommu_shutdown_noop, + }; +diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile +index 06630d2..ad895ae 100644 +--- a/arch/x86/mm/Makefile ++++ b/arch/x86/mm/Makefile +@@ -6,6 +6,11 @@ nostackp := $(call cc-option, -fno-stack-protector) + CFLAGS_physaddr.o := $(nostackp) + CFLAGS_setup_nx.o := $(nostackp) + ++# Make sure __phys_addr has no stackprotector ++nostackp := $(call cc-option, -fno-stack-protector) ++CFLAGS_ioremap.o := $(nostackp) ++CFLAGS_init.o := $(nostackp) ++ + obj-$(CONFIG_SMP) += tlb.o + + obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o +diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c +index 1739358..e003b83 100644 +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -228,7 +228,16 @@ void vmalloc_sync_all(void) + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { +- if (!vmalloc_sync_one(page_address(page), address)) ++ spinlock_t *pgt_lock; ++ int ret; ++ ++ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; ++ ++ spin_lock(pgt_lock); ++ ret = vmalloc_sync_one(page_address(page), address); ++ spin_unlock(pgt_lock); ++ ++ if (!ret) + break; + } + spin_unlock_irqrestore(&pgd_lock, flags); +@@ -340,11 +349,19 @@ void vmalloc_sync_all(void) + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; ++ spinlock_t *pgt_lock; ++ + pgd = (pgd_t *)page_address(page) + pgd_index(address); ++ ++ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; ++ spin_lock(pgt_lock); ++ + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); ++ ++ spin_unlock(pgt_lock); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c +index 71da1bc..892b8eb 100644 +--- a/arch/x86/mm/gup.c ++++ b/arch/x86/mm/gup.c +@@ -313,6 +313,11 @@ int get_user_pages_fast(unsigned 
long start, int nr_pages, int write, + goto slow_irqon; + #endif + ++#ifdef CONFIG_XEN ++ if (unlikely(mm->context.has_foreign_mappings)) ++ goto slow_irqon; ++#endif ++ + /* + * XXX: batch / limit 'nr', to avoid large irq off latency + * needs some instrumenting to determine the common sizes used by +diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c +index 30938c1..10c3719 100644 +--- a/arch/x86/mm/init_32.c ++++ b/arch/x86/mm/init_32.c +@@ -430,22 +430,45 @@ static int __init add_highpages_work_fn(unsigned long start_pfn, + { + int node_pfn; + struct page *page; ++ phys_addr_t chunk_end, chunk_max; + unsigned long final_start_pfn, final_end_pfn; +- struct add_highpages_data *data; +- +- data = (struct add_highpages_data *)datax; ++ struct add_highpages_data *data = (struct add_highpages_data *)datax; + + final_start_pfn = max(start_pfn, data->start_pfn); + final_end_pfn = min(end_pfn, data->end_pfn); + if (final_start_pfn >= final_end_pfn) + return 0; + +- for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; +- node_pfn++) { +- if (!pfn_valid(node_pfn)) +- continue; +- page = pfn_to_page(node_pfn); +- add_one_highpage_init(page, node_pfn); ++ chunk_end = PFN_PHYS(final_start_pfn); ++ chunk_max = PFN_PHYS(final_end_pfn); ++ ++ /* ++ * Check for reserved areas. ++ */ ++ for (;;) { ++ phys_addr_t chunk_start; ++ chunk_start = early_res_next_free(chunk_end); ++ ++ /* ++ * Reserved area. Just count high mem pages. ++ */ ++ for (node_pfn = PFN_DOWN(chunk_end); ++ node_pfn < PFN_DOWN(chunk_start); node_pfn++) { ++ if (pfn_valid(node_pfn)) ++ totalhigh_pages++; ++ } ++ ++ if (chunk_start >= chunk_max) ++ break; ++ ++ chunk_end = early_res_next_reserved(chunk_start, chunk_max); ++ for (node_pfn = PFN_DOWN(chunk_start); ++ node_pfn < PFN_DOWN(chunk_end); node_pfn++) { ++ if (!pfn_valid(node_pfn)) ++ continue; ++ page = pfn_to_page(node_pfn); ++ add_one_highpage_init(page, node_pfn); ++ } + } + + return 0; +@@ -459,7 +482,6 @@ void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, + + data.start_pfn = start_pfn; + data.end_pfn = end_pfn; +- + work_with_active_regions(nid, add_highpages_work_fn, &data); + } + +diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c +index 2feb9bd..2601df2 100644 +--- a/arch/x86/mm/ioremap.c ++++ b/arch/x86/mm/ioremap.c +@@ -425,6 +425,11 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr) + return &bm_pte[pte_index(addr)]; + } + ++bool __init is_early_ioremap_ptep(pte_t *ptep) ++{ ++ return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)]; ++} ++ + static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; + + void __init early_ioremap_init(void) +diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c +index e78cd0e..fb91994 100644 +--- a/arch/x86/mm/pat.c ++++ b/arch/x86/mm/pat.c +@@ -666,7 +666,7 @@ void io_free_memtype(resource_size_t start, resource_size_t end) + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot) + { +- return vma_prot; ++ return __pgprot(pgprot_val(vma_prot) | _PAGE_IOMAP); + } + + #ifdef CONFIG_STRICT_DEVMEM +diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c +index c9ba9de..1fcc191 100644 +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -4,6 +4,9 @@ + #include <asm/tlb.h> + #include <asm/fixmap.h> + ++#include <xen/xen.h> ++#include <asm/xen/hypervisor.h> ++ + #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO + + #ifdef CONFIG_HIGHPTE +@@ -14,6 +17,16 @@ + + gfp_t 
__userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; + ++pgprot_t arch_vm_get_page_prot(unsigned vm_flags) ++{ ++ pgprot_t ret = __pgprot(0); ++ ++ if (vm_flags & VM_IO) ++ ret = __pgprot(_PAGE_IOMAP); ++ ++ return ret; ++} ++ + pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) + { + return (pte_t *)__get_free_page(PGALLOC_GFP); +@@ -86,7 +99,19 @@ static inline void pgd_list_del(pgd_t *pgd) + #define UNSHARED_PTRS_PER_PGD \ + (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) + +-static void pgd_ctor(pgd_t *pgd) ++ ++static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) ++{ ++ BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm)); ++ virt_to_page(pgd)->index = (pgoff_t)mm; ++} ++ ++struct mm_struct *pgd_page_get_mm(struct page *page) ++{ ++ return (struct mm_struct *)page->index; ++} ++ ++static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) + { + /* If the pgd points to a shared pagetable level (either the + ptes in non-PAE, or shared PMD in PAE), then just copy the +@@ -104,8 +129,10 @@ static void pgd_ctor(pgd_t *pgd) + } + + /* list required to sync kernel mapping updates */ +- if (!SHARED_KERNEL_PMD) ++ if (!SHARED_KERNEL_PMD) { ++ pgd_set_mm(pgd, mm); + pgd_list_add(pgd); ++ } + } + + static void pgd_dtor(pgd_t *pgd) +@@ -271,7 +298,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) + */ + spin_lock_irqsave(&pgd_lock, flags); + +- pgd_ctor(pgd); ++ pgd_ctor(mm, pgd); + pgd_prepopulate_pmd(mm, pgd, pmds); + + spin_unlock_irqrestore(&pgd_lock, flags); +@@ -288,6 +315,12 @@ out: + + void pgd_free(struct mm_struct *mm, pgd_t *pgd) + { ++#ifdef CONFIG_XEN ++ /* EEW */ ++ extern void xen_late_unpin_pgd(struct mm_struct *mm, pgd_t *pgd); ++ if (xen_pv_domain()) ++ xen_late_unpin_pgd(mm, pgd); ++#endif + pgd_mop_up_pmds(mm, pgd); + pgd_dtor(pgd); + paravirt_pgd_free(mm, pgd); +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 36fe08e..7317947 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -148,13 +148,25 @@ void smp_invalidate_interrupt(struct pt_regs *regs) + * BUG(); + */ + +- if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) { +- if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { ++ if (f->flush_mm == NULL || ++ f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) { ++ int tlbstate = percpu_read(cpu_tlbstate.state); ++ ++ /* ++ * flush_mm == NULL means flush everything, including ++ * global tlbs, which will only happen when flushing ++ * kernel mappings. 
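++ * For example, the reworked flush_tlb_all() below passes
++ * flush_mm == NULL through flush_tlb_others(), so every CPU that
++ * receives the IPI runs __flush_tlb_all(); a process-private flush
++ * passes its mm instead, and CPUs holding it only lazily just
++ * leave_mm().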
++ */ ++ if (f->flush_mm == NULL) ++ __flush_tlb_all(); ++ else if (tlbstate == TLBSTATE_OK) { + if (f->flush_va == TLB_FLUSH_ALL) + local_flush_tlb(); + else + __flush_tlb_one(f->flush_va); +- } else ++ } ++ ++ if (tlbstate == TLBSTATE_LAZY) + leave_mm(cpu); + } + out: +@@ -217,16 +229,13 @@ void native_flush_tlb_others(const struct cpumask *cpumask, + flush_tlb_others_ipi(cpumask, mm, va); + } + +-static int __cpuinit init_smp_flush(void) ++void __init init_smp_flush(void) + { + int i; + + for (i = 0; i < ARRAY_SIZE(flush_state); i++) + spin_lock_init(&flush_state[i].tlbstate_lock); +- +- return 0; + } +-core_initcall(init_smp_flush); + + void flush_tlb_current_task(void) + { +@@ -274,17 +283,19 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) + + preempt_enable(); + } ++EXPORT_SYMBOL_GPL(flush_tlb_page); + +-static void do_flush_tlb_all(void *info) ++void flush_tlb_all(void) + { +- unsigned long cpu = smp_processor_id(); ++ /* flush_tlb_others expects preempt to be disabled */ ++ int cpu = get_cpu(); ++ ++ flush_tlb_others(cpu_online_mask, NULL, TLB_FLUSH_ALL); + + __flush_tlb_all(); + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) + leave_mm(cpu); +-} + +-void flush_tlb_all(void) +-{ +- on_each_cpu(do_flush_tlb_all, NULL, 1); ++ put_cpu(); + } ++EXPORT_SYMBOL_GPL(flush_tlb_all); +diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile +index d49202e..64182c5 100644 +--- a/arch/x86/pci/Makefile ++++ b/arch/x86/pci/Makefile +@@ -4,6 +4,7 @@ obj-$(CONFIG_PCI_BIOS) += pcbios.o + obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o + obj-$(CONFIG_PCI_DIRECT) += direct.o + obj-$(CONFIG_PCI_OLPC) += olpc.o ++obj-$(CONFIG_PCI_XEN) += xen.o + + obj-y += fixup.o + obj-$(CONFIG_ACPI) += acpi.o +diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c +index 1331fcf..30a9808 100644 +--- a/arch/x86/pci/common.c ++++ b/arch/x86/pci/common.c +@@ -22,6 +22,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | + unsigned int pci_early_dump_regs; + static int pci_bf_sort; + int pci_routeirq; ++int pci_scan_all_fns; + int noioapicquirk; + #ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS + int noioapicreroute = 0; +@@ -412,26 +413,31 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum) + + extern u8 pci_cache_line_size; + +-int __init pcibios_init(void) ++void __init pcibios_set_cache_line_size(void) + { + struct cpuinfo_x86 *c = &boot_cpu_data; + +- if (!raw_pci_ops) { +- printk(KERN_WARNING "PCI: System does not support PCI\n"); +- return 0; +- } +- + /* + * Assume PCI cacheline size of 32 bytes for all x86s except K7/K8 + * and P4. It's also good for 386/486s (which actually have 16) + * as quite a few PCI devices do not support smaller values. 
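+ *
+ * (pci_cache_line_size is kept in units of 32-bit dwords, which is
+ * why the byte sizes below appear shifted: 32 >> 2 = 8 dwords,
+ * 64 >> 2 = 16 and 128 >> 2 = 32.)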
+ */
++
+ pci_cache_line_size = 32 >> 2;
+ if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
+ pci_cache_line_size = 64 >> 2; /* K7 & K8 */
+ else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
+ pci_cache_line_size = 128 >> 2; /* P4 */
++}
++
++int __init pcibios_init(void)
++{
++ if (!raw_pci_ops) {
++ printk(KERN_WARNING "PCI: System does not support PCI\n");
++ return 0;
++ }
+
++ pcibios_set_cache_line_size();
+ pcibios_resource_survey();
+
+ if (pci_bf_sort >= pci_force_bf)
+diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
+index a672f12..91d040e 100644
+--- a/arch/x86/pci/i386.c
++++ b/arch/x86/pci/i386.c
+@@ -283,6 +283,8 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+
+ prot = pgprot_val(vma->vm_page_prot);
+
++ prot |= _PAGE_IOMAP; /* creating a mapping for IO */
++
+ /*
+ * Return error if pat is not enabled and write_combine is requested.
+ * Caller can follow up with UC MINUS request and add a WC mtrr if there
+diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
+index 25a1f8e..4e2f90a 100644
+--- a/arch/x86/pci/init.c
++++ b/arch/x86/pci/init.c
+@@ -15,10 +15,16 @@ static __init int pci_arch_init(void)
+ if (!(pci_probe & PCI_PROBE_NOEARLY))
+ pci_mmcfg_early_init();
+
++#ifdef CONFIG_PCI_XEN
++ if (!pci_xen_init())
++ return 0;
++#endif
++
+ #ifdef CONFIG_PCI_OLPC
+ if (!pci_olpc_init())
+ return 0; /* skip additional checks if it's an XO */
+ #endif
++
+ #ifdef CONFIG_PCI_BIOS
+ pci_pcbios_init();
+ #endif
+diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
+new file mode 100644
+index 0000000..67fa926
+--- /dev/null
++++ b/arch/x86/pci/xen.c
+@@ -0,0 +1,154 @@
++/*
++ * Xen PCI Frontend Stub - puts some "dummy" functions into the Linux
++ * x86 PCI core to support the Xen PCI Frontend
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/pci.h>
++#include <linux/acpi.h>
++
++#include <asm/io.h>
++#include <asm/pci_x86.h>
++
++#include <asm/xen/hypervisor.h>
++
++#include <xen/events.h>
++#include <asm/xen/pci.h>
++
++#if defined(CONFIG_PCI_MSI)
++#include <linux/msi.h>
++
++struct xen_pci_frontend_ops *xen_pci_frontend;
++EXPORT_SYMBOL_GPL(xen_pci_frontend);
++
++/*
++ * For MSI interrupts we have to use drivers/xen/events.c functions to
++ * allocate an irq_desc and setup the right */
++
++
++int xen_pci_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
++{
++ int irq, ret, i;
++ struct msi_desc *msidesc;
++ int *v;
++
++ /* Dom0 has another mechanism for this. The exit path
++ * (xen_pci_teardown_msi_irq) is shared with Dom0.
++ */
++ if (xen_initial_domain())
++ return xen_setup_msi_irqs(dev, nvec, type);
++
++ v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL);
++ if (!v)
++ return -ENOMEM;
++
++ if (!xen_initial_domain()) {
++ if (type == PCI_CAP_ID_MSIX)
++ ret = xen_pci_frontend_enable_msix(dev, &v, nvec);
++ else
++ ret = xen_pci_frontend_enable_msi(dev, &v);
++ if (ret)
++ goto error;
++ }
++ i = 0;
++ list_for_each_entry(msidesc, &dev->msi_list, list) {
++ irq = xen_allocate_pirq(v[i], 0, /* not sharable */
++ (type == PCI_CAP_ID_MSIX) ?
++ "pcifront-msi-x":"pcifront-msi"); ++ if (irq < 0) ++ return -1; ++ ++ ret = set_irq_msi(irq, msidesc); ++ if (ret) ++ goto error_while; ++ i++; ++ } ++ kfree(v); ++ return 0; ++ ++error_while: ++ unbind_from_irqhandler(irq, NULL); ++error: ++ if (ret == -ENODEV) ++ dev_err(&dev->dev,"Xen PCI frontend has not registered" \ ++ " MSI/MSI-X support!\n"); ++ ++ kfree(v); ++ return ret; ++} ++ ++void xen_pci_teardown_msi_dev(struct pci_dev *dev) ++{ ++ /* Only do this when were are in non-privileged mode.*/ ++ if (!xen_initial_domain()) { ++ struct msi_desc *msidesc; ++ ++ msidesc = list_entry(dev->msi_list.next, struct msi_desc, list); ++ if (msidesc->msi_attrib.is_msix) ++ xen_pci_frontend_disable_msix(dev); ++ else ++ xen_pci_frontend_disable_msi(dev); ++ } ++ ++} ++ ++void xen_pci_teardown_msi_irq(int irq) ++{ ++ xen_destroy_irq(irq); ++} ++#endif ++ ++static int xen_pcifront_enable_irq(struct pci_dev *dev) ++{ ++ int rc; ++ int share = 1; ++ ++ dev_info(&dev->dev, "Xen PCI enabling IRQ: %d\n", dev->irq); ++ ++ if (dev->irq < 0) ++ return -EINVAL; ++ ++ if (dev->irq < NR_IRQS_LEGACY) ++ share = 0; ++ ++ rc = xen_allocate_pirq(dev->irq, share, "pcifront"); ++ if (rc < 0) { ++ dev_warn(&dev->dev, "Xen PCI IRQ: %d, failed to register:%d\n", ++ dev->irq, rc); ++ return rc; ++ } ++ return 0; ++} ++ ++int __init pci_xen_init(void) ++{ ++ if (!xen_pv_domain() || xen_initial_domain()) ++ return -ENODEV; ++ ++ printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n"); ++ ++ pcibios_set_cache_line_size(); ++ ++ pcibios_enable_irq = xen_pcifront_enable_irq; ++ pcibios_disable_irq = NULL; ++ ++#ifdef CONFIG_ACPI ++ /* Keep ACPI out of the picture */ ++ acpi_noirq = 1; ++#endif ++ ++#ifdef CONFIG_ISAPNP ++ /* Stop isapnp from probing */ ++ isapnp_disable = 1; ++#endif ++ ++ /* Ensure a device still gets scanned even if it's fn number ++ * is non-zero. ++ */ ++ pci_scan_all_fns = 1; ++ ++ return 0; ++} ++ +diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig +index b83e119..3f9f4a0 100644 +--- a/arch/x86/xen/Kconfig ++++ b/arch/x86/xen/Kconfig +@@ -13,16 +13,18 @@ config XEN + kernel to boot in a paravirtualized environment under the + Xen hypervisor. + ++config XEN_PVHVM ++ def_bool y ++ depends on XEN ++ depends on X86_LOCAL_APIC ++ + config XEN_MAX_DOMAIN_MEMORY +- int "Maximum allowed size of a domain in gigabytes" +- default 8 if X86_32 +- default 32 if X86_64 ++ int ++ default 128 + depends on XEN + help +- The pseudo-physical to machine address array is sized +- according to the maximum possible memory size of a Xen +- domain. This array uses 1 page per gigabyte, so there's no +- need to be too stingy here. ++ This only affects the sizing of some bss arrays, the unused ++ portions of which are freed. + + config XEN_SAVE_RESTORE + bool +@@ -36,3 +38,40 @@ config XEN_DEBUG_FS + help + Enable statistics output and various tuning options in debugfs. + Enabling this option may incur a significant performance overhead. ++ ++config SWIOTLB_XEN ++ def_bool y ++ depends on XEN && SWIOTLB ++ ++config MICROCODE_XEN ++ def_bool y ++ depends on XEN_DOM0 && MICROCODE ++ ++config XEN_DOM0 ++ bool "Enable Xen privileged domain support" ++ depends on XEN && X86_IO_APIC && ACPI ++ help ++ The Xen hypervisor requires a privileged domain ("dom0") to ++ actually manage the machine, provide devices drivers, etc. ++ This option enables dom0 support. A dom0 kernel can also ++ run as an unprivileged domU kernel, or a kernel running ++ native on bare hardware. 
++ ++# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST ++# name in tools. ++config XEN_PRIVILEGED_GUEST ++ def_bool XEN_DOM0 ++ ++config XEN_DOM0_PCI ++ def_bool y ++ depends on XEN_DOM0 && PCI ++ select PCI_XEN ++ ++config XEN_PCI_PASSTHROUGH ++ bool "Enable support for Xen PCI passthrough devices" ++ depends on XEN && PCI ++ select PCI_XEN ++ select SWIOTLB_XEN ++ help ++ Enable support for passing PCI devices through to ++ unprivileged domains. (COMPLETELY UNTESTED) +diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile +index 3bb4fc2..13ca65c 100644 +--- a/arch/x86/xen/Makefile ++++ b/arch/x86/xen/Makefile +@@ -12,9 +12,12 @@ CFLAGS_mmu.o := $(nostackp) + + obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ + time.o xen-asm.o xen-asm_$(BITS).o \ +- grant-table.o suspend.o ++ grant-table.o suspend.o platform-pci-unplug.o + + obj-$(CONFIG_SMP) += smp.o + obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o + obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o +- ++obj-$(CONFIG_XEN_DOM0) += vga.o ++obj-$(CONFIG_XEN_DOM0) += apic.o ++obj-$(CONFIG_SWIOTLB) += pci-swiotlb-xen.o ++obj-$(CONFIG_XEN_DOM0_PCI) += pci.o +\ No newline at end of file +diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c +new file mode 100644 +index 0000000..21a3089 +--- /dev/null ++++ b/arch/x86/xen/apic.c +@@ -0,0 +1,33 @@ ++#include <linux/kernel.h> ++#include <linux/threads.h> ++#include <linux/bitmap.h> ++ ++#include <asm/io_apic.h> ++#include <asm/acpi.h> ++#include <asm/hw_irq.h> ++ ++#include <asm/xen/hypervisor.h> ++#include <asm/xen/hypercall.h> ++ ++#include <xen/xen.h> ++#include <xen/interface/xen.h> ++#include <xen/interface/physdev.h> ++ ++void __init xen_io_apic_init(void) ++{ ++ enable_IO_APIC(); ++} ++ ++void xen_init_apic(void) ++{ ++ if (!xen_initial_domain()) ++ return; ++ ++#ifdef CONFIG_ACPI ++ /* ++ * Pretend ACPI found our lapic even though we've disabled it, ++ * to prevent MP tables from setting up lapics. 
++ */ ++ acpi_lapic = 1; ++#endif ++} +diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c +index 0087b00..070f138 100644 +--- a/arch/x86/xen/enlighten.c ++++ b/arch/x86/xen/enlighten.c +@@ -11,6 +11,7 @@ + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + ++#include <linux/cpu.h> + #include <linux/kernel.h> + #include <linux/init.h> + #include <linux/smp.h> +@@ -28,12 +29,15 @@ + #include <linux/highmem.h> + #include <linux/console.h> + ++#include <xen/xen.h> + #include <xen/interface/xen.h> + #include <xen/interface/version.h> + #include <xen/interface/physdev.h> + #include <xen/interface/vcpu.h> ++#include <xen/interface/memory.h> + #include <xen/features.h> + #include <xen/page.h> ++#include <xen/hvm.h> + #include <xen/hvc-console.h> + + #include <asm/paravirt.h> +@@ -53,6 +57,7 @@ + #include <asm/tlbflush.h> + #include <asm/reboot.h> + #include <asm/stackprotector.h> ++#include <asm/hypervisor.h> + + #include "xen-ops.h" + #include "mmu.h" +@@ -66,6 +71,11 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); + enum xen_domain_type xen_domain_type = XEN_NATIVE; + EXPORT_SYMBOL_GPL(xen_domain_type); + ++unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; ++EXPORT_SYMBOL(machine_to_phys_mapping); ++unsigned int machine_to_phys_order; ++EXPORT_SYMBOL(machine_to_phys_order); ++ + struct start_info *xen_start_info; + EXPORT_SYMBOL_GPL(xen_start_info); + +@@ -73,6 +83,9 @@ struct shared_info xen_dummy_shared_info; + + void *xen_initial_gdt; + ++__read_mostly int xen_have_vector_callback; ++EXPORT_SYMBOL_GPL(xen_have_vector_callback); ++ + /* + * Point at some empty memory to start with. We map the real shared_info + * page as soon as fixmap is up and running. +@@ -94,6 +107,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; + */ + static int have_vcpu_info_placement = 1; + ++static void clamp_max_cpus(void) ++{ ++#ifdef CONFIG_SMP ++ if (setup_max_cpus > MAX_VIRT_CPUS) ++ setup_max_cpus = MAX_VIRT_CPUS; ++#endif ++} ++ + static void xen_vcpu_setup(int cpu) + { + struct vcpu_register_vcpu_info info; +@@ -101,19 +122,20 @@ static void xen_vcpu_setup(int cpu) + struct vcpu_info *vcpup; + + BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); +- per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + +- if (!have_vcpu_info_placement) +- return; /* already tested, not available */ ++ if (cpu < MAX_VIRT_CPUS) ++ per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + +- vcpup = &per_cpu(xen_vcpu_info, cpu); ++ if (!have_vcpu_info_placement) { ++ if (cpu >= MAX_VIRT_CPUS) ++ clamp_max_cpus(); ++ return; ++ } + ++ vcpup = &per_cpu(xen_vcpu_info, cpu); + info.mfn = arbitrary_virt_to_mfn(vcpup); + info.offset = offset_in_page(vcpup); + +- printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", +- cpu, vcpup, info.mfn, info.offset); +- + /* Check to see if the hypervisor will put the vcpu_info + structure where we want it, which allows direct access via + a percpu-variable. */ +@@ -122,13 +144,11 @@ static void xen_vcpu_setup(int cpu) + if (err) { + printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); + have_vcpu_info_placement = 0; ++ clamp_max_cpus(); + } else { + /* This cpu is using the registered vcpu info, even if + later ones fail to. 
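+ (With placement in effect the hypervisor keeps this vcpu's state
+ at the percpu xen_vcpu_info address, so the _direct irq ops
+ installed later by xen_setup_vcpu_info_placement() can reach it
+ with a single percpu access instead of chasing the xen_vcpu
+ pointer.)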
*/ + per_cpu(xen_vcpu, cpu) = vcpup; +- +- printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n", +- cpu, vcpup); + } + } + +@@ -167,13 +187,16 @@ static void __init xen_banner(void) + + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", + pv_info.name); +- printk(KERN_INFO "Xen version: %d.%d%s%s\n", ++ printk(KERN_INFO "Xen version: %d.%d%s%s%s\n", + version >> 16, version & 0xffff, extra.extraversion, +- xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); ++ xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? ++ " (preserve-AD)" : "", ++ xen_initial_domain() ? " (dom0)" : ""); + } + + static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0; + static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; ++static __read_mostly unsigned int cpuid_leaf81_edx_mask = ~0; + + static void xen_cpuid(unsigned int *ax, unsigned int *bx, + unsigned int *cx, unsigned int *dx) +@@ -187,7 +210,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, + * unsupported kernel subsystems as possible. + */ + switch (*ax) { +- case 1: ++ case 0x1: + maskecx = cpuid_leaf1_ecx_mask; + maskedx = cpuid_leaf1_edx_mask; + break; +@@ -196,6 +219,10 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, + /* Suppress extended topology stuff */ + maskebx = 0; + break; ++ ++ case 0x80000001: ++ maskedx = cpuid_leaf81_edx_mask; ++ break; + } + + asm(XEN_EMULATE_PREFIX "cpuid" +@@ -213,34 +240,29 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, + static __init void xen_init_cpuid_mask(void) + { + unsigned int ax, bx, cx, dx; ++ unsigned int xsave_mask; + + cpuid_leaf1_edx_mask = +- ~((1 << X86_FEATURE_MCE) | /* disable MCE */ +- (1 << X86_FEATURE_MCA) | /* disable MCA */ +- (1 << X86_FEATURE_ACC)); /* thermal monitoring */ ++ ~(1 << X86_FEATURE_ACC); /* thermal monitoring */ ++ ++ cpuid_leaf81_edx_mask = ~(1 << (X86_FEATURE_GBPAGES % 32)); + + if (!xen_initial_domain()) + cpuid_leaf1_edx_mask &= +- ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ ++ ~((1 << X86_FEATURE_MCE) | /* disable MCE */ ++ (1 << X86_FEATURE_MCA) | /* disable MCA */ ++ (1 << X86_FEATURE_APIC) | /* disable local APIC */ + (1 << X86_FEATURE_ACPI)); /* disable ACPI */ +- + ax = 1; +- cx = 0; + xen_cpuid(&ax, &bx, &cx, &dx); + +- /* cpuid claims we support xsave; try enabling it to see what happens */ +- if (cx & (1 << (X86_FEATURE_XSAVE % 32))) { +- unsigned long cr4; +- +- set_in_cr4(X86_CR4_OSXSAVE); +- +- cr4 = read_cr4(); +- +- if ((cr4 & X86_CR4_OSXSAVE) == 0) +- cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32)); ++ xsave_mask = ++ (1 << (X86_FEATURE_XSAVE % 32)) | ++ (1 << (X86_FEATURE_OSXSAVE % 32)); + +- clear_in_cr4(X86_CR4_OSXSAVE); +- } ++ /* Xen will set CR4.OSXSAVE if supported and not disabled by force */ ++ if ((cx & xsave_mask) != xsave_mask) ++ cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */ + } + + static void xen_set_debugreg(int reg, unsigned long val) +@@ -406,7 +428,7 @@ static __init void xen_load_gdt_boot(const struct desc_ptr *dtr) + + pte = pfn_pte(pfn, PAGE_KERNEL_RO); + +- if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0)) ++ if (HYPERVISOR_update_va_mapping(va, pte, 0)) + BUG(); + + frames[f] = mfn; +@@ -517,13 +539,13 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, + return 0; + #ifdef CONFIG_X86_MCE + } else if (addr == (unsigned long)machine_check) { +- return 0; ++ /* We can use the original machine_check handler, ++ despite IST. */ + #endif +- } else { +- /* Some other trap using IST? 
*/ +- if (WARN_ON(val->ist != 0)) +- return 0; +- } ++ } else if (WARN(val->ist != 0, ++ "Unknown IST-using trap: vector %d, %pF, val->ist=%d\n", ++ vector, (void *)addr, val->ist)) ++ return 0; + #endif /* CONFIG_X86_64 */ + info->address = addr; + +@@ -679,6 +701,18 @@ static void xen_set_iopl_mask(unsigned mask) + HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); + } + ++static void xen_set_io_bitmap(struct thread_struct *thread, ++ unsigned long bytes_updated) ++{ ++ struct physdev_set_iobitmap set_iobitmap; ++ ++ set_xen_guest_handle(set_iobitmap.bitmap, ++ (char *)thread->io_bitmap_ptr); ++ set_iobitmap.nr_ports = thread->io_bitmap_ptr ? IO_BITMAP_BITS : 0; ++ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, ++ &set_iobitmap)); ++} ++ + static void xen_io_delay(void) + { + } +@@ -716,7 +750,7 @@ static u32 xen_safe_apic_wait_icr_idle(void) + return 0; + } + +-static void set_xen_basic_apic_ops(void) ++static __init void set_xen_basic_apic_ops(void) + { + apic->read = xen_apic_read; + apic->write = xen_apic_write; +@@ -728,7 +762,6 @@ static void set_xen_basic_apic_ops(void) + + #endif + +- + static void xen_clts(void) + { + struct multicall_space mcs; +@@ -811,6 +844,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) + Xen console noise. */ + break; + ++ case MSR_IA32_CR_PAT: ++ if (smp_processor_id() == 0) ++ xen_set_pat(((u64)high << 32) | low); ++ break; ++ + default: + ret = native_write_msr_safe(msr, low, high); + } +@@ -849,8 +887,6 @@ void xen_setup_vcpu_info_placement(void) + /* xen_vcpu_setup managed to place the vcpu_info within the + percpu area for all cpus, so make use of it */ + if (have_vcpu_info_placement) { +- printk(KERN_INFO "Xen: using vcpu_info placement\n"); +- + pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); + pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); + pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); +@@ -923,10 +959,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { + .patch = xen_patch, + }; + +-static const struct pv_time_ops xen_time_ops __initdata = { +- .sched_clock = xen_clocksource_read, +-}; +- + static const struct pv_cpu_ops xen_cpu_ops __initdata = { + .cpuid = xen_cpuid, + +@@ -978,6 +1010,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { + .load_sp0 = xen_load_sp0, + + .set_iopl_mask = xen_set_iopl_mask, ++ .set_io_bitmap = xen_set_io_bitmap, + .io_delay = xen_io_delay, + + /* Xen takes care of %gs when switching to usermode for us */ +@@ -1016,15 +1049,40 @@ static void xen_machine_halt(void) + xen_reboot(SHUTDOWN_poweroff); + } + ++static void xen_machine_power_off(void) ++{ ++ if (pm_power_off) ++ pm_power_off(); ++ else ++ xen_reboot(SHUTDOWN_poweroff); ++} ++ + static void xen_crash_shutdown(struct pt_regs *regs) + { + xen_reboot(SHUTDOWN_crash); + } + ++static int ++xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) ++{ ++ xen_reboot(SHUTDOWN_crash); ++ return NOTIFY_DONE; ++} ++ ++static struct notifier_block xen_panic_block = { ++ .notifier_call= xen_panic_event, ++}; ++ ++int xen_panic_handler_init(void) ++{ ++ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block); ++ return 0; ++} ++ + static const struct machine_ops __initdata xen_machine_ops = { + .restart = xen_restart, + .halt = xen_machine_halt, +- .power_off = xen_machine_halt, ++ .power_off = xen_machine_power_off, + .shutdown = xen_machine_halt, + .crash_shutdown = xen_crash_shutdown, + .emergency_restart = 
xen_emergency_restart,
+@@ -1057,10 +1115,11 @@ asmlinkage void __init xen_start_kernel(void)
+
+ xen_domain_type = XEN_PV_DOMAIN;
+
++ xen_setup_machphys_mapping();
++
+ /* Install Xen paravirt ops */
+ pv_info = xen_info;
+ pv_init_ops = xen_init_ops;
+- pv_time_ops = xen_time_ops;
+ pv_cpu_ops = xen_cpu_ops;
+ pv_apic_ops = xen_apic_ops;
+
+@@ -1068,13 +1127,7 @@ asmlinkage void __init xen_start_kernel(void)
+ x86_init.oem.arch_setup = xen_arch_setup;
+ x86_init.oem.banner = xen_banner;
+
+- x86_init.timers.timer_init = xen_time_init;
+- x86_init.timers.setup_percpu_clockev = x86_init_noop;
+- x86_cpuinit.setup_percpu_clockev = x86_init_noop;
+-
+- x86_platform.calibrate_tsc = xen_tsc_khz;
+- x86_platform.get_wallclock = xen_get_wallclock;
+- x86_platform.set_wallclock = xen_set_wallclock;
++ xen_init_time_ops();
+
+ /*
+ * Set up some pagetable state before starting to set any ptes.
+@@ -1112,6 +1165,10 @@ asmlinkage void __init xen_start_kernel(void)
+ */
+ xen_setup_stackprotector();
+
++#ifdef CONFIG_SPARSE_IRQ
++ nr_dynamic_irqs += 256;
++#endif
++
+ xen_init_irq_ops();
+ xen_init_cpuid_mask();
+
+@@ -1138,8 +1195,19 @@ asmlinkage void __init xen_start_kernel(void)
+
+ xen_smp_init();
+
++#ifdef CONFIG_ACPI_NUMA
++ /*
++ * The pages we get from Xen are not related to machine pages, so
++ * any NUMA information the kernel tries to get from ACPI will
++ * be meaningless. Prevent it from trying.
++ */
++ acpi_numa = -1;
++#endif
++
+ pgd = (pgd_t *)xen_start_info->pt_base;
+
++ __supported_pte_mask |= _PAGE_IOMAP;
++
+ /* Don't do the full vcpu_info placement stuff until we have a
+ possible map and a non-dummy shared_info. */
+ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
+@@ -1149,6 +1217,10 @@ asmlinkage void __init xen_start_kernel(void)
+
+ xen_raw_console_write("mapping kernel into physical memory\n");
+ pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
++ xen_ident_map_ISA();
++
++ /* Allocate and initialize top and mid mfn levels for p2m structure */
++ xen_build_mfn_list_list();
+
+ init_mm.pgd = pgd;
+
+@@ -1158,6 +1230,14 @@ asmlinkage void __init xen_start_kernel(void)
+ if (xen_feature(XENFEAT_supervisor_mode_kernel))
+ pv_info.kernel_rpl = 0;
+
++ if (xen_initial_domain()) {
++ struct physdev_set_iopl set_iopl;
++ set_iopl.iopl = 1;
++ if (HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl) == -1)
++ BUG();
++ xen_init_apic();
++ }
++
+ /* set the limit of our address space */
+ xen_reserve_top();
+
+@@ -1180,6 +1260,16 @@ asmlinkage void __init xen_start_kernel(void)
+ add_preferred_console("xenboot", 0, NULL);
+ add_preferred_console("tty", 0, NULL);
+ add_preferred_console("hvc", 0, NULL);
++
++ boot_params.screen_info.orig_video_isVGA = 0;
++ } else {
++ const struct dom0_vga_console_info *info =
++ (void *)((char *)xen_start_info +
++ xen_start_info->console.dom0.info_off);
++
++ xen_init_vga(info, xen_start_info->console.dom0.info_size);
++ xen_start_info->console.domU.mfn = 0;
++ xen_start_info->console.domU.evtchn = 0;
+ }
+
+ xen_raw_console_write("about to get started...\n");
+@@ -1193,3 +1283,126 @@ asmlinkage void __init xen_start_kernel(void)
+ x86_64_start_reservations((char *)__pa_symbol(&boot_params));
+ #endif
+ }
++
++static uint32_t xen_cpuid_base(void)
++{
++ uint32_t base, eax, ebx, ecx, edx;
++ char signature[13];
++
++ for (base = 0x40000000; base < 0x40010000; base += 0x100) {
++ cpuid(base, &eax, &ebx, &ecx, &edx);
++ *(uint32_t *)(signature + 0) = ebx;
++ *(uint32_t *)(signature + 4) = ecx;
++ *(uint32_t *)(signature
+ 8) = edx; ++ signature[12] = 0; ++ ++ if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2)) ++ return base; ++ } ++ ++ return 0; ++} ++ ++static int init_hvm_pv_info(int *major, int *minor) ++{ ++ uint32_t eax, ebx, ecx, edx, pages, msr, base; ++ u64 pfn; ++ ++ base = xen_cpuid_base(); ++ if (!base) ++ return -EINVAL; ++ ++ cpuid(base + 1, &eax, &ebx, &ecx, &edx); ++ ++ *major = eax >> 16; ++ *minor = eax & 0xffff; ++ printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor); ++ ++ cpuid(base + 2, &pages, &msr, &ecx, &edx); ++ ++ pfn = __pa(hypercall_page); ++ wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); ++ ++ xen_setup_features(); ++ ++ pv_info = xen_info; ++ pv_info.kernel_rpl = 0; ++ ++ xen_domain_type = XEN_HVM_DOMAIN; ++ ++ return 0; ++} ++ ++void xen_hvm_init_shared_info(void) ++{ ++ int cpu; ++ struct xen_add_to_physmap xatp; ++ static struct shared_info *shared_info_page = 0; ++ ++ if (!shared_info_page) ++ shared_info_page = (struct shared_info *) alloc_bootmem_pages(PAGE_SIZE); ++ xatp.domid = DOMID_SELF; ++ xatp.idx = 0; ++ xatp.space = XENMAPSPACE_shared_info; ++ xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; ++ if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) ++ BUG(); ++ ++ HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; ++ ++ /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info ++ * page, we use it in the event channel upcall and in some pvclock ++ * related functions. We don't need the vcpu_info placement ++ * optimizations because we don't use any pv_mmu or pv_irq op on ++ * HVM. ++ * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is ++ * online but xen_hvm_init_shared_info is run at resume time too and ++ * in that case multiple vcpus might be online. */ ++ for_each_online_cpu(cpu) { ++ per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; ++ } ++} ++ ++#ifdef CONFIG_XEN_PVHVM ++static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, ++ unsigned long action, void *hcpu) ++{ ++ int cpu = (long)hcpu; ++ switch (action) { ++ case CPU_UP_PREPARE: ++ per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; ++ break; ++ default: ++ break; ++ } ++ return NOTIFY_OK; ++} ++ ++static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = { ++ .notifier_call = xen_hvm_cpu_notify, ++}; ++ ++void __init xen_hvm_guest_init(void) ++{ ++ int r; ++ int major, minor; ++ ++ if (xen_pv_domain()) ++ return; ++ ++ r = init_hvm_pv_info(&major, &minor); ++ if (r < 0) ++ return; ++ ++ xen_hvm_init_shared_info(); ++ ++ if (xen_feature(XENFEAT_hvm_callback_vector)) ++ xen_have_vector_callback = 1; ++ register_cpu_notifier(&xen_hvm_cpu_notifier); ++ xen_unplug_emulated_devices(); ++ have_vcpu_info_placement = 0; ++ x86_init.irqs.intr_init = xen_init_IRQ; ++ xen_hvm_init_time_ops(); ++ xen_hvm_init_mmu_ops(); ++} ++#endif +diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c +index 350a3de..32a1c65 100644 +--- a/arch/x86/xen/mmu.c ++++ b/arch/x86/xen/mmu.c +@@ -42,6 +42,7 @@ + #include <linux/highmem.h> + #include <linux/debugfs.h> + #include <linux/bug.h> ++#include <linux/vmalloc.h> + #include <linux/module.h> + + #include <asm/pgtable.h> +@@ -50,14 +51,20 @@ + #include <asm/mmu_context.h> + #include <asm/setup.h> + #include <asm/paravirt.h> ++#include <asm/e820.h> + #include <asm/linkage.h> ++#include <asm/pat.h> ++#include <asm/init.h> ++#include <asm/page.h> + + #include <asm/xen/hypercall.h> + #include <asm/xen/hypervisor.h> + + #include <xen/page.h> + #include <xen/interface/xen.h> 
++#include <xen/interface/hvm/hvm_op.h> + #include <xen/interface/version.h> ++#include <xen/interface/memory.h> + #include <xen/hvc-console.h> + + #include "multicalls.h" +@@ -66,6 +73,13 @@ + + #define MMU_UPDATE_HISTO 30 + ++/* ++ * Protects atomic reservation decrease/increase against concurrent increases. ++ * Also protects non-atomic updates of current_pages and driver_pages, and ++ * balloon lists. ++ */ ++DEFINE_SPINLOCK(xen_reservation_lock); ++ + #ifdef CONFIG_XEN_DEBUG_FS + + static struct { +@@ -124,7 +138,8 @@ static inline void check_zero(void) + * large enough to allocate page table pages to allocate the rest. + * Each page can map 2MB. + */ +-static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; ++#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) ++static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); + + #ifdef CONFIG_X86_64 + /* l3 pud for userspace vsyscall mapping */ +@@ -155,49 +170,202 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ + */ + #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) + ++/* ++ * Xen leaves the responsibility for maintaining p2m mappings to the ++ * guests themselves, but it must also access and update the p2m array ++ * during suspend/resume when all the pages are reallocated. ++ * ++ * The p2m table is logically a flat array, but we implement it as a ++ * three-level tree to allow the address space to be sparse. ++ * ++ * Xen ++ * | ++ * p2m_top p2m_top_mfn ++ * / \ / \ ++ * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn ++ * / \ / \ / / ++ * p2m p2m p2m p2m p2m p2m p2m ... ++ * ++ * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. ++ * ++ * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the ++ * maximum representable pseudo-physical address space is: ++ * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages ++ * ++ * P2M_PER_PAGE depends on the architecture, as a mfn is always ++ * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to ++ * 512 and 1024 entries respectively. ++ */ ++ ++unsigned long xen_max_p2m_pfn __read_mostly; + +-#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +-#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) ++#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) ++#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) ++#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) + +-/* Placeholder for holes in the address space */ +-static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data = +- { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; ++#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) + +- /* Array of pointers to pages containing p2m entries */ +-static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data = +- { [ 0 ... 
TOP_ENTRIES - 1] = &p2m_missing[0] }; ++/* Placeholders for holes in the address space */ ++static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); ++static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); ++static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); + +-/* Arrays of p2m arrays expressed in mfns used for save/restore */ +-static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss; ++static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); ++static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); ++static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); + +-static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] +- __page_aligned_bss; ++RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); ++RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); + + static inline unsigned p2m_top_index(unsigned long pfn) + { +- BUG_ON(pfn >= MAX_DOMAIN_PAGES); +- return pfn / P2M_ENTRIES_PER_PAGE; ++ BUG_ON(pfn >= MAX_P2M_PFN); ++ return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); ++} ++ ++static inline unsigned p2m_mid_index(unsigned long pfn) ++{ ++ return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; + } + + static inline unsigned p2m_index(unsigned long pfn) + { +- return pfn % P2M_ENTRIES_PER_PAGE; ++ return pfn % P2M_PER_PAGE; ++} ++ ++static void p2m_top_init(unsigned long ***top) ++{ ++ unsigned i; ++ ++ for (i = 0; i < P2M_TOP_PER_PAGE; i++) ++ top[i] = p2m_mid_missing; ++} ++ ++static void p2m_top_mfn_init(unsigned long *top) ++{ ++ unsigned i; ++ ++ for (i = 0; i < P2M_TOP_PER_PAGE; i++) ++ top[i] = virt_to_mfn(p2m_mid_missing_mfn); ++} ++ ++static void p2m_top_mfn_p_init(unsigned long **top) ++{ ++ unsigned i; ++ ++ for (i = 0; i < P2M_TOP_PER_PAGE; i++) ++ top[i] = p2m_mid_missing_mfn; ++} ++ ++static void p2m_mid_init(unsigned long **mid) ++{ ++ unsigned i; ++ ++ for (i = 0; i < P2M_MID_PER_PAGE; i++) ++ mid[i] = p2m_missing; ++} ++ ++static void p2m_mid_mfn_init(unsigned long *mid) ++{ ++ unsigned i; ++ ++ for (i = 0; i < P2M_MID_PER_PAGE; i++) ++ mid[i] = virt_to_mfn(p2m_missing); ++} ++ ++static void p2m_init(unsigned long *p2m) ++{ ++ unsigned i; ++ ++ for (i = 0; i < P2M_MID_PER_PAGE; i++) ++ p2m[i] = INVALID_P2M_ENTRY; ++} ++ ++static int lookup_pte_fn( ++ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) ++{ ++ uint64_t *ptep = (uint64_t *)data; ++ if (ptep) ++ *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) << ++ PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); ++ return 0; + } + +-/* Build the parallel p2m_top_mfn structures */ ++int create_lookup_pte_addr(struct mm_struct *mm, ++ unsigned long address, ++ uint64_t *ptep) ++{ ++ return apply_to_page_range(mm, address, PAGE_SIZE, ++ lookup_pte_fn, ptep); ++} ++ ++EXPORT_SYMBOL(create_lookup_pte_addr); ++ ++/* ++ * Build the parallel p2m_top_mfn and p2m_mid_mfn structures ++ * ++ * This is called both at boot time, and after resuming from suspend: ++ * - At boot time we're called very early, and must use extend_brk() ++ * to allocate memory. ++ * ++ * - After resume we're called from within stop_machine, but the mfn ++ * tree should alreay be completely allocated. 
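
Concretely, with 4 KiB pages on x86_64 every level holds 512 entries, so p2m_top_index()/p2m_mid_index()/p2m_index() above are just a base-512 decomposition of the pfn. A standalone illustration, with the constants hard-coded for the 64-bit case only:

    #include <stdio.h>

    #define P2M_PER_PAGE     512   /* PAGE_SIZE / sizeof(unsigned long) */
    #define P2M_MID_PER_PAGE 512   /* PAGE_SIZE / sizeof(unsigned long *) */
    #define P2M_TOP_PER_PAGE 512   /* PAGE_SIZE / sizeof(unsigned long **) */

    int main(void)
    {
        unsigned long pfn = 0x12345;    /* arbitrary example pfn */
        unsigned top = pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
        unsigned mid = (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
        unsigned idx = pfn % P2M_PER_PAGE;

        /* 512^3 = 2^27 pfns, i.e. 512 GiB of pseudo-physical space */
        printf("pfn %#lx -> p2m_top[%u][%u][%u]\n", pfn, top, mid, idx);
        return 0;
    }
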
++ */ + void xen_build_mfn_list_list(void) + { +- unsigned pfn, idx; ++ unsigned long pfn; + +- for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { +- unsigned topidx = p2m_top_index(pfn); ++ /* Pre-initialize p2m_top_mfn to be completely missing */ ++ if (p2m_top_mfn == NULL) { ++ p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); ++ p2m_mid_mfn_init(p2m_mid_missing_mfn); ++ ++ p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); ++ p2m_top_mfn_p_init(p2m_top_mfn_p); + +- p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); ++ p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); ++ p2m_top_mfn_init(p2m_top_mfn); ++ } else { ++ /* Reinitialise, mfn's all change after migration */ ++ p2m_mid_mfn_init(p2m_mid_missing_mfn); + } + +- for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { +- unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; +- p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); ++ for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { ++ unsigned topidx = p2m_top_index(pfn); ++ unsigned mididx = p2m_mid_index(pfn); ++ unsigned long **mid; ++ unsigned long *mid_mfn_p; ++ ++ mid = p2m_top[topidx]; ++ mid_mfn_p = p2m_top_mfn_p[topidx]; ++ ++ /* Don't bother allocating any mfn mid levels if ++ * they're just missing, just update the stored mfn, ++ * since all could have changed over a migrate. ++ */ ++ if (mid == p2m_mid_missing) { ++ BUG_ON(mididx); ++ BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); ++ p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); ++ pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; ++ continue; ++ } ++ ++ if (mid_mfn_p == p2m_mid_missing_mfn) { ++ /* ++ * XXX boot-time only! We should never find ++ * missing parts of the mfn tree after ++ * runtime. extend_brk() will BUG if we call ++ * it too late. ++ */ ++ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); ++ p2m_mid_mfn_init(mid_mfn_p); ++ ++ p2m_top_mfn_p[topidx] = mid_mfn_p; ++ } ++ ++ p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); ++ mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); + } + } + +@@ -206,8 +374,8 @@ void xen_setup_mfn_list_list(void) + BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); + + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = +- virt_to_mfn(p2m_top_mfn_list); +- HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages; ++ virt_to_mfn(p2m_top_mfn); ++ HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; + } + + /* Set up p2m_top to point to the domain-builder provided p2m pages */ +@@ -215,98 +383,176 @@ void __init xen_build_dynamic_phys_to_machine(void) + { + unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; + unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); +- unsigned pfn; ++ unsigned long pfn; ++ ++ xen_max_p2m_pfn = max_pfn; + +- for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { ++ p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); ++ p2m_init(p2m_missing); ++ ++ p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); ++ p2m_mid_init(p2m_mid_missing); ++ ++ p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); ++ p2m_top_init(p2m_top); ++ ++ /* ++ * The domain builder gives us a pre-constructed p2m array in ++ * mfn_list for all the pages initially given to us, so we just ++ * need to graft that into our tree structure. 
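
The boot-time allocations in this code all go through extend_brk(), a bump allocator over space reserved at link time by the RESERVE_BRK()/RESERVE_BRK_ARRAY() annotations used earlier in the file. A simplified model of its behaviour, not the kernel's actual implementation:

    #include <stddef.h>

    static char brk_space[64 * 4096];   /* stands in for the RESERVE_BRK() area */
    static size_t brk_off;

    /* align must be a power of two */
    static void *extend_brk_model(size_t size, size_t align)
    {
        size_t off = (brk_off + align - 1) & ~(align - 1);

        if (off + size > sizeof(brk_space))
            return NULL;    /* the real extend_brk() BUG()s here instead */
        brk_off = off + size;
        return brk_space + off;
    }
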
++ */ ++ for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); ++ unsigned mididx = p2m_mid_index(pfn); + +- p2m_top[topidx] = &mfn_list[pfn]; +- } ++ if (p2m_top[topidx] == p2m_mid_missing) { ++ unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); ++ p2m_mid_init(mid); ++ ++ p2m_top[topidx] = mid; ++ } + +- xen_build_mfn_list_list(); ++ p2m_top[topidx][mididx] = &mfn_list[pfn]; ++ } + } + + unsigned long get_phys_to_machine(unsigned long pfn) + { +- unsigned topidx, idx; ++ unsigned topidx, mididx, idx; + +- if (unlikely(pfn >= MAX_DOMAIN_PAGES)) ++ if (unlikely(pfn >= MAX_P2M_PFN)) + return INVALID_P2M_ENTRY; + + topidx = p2m_top_index(pfn); ++ mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); +- return p2m_top[topidx][idx]; ++ ++ return p2m_top[topidx][mididx][idx]; + } + EXPORT_SYMBOL_GPL(get_phys_to_machine); + +-/* install a new p2m_top page */ +-bool install_p2mtop_page(unsigned long pfn, unsigned long *p) ++static void *alloc_p2m_page(void) + { +- unsigned topidx = p2m_top_index(pfn); +- unsigned long **pfnp, *mfnp; +- unsigned i; ++ return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); ++} + +- pfnp = &p2m_top[topidx]; +- mfnp = &p2m_top_mfn[topidx]; ++static void free_p2m_page(void *p) ++{ ++ free_page((unsigned long)p); ++} + +- for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) +- p[i] = INVALID_P2M_ENTRY; ++/* ++ * Fully allocate the p2m structure for a given pfn. We need to check ++ * that both the top and mid levels are allocated, and make sure the ++ * parallel mfn tree is kept in sync. We may race with other cpus, so ++ * the new pages are installed with cmpxchg; if we lose the race then ++ * simply free the page we allocated and use the one that's there. ++ */ ++static bool alloc_p2m(unsigned long pfn) ++{ ++ unsigned topidx, mididx; ++ unsigned long ***top_p, **mid; ++ unsigned long *top_mfn_p, *mid_mfn; + +- if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) { +- *mfnp = virt_to_mfn(p); +- return true; ++ topidx = p2m_top_index(pfn); ++ mididx = p2m_mid_index(pfn); ++ ++ top_p = &p2m_top[topidx]; ++ mid = *top_p; ++ ++ if (mid == p2m_mid_missing) { ++ /* Mid level is missing, allocate a new one */ ++ mid = alloc_p2m_page(); ++ if (!mid) ++ return false; ++ ++ p2m_mid_init(mid); ++ ++ if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) ++ free_p2m_page(mid); + } + +- return false; +-} ++ top_mfn_p = &p2m_top_mfn[topidx]; ++ mid_mfn = p2m_top_mfn_p[topidx]; + +-static void alloc_p2m(unsigned long pfn) +-{ +- unsigned long *p; ++ BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); ++ ++ if (mid_mfn == p2m_mid_missing_mfn) { ++ /* Separately check the mid mfn level */ ++ unsigned long missing_mfn; ++ unsigned long mid_mfn_mfn; ++ ++ mid_mfn = alloc_p2m_page(); ++ if (!mid_mfn) ++ return false; + +- p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); +- BUG_ON(p == NULL); ++ p2m_mid_mfn_init(mid_mfn); + +- if (!install_p2mtop_page(pfn, p)) +- free_page((unsigned long)p); ++ missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); ++ mid_mfn_mfn = virt_to_mfn(mid_mfn); ++ if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) ++ free_p2m_page(mid_mfn); ++ else ++ p2m_top_mfn_p[topidx] = mid_mfn; ++ } ++ ++ if (p2m_top[topidx][mididx] == p2m_missing) { ++ /* p2m leaf page is missing */ ++ unsigned long *p2m; ++ ++ p2m = alloc_p2m_page(); ++ if (!p2m) ++ return false; ++ ++ p2m_init(p2m); ++ ++ if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) ++ free_p2m_page(p2m); ++ else ++ mid_mfn[mididx] = virt_to_mfn(p2m); ++ } ++ ++ 
return true; + } + + /* Try to install p2m mapping; fail if intermediate bits missing */ + bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) + { +- unsigned topidx, idx; ++ unsigned topidx, mididx, idx; + +- if (unlikely(pfn >= MAX_DOMAIN_PAGES)) { ++ if (unlikely(pfn >= MAX_P2M_PFN)) { + BUG_ON(mfn != INVALID_P2M_ENTRY); + return true; + } + + topidx = p2m_top_index(pfn); +- if (p2m_top[topidx] == p2m_missing) { +- if (mfn == INVALID_P2M_ENTRY) +- return true; +- return false; +- } +- ++ mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); +- p2m_top[topidx][idx] = mfn; ++ ++ if (p2m_top[topidx][mididx] == p2m_missing) ++ return mfn == INVALID_P2M_ENTRY; ++ ++ p2m_top[topidx][mididx][idx] = mfn; + + return true; + } + +-void set_phys_to_machine(unsigned long pfn, unsigned long mfn) ++bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) + { + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); +- return; ++ return true; + } + + if (unlikely(!__set_phys_to_machine(pfn, mfn))) { +- alloc_p2m(pfn); ++ if (!alloc_p2m(pfn)) ++ return false; + + if (!__set_phys_to_machine(pfn, mfn)) +- BUG(); ++ return false; + } ++ ++ return true; + } + + unsigned long arbitrary_virt_to_mfn(void *vaddr) +@@ -315,6 +561,7 @@ unsigned long arbitrary_virt_to_mfn(void *vaddr) + + return PFN_DOWN(maddr.maddr); + } ++EXPORT_SYMBOL_GPL(set_phys_to_machine); + + xmaddr_t arbitrary_virt_to_machine(void *vaddr) + { +@@ -345,7 +592,8 @@ void make_lowmem_page_readonly(void *vaddr) + unsigned int level; + + pte = lookup_address(address, &level); +- BUG_ON(pte == NULL); ++ if (pte == NULL) ++ return; /* vaddr missing */ + + ptev = pte_wrprotect(*pte); + +@@ -360,7 +608,8 @@ void make_lowmem_page_readwrite(void *vaddr) + unsigned int level; + + pte = lookup_address(address, &level); +- BUG_ON(pte == NULL); ++ if (pte == NULL) ++ return; /* vaddr missing */ + + ptev = pte_mkwrite(*pte); + +@@ -376,6 +625,24 @@ static bool xen_page_pinned(void *ptr) + return PagePinned(page); + } + ++void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) ++{ ++ struct multicall_space mcs; ++ struct mmu_update *u; ++ ++ mcs = xen_mc_entry(sizeof(*u)); ++ u = mcs.args; ++ ++ /* ptep might be kmapped when using 32-bit HIGHPTE */ ++ u->ptr = arbitrary_virt_to_machine(ptep).maddr; ++ u->val = pte_val_ma(pteval); ++ ++ MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); ++ ++ xen_mc_issue(PARAVIRT_LAZY_MMU); ++} ++EXPORT_SYMBOL_GPL(xen_set_domain_pte); ++ + static void xen_extend_mmu_update(const struct mmu_update *update) + { + struct multicall_space mcs; +@@ -516,7 +783,34 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) + if (val & _PAGE_PRESENT) { + unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + pteval_t flags = val & PTE_FLAGS_MASK; +- val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; ++ unsigned long mfn = pfn_to_mfn(pfn); ++ ++ /* ++ * If there's no mfn for the pfn, then just create an ++ * empty non-present pte. Unfortunately this loses ++ * information about the original pfn, so ++ * pte_mfn_to_pfn is asymmetric. ++ */ ++ if (unlikely(mfn == INVALID_P2M_ENTRY)) { ++ mfn = 0; ++ flags = 0; ++ } ++ ++ val = ((pteval_t)mfn << PAGE_SHIFT) | flags; ++ } ++ ++ return val; ++} ++ ++static pteval_t iomap_pte(pteval_t val) ++{ ++ if (val & _PAGE_PRESENT) { ++ unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; ++ pteval_t flags = val & PTE_FLAGS_MASK; ++ ++ /* We assume the pte frame number is a MFN, so ++ just use it as-is. 
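
The asymmetry noted in pte_pfn_to_mfn() above can be seen with a toy p2m: once a pfn has been ballooned out, translating its pte forward and back cannot recover the pfn. Everything below is invented for illustration, and the _PAGE_PRESENT check of the real code is omitted.

    #include <stdint.h>

    #define INVALID_P2M_ENTRY  (~0UL)
    #define PAGE_SHIFT         12
    #define PTE_PFN_MASK       0x000ffffffffff000ULL

    /* toy p2m: pretend pfn 0x1000 has been ballooned out */
    static unsigned long toy_pfn_to_mfn(unsigned long pfn)
    {
        return pfn == 0x1000 ? INVALID_P2M_ENTRY : pfn + 0x80000;
    }

    static uint64_t toy_pte_pfn_to_mfn(uint64_t val)
    {
        unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
        unsigned long mfn = toy_pfn_to_mfn(pfn);

        if (mfn == INVALID_P2M_ENTRY)
            return 0;   /* empty non-present pte: the pfn is lost for good */
        return ((uint64_t)mfn << PAGE_SHIFT) | (val & ~PTE_PFN_MASK);
    }
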
*/ ++ val = ((pteval_t)pfn << PAGE_SHIFT) | flags; + } + + return val; +@@ -524,7 +818,18 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) + + pteval_t xen_pte_val(pte_t pte) + { +- return pte_mfn_to_pfn(pte.pte); ++ pteval_t pteval = pte.pte; ++ ++ /* If this is a WC pte, convert back from Xen WC to Linux WC */ ++ if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) { ++ WARN_ON(!pat_enabled); ++ pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT; ++ } ++ ++ if (xen_initial_domain() && (pteval & _PAGE_IOMAP)) ++ return pteval; ++ ++ return pte_mfn_to_pfn(pteval); + } + PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); + +@@ -534,9 +839,62 @@ pgdval_t xen_pgd_val(pgd_t pgd) + } + PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); + ++/* ++ * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7 ++ * are reserved for now, to correspond to the Intel-reserved PAT ++ * types. ++ * ++ * We expect Linux's PAT set as follows: ++ * ++ * Idx PTE flags Linux Xen Default ++ * 0 WB WB WB ++ * 1 PWT WC WT WT ++ * 2 PCD UC- UC- UC- ++ * 3 PCD PWT UC UC UC ++ * 4 PAT WB WC WB ++ * 5 PAT PWT WC WP WT ++ * 6 PAT PCD UC- UC UC- ++ * 7 PAT PCD PWT UC UC UC ++ */ ++ ++void xen_set_pat(u64 pat) ++{ ++ /* We expect Linux to use a PAT setting of ++ * UC UC- WC WB (ignoring the PAT flag) */ ++ WARN_ON(pat != 0x0007010600070106ull); ++} ++ + pte_t xen_make_pte(pteval_t pte) + { +- pte = pte_pfn_to_mfn(pte); ++ phys_addr_t addr = (pte & PTE_PFN_MASK); ++ ++ /* If Linux is trying to set a WC pte, then map to the Xen WC. ++ * If _PAGE_PAT is set, then it probably means it is really ++ * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope ++ * things work out OK... ++ * ++ * (We should never see kernel mappings with _PAGE_PSE set, ++ * but we could see hugetlbfs mappings, I think.). ++ */ ++ if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) { ++ if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT) ++ pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT; ++ } ++ ++ /* ++ * Unprivileged domains are allowed to do IOMAPpings for ++ * PCI passthrough, but not map ISA space. The ISA ++ * mappings are just dummy local mappings to keep other ++ * parts of the kernel happy. ++ */ ++ if (unlikely(pte & _PAGE_IOMAP) && ++ (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { ++ pte = iomap_pte(pte); ++ } else { ++ pte &= ~_PAGE_IOMAP; ++ pte = pte_pfn_to_mfn(pte); ++ } ++ + return native_make_pte(pte); + } + PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); +@@ -934,8 +1292,6 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page, + read-only, and can be pinned. */ + static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) + { +- vm_unmap_aliases(); +- + xen_mc_batch(); + + if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { +@@ -1219,7 +1575,7 @@ void xen_exit_mmap(struct mm_struct *mm) + spin_lock(&mm->page_table_lock); + + /* pgd may not be pinned in the error exit path of execve */ +- if (xen_page_pinned(mm->pgd)) ++ if (xen_page_pinned(mm->pgd) && !mm->context.has_foreign_mappings) + xen_pgd_unpin(mm); + + spin_unlock(&mm->page_table_lock); +@@ -1288,12 +1644,19 @@ static void xen_flush_tlb_single(unsigned long addr) + preempt_enable(); + } + ++/* ++ * Flush tlb on other cpus. Xen can do this via a single hypercall ++ * rather than explicit IPIs, which has the nice property of avoiding ++ * any cpus which don't actually have dirty tlbs. Unfortunately it ++ * doesn't give us an opportunity to kick out cpus which are in lazy ++ * tlb state, so we may end up reflushing some cpus unnecessarily. 
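
Looking back at the PAT table a few hunks above: the index column is just the three pte bits packed together, which makes the WC remapping in xen_pte_val()/xen_make_pte() a swap between two indices. The summary below is read off that table, not additional patch code:

    /* idx = (_PAGE_PAT << 2) | (_PAGE_PCD << 1) | _PAGE_PWT
     *
     *   Linux WC pte:  PWT only  -> idx 1  (WC in Linux's PAT layout)
     *   Xen   WC pte:  PAT only  -> idx 4  (WC in Xen's fixed PAT)
     *
     * xen_make_pte() rewrites idx 1 to idx 4 on the way into the
     * hypervisor, and xen_pte_val() rewrites idx 4 back to idx 1.
     */
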
++ */ + static void xen_flush_tlb_others(const struct cpumask *cpus, + struct mm_struct *mm, unsigned long va) + { + struct { + struct mmuext_op op; +- DECLARE_BITMAP(mask, NR_CPUS); ++ DECLARE_BITMAP(mask, num_processors); + } *args; + struct multicall_space mcs; + +@@ -1417,6 +1780,13 @@ static int xen_pgd_alloc(struct mm_struct *mm) + return ret; + } + ++void xen_late_unpin_pgd(struct mm_struct *mm, pgd_t *pgd) ++{ ++ if (xen_page_pinned(pgd)) ++ __xen_pgd_unpin(mm, pgd); ++ ++} ++ + static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) + { + #ifdef CONFIG_X86_64 +@@ -1445,13 +1815,29 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) + } + #endif + +-#ifdef CONFIG_X86_32 + static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) + { +- /* If there's an existing pte, then don't allow _PAGE_RW to be set */ +- if (pte_val_ma(*ptep) & _PAGE_PRESENT) ++ unsigned long pfn = pte_pfn(pte); ++ pte_t oldpte = *ptep; ++ ++ if (pte_flags(oldpte) & _PAGE_PRESENT) { ++ /* Don't allow existing IO mappings to be overridden */ ++ if (pte_flags(oldpte) & _PAGE_IOMAP) ++ pte = oldpte; ++ ++ /* Don't allow _PAGE_RW to be set on existing pte */ + pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & + pte_val_ma(pte)); ++ } ++ ++ /* ++ * If the new pfn is within the range of the newly allocated ++ * kernel pagetable, and it isn't being mapped into an ++ * early_ioremap fixmap slot, make sure it is RO. ++ */ ++ if (!is_early_ioremap_ptep(ptep) && ++ pfn >= e820_table_start && pfn < e820_table_end) ++ pte = pte_wrprotect(pte); + + return pte; + } +@@ -1464,7 +1850,6 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) + + xen_set_pte(ptep, pte); + } +-#endif + + static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) + { +@@ -1517,7 +1902,6 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l + if (PagePinned(virt_to_page(mm->pgd))) { + SetPagePinned(page); + +- vm_unmap_aliases(); + if (!PageHighMem(page)) { + make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); + if (level == PT_PTE && USE_SPLIT_PTLOCKS) +@@ -1620,6 +2004,7 @@ static void *m2v(phys_addr_t maddr) + return __ka(m2p(maddr)); + } + ++/* Set the page permissions on an identity-mapped pages */ + static void set_page_prot(void *addr, pgprot_t prot) + { + unsigned long pfn = __pa(addr) >> PAGE_SHIFT; +@@ -1635,6 +2020,9 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) + unsigned ident_pte; + unsigned long pfn; + ++ level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, ++ PAGE_SIZE); ++ + ident_pte = 0; + pfn = 0; + for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { +@@ -1645,7 +2033,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) + pte_page = m2v(pmd[pmdidx].pmd); + else { + /* Check for free pte pages */ +- if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) ++ if (ident_pte == LEVEL1_IDENT_ENTRIES) + break; + + pte_page = &level1_ident_pgt[ident_pte]; +@@ -1675,6 +2063,20 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) + set_page_prot(pmd, PAGE_KERNEL_RO); + } + ++void __init xen_setup_machphys_mapping(void) ++{ ++ struct xen_machphys_mapping mapping; ++ unsigned long machine_to_phys_nr_ents; ++ ++ if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { ++ machine_to_phys_mapping = (unsigned long *)mapping.v_start; ++ machine_to_phys_nr_ents = mapping.max_mfn + 1; ++ } else { ++ machine_to_phys_nr_ents = 
MACH2PHYS_NR_ENTRIES; ++ } ++ machine_to_phys_order = fls(machine_to_phys_nr_ents - 1); ++} ++ + #ifdef CONFIG_X86_64 + static void convert_pfn_mfn(void *v) + { +@@ -1760,12 +2162,15 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, + return pgd; + } + #else /* !CONFIG_X86_64 */ +-static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; ++static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD); + + __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, + unsigned long max_pfn) + { + pmd_t *kernel_pmd; ++ int i; ++ ++ level2_kernel_pgt = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); + + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + + xen_start_info->nr_pt_frames * PAGE_SIZE + +@@ -1777,6 +2182,20 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, + xen_map_identity_early(level2_kernel_pgt, max_pfn); + + memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); ++ ++ /* ++ * When running a 32 bit domain 0 on a 64 bit hypervisor a ++ * pinned L3 (such as the initial pgd here) contains bits ++ * which are reserved in the PAE layout but not in the 64 bit ++ * layout. Unfortunately some versions of the hypervisor ++ * (incorrectly) validate compat mode guests against the PAE ++ * layout and hence will not allow such a pagetable to be ++ * pinned by the guest. Therefore we mask off only the PFN and ++ * Present bits of the supplied L3. ++ */ ++ for (i = 0; i < PTRS_PER_PGD; i++) ++ swapper_pg_dir[i].pgd &= (PTE_PFN_MASK | _PAGE_PRESENT); ++ + set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], + __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); + +@@ -1799,6 +2218,8 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, + } + #endif /* CONFIG_X86_64 */ + ++static unsigned char dummy_ioapic_mapping[PAGE_SIZE] __page_aligned_bss; ++ + static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) + { + pte_t pte; +@@ -1828,9 +2249,26 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) + pte = pfn_pte(phys, prot); + break; + +- default: ++#ifdef CONFIG_X86_IO_APIC ++ case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END: ++ /* ++ * We just don't map the IO APIC - all access is via ++ * hypercalls. Keep the address in the pte for reference. ++ */ ++ pte = pfn_pte(PFN_DOWN(__pa(dummy_ioapic_mapping)), PAGE_KERNEL); ++ break; ++#endif ++ ++ case FIX_PARAVIRT_BOOTMAP: ++ /* This is an MFN, but it isn't an IO mapping from the ++ IO domain */ + pte = mfn_pte(phys, prot); + break; ++ ++ default: ++ /* By default, set_fixmap is used for hardware mappings */ ++ pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP)); ++ break; + } + + __native_set_fixmap(idx, pte); +@@ -1845,6 +2283,29 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) + #endif + } + ++__init void xen_ident_map_ISA(void) ++{ ++ unsigned long pa; ++ ++ /* ++ * If we're dom0, then linear map the ISA machine addresses into ++ * the kernel's address space. 
++ */ ++ if (!xen_initial_domain()) ++ return; ++ ++ xen_raw_printk("Xen: setup ISA identity maps\n"); ++ ++ for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) { ++ pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO); ++ ++ if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0)) ++ BUG(); ++ } ++ ++ xen_flush_tlb(); ++} ++ + static __init void xen_post_allocator_init(void) + { + pv_mmu_ops.set_pte = xen_set_pte; +@@ -1907,11 +2368,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { + .kmap_atomic_pte = xen_kmap_atomic_pte, + #endif + +-#ifdef CONFIG_X86_64 +- .set_pte = xen_set_pte, +-#else + .set_pte = xen_set_pte_init, +-#endif + .set_pte_at = xen_set_pte_at, + .set_pmd = xen_set_pmd_hyper, + +@@ -1960,8 +2417,305 @@ void __init xen_init_mmu_ops(void) + x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; + x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; + pv_mmu_ops = xen_mmu_ops; ++ ++ vmap_lazy_unmap = false; ++} ++ ++/* Protected by xen_reservation_lock. */ ++#define MAX_CONTIG_ORDER 9 /* 2MB */ ++static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; ++ ++#define VOID_PTE (mfn_pte(0, __pgprot(0))) ++static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, ++ unsigned long *in_frames, ++ unsigned long *out_frames) ++{ ++ int i; ++ struct multicall_space mcs; ++ ++ xen_mc_batch(); ++ for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) { ++ mcs = __xen_mc_entry(0); ++ ++ if (in_frames) ++ in_frames[i] = virt_to_mfn(vaddr); ++ ++ MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); ++ set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); ++ ++ if (out_frames) ++ out_frames[i] = virt_to_pfn(vaddr); ++ } ++ xen_mc_issue(0); ++} ++ ++/* ++ * Update the pfn-to-mfn mappings for a virtual address range, either to ++ * point to an array of mfns, or contiguously from a single starting ++ * mfn. ++ */ ++static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, ++ unsigned long *mfns, ++ unsigned long first_mfn) ++{ ++ unsigned i, limit; ++ unsigned long mfn; ++ ++ xen_mc_batch(); ++ ++ limit = 1u << order; ++ for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) { ++ struct multicall_space mcs; ++ unsigned flags; ++ ++ mcs = __xen_mc_entry(0); ++ if (mfns) ++ mfn = mfns[i]; ++ else ++ mfn = first_mfn + i; ++ ++ if (i < (limit - 1)) ++ flags = 0; ++ else { ++ if (order == 0) ++ flags = UVMF_INVLPG | UVMF_ALL; ++ else ++ flags = UVMF_TLB_FLUSH | UVMF_ALL; ++ } ++ ++ MULTI_update_va_mapping(mcs.mc, vaddr, ++ mfn_pte(mfn, PAGE_KERNEL), flags); ++ ++ set_phys_to_machine(virt_to_pfn(vaddr), mfn); ++ } ++ ++ xen_mc_issue(0); ++} ++ ++/* ++ * Perform the hypercall to exchange a region of our pfns to point to ++ * memory with the required contiguous alignment. Takes the pfns as ++ * input, and populates mfns as output. ++ * ++ * Returns a success code indicating whether the hypervisor was able to ++ * satisfy the request or not. 
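
To make this contract concrete, here is a hypothetical in-kernel caller of the xen_create_contiguous_region()/xen_destroy_contiguous_region() pair built on this exchange just below; the order (4, i.e. 64 KiB) and the 32-bit address width are invented for the sketch.

    static int example_alloc_dma_buffer(void)
    {
        unsigned long buf = __get_free_pages(GFP_KERNEL, 4);
        int rc;

        if (!buf)
            return -ENOMEM;

        rc = xen_create_contiguous_region(buf, 4, 32);
        if (rc == 0) {
            /* buf now covers 16 machine-contiguous frames below 2^32;
             * a device without an IOMMU could DMA here */
            xen_destroy_contiguous_region(buf, 4);  /* hand the extent back */
        }
        free_pages(buf, 4);
        return rc;
    }
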
++ */ ++static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in, ++ unsigned long *pfns_in, ++ unsigned long extents_out, unsigned int order_out, ++ unsigned long *mfns_out, ++ unsigned int address_bits) ++{ ++ long rc; ++ int success; ++ ++ struct xen_memory_exchange exchange = { ++ .in = { ++ .nr_extents = extents_in, ++ .extent_order = order_in, ++ .extent_start = pfns_in, ++ .domid = DOMID_SELF ++ }, ++ .out = { ++ .nr_extents = extents_out, ++ .extent_order = order_out, ++ .extent_start = mfns_out, ++ .address_bits = address_bits, ++ .domid = DOMID_SELF ++ } ++ }; ++ ++ BUG_ON(extents_in << order_in != extents_out << order_out); ++ ++ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); ++ success = (exchange.nr_exchanged == extents_in); ++ ++ BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); ++ BUG_ON(success && (rc != 0)); ++ ++ return success; + } + ++int xen_create_contiguous_region(unsigned long vstart, unsigned int order, ++ unsigned int address_bits) ++{ ++ unsigned long *in_frames = discontig_frames, out_frame; ++ unsigned long flags; ++ int success; ++ ++ /* ++ * Currently an auto-translated guest will not perform I/O, nor will ++ * it require PAE page directories below 4GB. Therefore any calls to ++ * this function are redundant and can be ignored. ++ */ ++ ++ if (xen_feature(XENFEAT_auto_translated_physmap)) ++ return 0; ++ ++ if (unlikely(order > MAX_CONTIG_ORDER)) ++ return -ENOMEM; ++ ++ memset((void *) vstart, 0, PAGE_SIZE << order); ++ ++ spin_lock_irqsave(&xen_reservation_lock, flags); ++ ++ /* 1. Zap current PTEs, remembering MFNs. */ ++ xen_zap_pfn_range(vstart, order, in_frames, NULL); ++ ++ /* 2. Get a new contiguous memory extent. */ ++ out_frame = virt_to_pfn(vstart); ++ success = xen_exchange_memory(1UL << order, 0, in_frames, ++ 1, order, &out_frame, ++ address_bits); ++ ++ /* 3. Map the new extent in place of old pages. */ ++ if (success) ++ xen_remap_exchanged_ptes(vstart, order, NULL, out_frame); ++ else ++ xen_remap_exchanged_ptes(vstart, order, in_frames, 0); ++ ++ spin_unlock_irqrestore(&xen_reservation_lock, flags); ++ ++ return success ? 0 : -ENOMEM; ++} ++EXPORT_SYMBOL_GPL(xen_create_contiguous_region); ++ ++void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) ++{ ++ unsigned long *out_frames = discontig_frames, in_frame; ++ unsigned long flags; ++ int success; ++ ++ if (xen_feature(XENFEAT_auto_translated_physmap)) ++ return; ++ ++ if (unlikely(order > MAX_CONTIG_ORDER)) ++ return; ++ ++ memset((void *) vstart, 0, PAGE_SIZE << order); ++ ++ spin_lock_irqsave(&xen_reservation_lock, flags); ++ ++ /* 1. Find start MFN of contiguous extent. */ ++ in_frame = virt_to_mfn(vstart); ++ ++ /* 2. Zap current PTEs. */ ++ xen_zap_pfn_range(vstart, order, NULL, out_frames); ++ ++ /* 3. Do the exchange for non-contiguous MFNs. */ ++ success = xen_exchange_memory(1, order, &in_frame, 1UL << order, ++ 0, out_frames, 0); ++ ++ /* 4. Map new pages in place of old pages. 
*/ ++ if (success) ++ xen_remap_exchanged_ptes(vstart, order, out_frames, 0); ++ else ++ xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); ++ ++ spin_unlock_irqrestore(&xen_reservation_lock, flags); ++} ++EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); ++ ++#define REMAP_BATCH_SIZE 16 ++ ++struct remap_data { ++ unsigned long mfn; ++ pgprot_t prot; ++ struct mmu_update *mmu_update; ++}; ++ ++static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, ++ unsigned long addr, void *data) ++{ ++ struct remap_data *rmd = data; ++ pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); ++ ++ rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr; ++ rmd->mmu_update->val = pte_val_ma(pte); ++ rmd->mmu_update++; ++ ++ return 0; ++} ++ ++int xen_remap_domain_mfn_range(struct vm_area_struct *vma, ++ unsigned long addr, ++ unsigned long mfn, int nr, ++ pgprot_t prot, unsigned domid) ++{ ++ struct remap_data rmd; ++ struct mmu_update mmu_update[REMAP_BATCH_SIZE]; ++ int batch; ++ unsigned long range; ++ int err = 0; ++ ++ prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); ++ ++ vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; ++ ++ rmd.mfn = mfn; ++ rmd.prot = prot; ++ ++ while (nr) { ++ batch = min(REMAP_BATCH_SIZE, nr); ++ range = (unsigned long)batch << PAGE_SHIFT; ++ ++ rmd.mmu_update = mmu_update; ++ err = apply_to_page_range(vma->vm_mm, addr, range, ++ remap_area_mfn_pte_fn, &rmd); ++ if (err) ++ goto out; ++ ++ err = -EFAULT; ++ if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0) ++ goto out; ++ ++ nr -= batch; ++ addr += range; ++ } ++ ++ err = 0; ++out: ++ ++ flush_tlb_all(); ++ ++ return err; ++} ++EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); ++ ++#ifdef CONFIG_XEN_PVHVM ++static void xen_hvm_exit_mmap(struct mm_struct *mm) ++{ ++ struct xen_hvm_pagetable_dying a; ++ int rc; ++ ++ a.domid = DOMID_SELF; ++ a.gpa = __pa(mm->pgd); ++ rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); ++ WARN_ON_ONCE(rc < 0); ++} ++ ++static int is_pagetable_dying_supported(void) ++{ ++ struct xen_hvm_pagetable_dying a; ++ int rc = 0; ++ ++ a.domid = DOMID_SELF; ++ a.gpa = 0x00; ++ rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); ++ if (rc < 0) { ++ printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n"); ++ return 0; ++ } ++ return 1; ++} ++ ++void __init xen_hvm_init_mmu_ops(void) ++{ ++ if (is_pagetable_dying_supported()) ++ pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; ++} ++#endif ++ + #ifdef CONFIG_XEN_DEBUG_FS + + static struct dentry *d_mmu_debug; +diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h +index 5fe6bc7..537bb9a 100644 +--- a/arch/x86/xen/mmu.h ++++ b/arch/x86/xen/mmu.h +@@ -12,7 +12,6 @@ enum pt_level { + + + bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); +-bool install_p2mtop_page(unsigned long pfn, unsigned long *p); + + void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); + +@@ -60,4 +59,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, + unsigned long xen_read_cr2_direct(void); + + extern void xen_init_mmu_ops(void); ++extern void xen_hvm_init_mmu_ops(void); + #endif /* _XEN_MMU_H */ +diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c +new file mode 100644 +index 0000000..4d55524 +--- /dev/null ++++ b/arch/x86/xen/pci-swiotlb-xen.c +@@ -0,0 +1,52 @@ ++/* Glue code to lib/swiotlb-xen.c */ ++ ++#include <linux/dma-mapping.h> ++#include <linux/swiotlb.h> ++ ++#include <asm/xen/hypervisor.h> ++ ++int xen_swiotlb __read_mostly; ++ ++static struct dma_map_ops 
xen_swiotlb_dma_ops = { ++ .mapping_error = xen_swiotlb_dma_mapping_error, ++ .alloc_coherent = xen_swiotlb_alloc_coherent, ++ .free_coherent = xen_swiotlb_free_coherent, ++ .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, ++ .sync_single_for_device = xen_swiotlb_sync_single_for_device, ++ .sync_single_range_for_cpu = xen_swiotlb_sync_single_range_for_cpu, ++ .sync_single_range_for_device = xen_swiotlb_sync_single_range_for_device, ++ .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu, ++ .sync_sg_for_device = xen_swiotlb_sync_sg_for_device, ++ .map_sg = xen_swiotlb_map_sg_attrs, ++ .unmap_sg = xen_swiotlb_unmap_sg_attrs, ++ .map_page = xen_swiotlb_map_page, ++ .unmap_page = xen_swiotlb_unmap_page, ++ .dma_supported = xen_swiotlb_dma_supported, ++}; ++ ++/* ++ * pci_swiotlb_detect - set swiotlb to 1 if necessary ++ * ++ * This returns non-zero if we are forced to use swiotlb (by the boot ++ * option). ++ */ ++int __init pci_xen_swiotlb_detect(void) ++{ ++ ++ if (xen_pv_domain() && (xen_initial_domain() || swiotlb)) ++ xen_swiotlb = 1; ++ ++ /* If we are running under Xen, we MUST disable the native SWIOTLB */ ++ if (xen_pv_domain()) ++ swiotlb = 0; ++ ++ return xen_swiotlb; ++} ++ ++void __init pci_xen_swiotlb_init(void) ++{ ++ if (xen_swiotlb) { ++ xen_swiotlb_init(1); ++ dma_ops = &xen_swiotlb_dma_ops; ++ } ++} +diff --git a/arch/x86/xen/pci.c b/arch/x86/xen/pci.c +new file mode 100644 +index 0000000..8ca31f1 +--- /dev/null ++++ b/arch/x86/xen/pci.c +@@ -0,0 +1,296 @@ ++#include <linux/kernel.h> ++#include <linux/acpi.h> ++#include <linux/pci.h> ++#include <linux/msi.h> ++#include <linux/slab.h> ++ ++#include <asm/mpspec.h> ++#include <asm/io_apic.h> ++#include <asm/pci_x86.h> ++ ++#include <asm/xen/hypervisor.h> ++#include <asm/xen/pci.h> ++ ++#include <xen/interface/xen.h> ++#include <xen/events.h> ++ ++#include "xen-ops.h" ++ ++int xen_register_pirq(u32 gsi, int triggering) ++{ ++ int rc, irq; ++ struct physdev_map_pirq map_irq; ++ int shareable = 0; ++ char *name; ++ ++ if (!xen_pv_domain()) ++ return -1; ++ ++ if (triggering == ACPI_EDGE_SENSITIVE) { ++ shareable = 0; ++ name = "ioapic-edge"; ++ } else { ++ shareable = 1; ++ name = "ioapic-level"; ++ } ++ ++ irq = xen_allocate_pirq(gsi, shareable, name); ++ ++ printk(KERN_DEBUG "xen: --> irq=%d\n", irq); ++ ++ if (irq < 0) ++ goto out; ++ ++ map_irq.domid = DOMID_SELF; ++ map_irq.type = MAP_PIRQ_TYPE_GSI; ++ map_irq.index = gsi; ++ map_irq.pirq = irq; ++ ++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); ++ if (rc) { ++ printk(KERN_WARNING "xen map irq failed %d\n", rc); ++ return -1; ++ } ++ ++out: ++ return irq; ++} ++ ++int xen_register_gsi(u32 gsi, int triggering, int polarity) ++{ ++ int rc, irq; ++ struct physdev_setup_gsi setup_gsi; ++ ++ if (!xen_pv_domain()) ++ return -1; ++ ++ printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n", ++ gsi, triggering, polarity); ++ ++ irq = xen_register_pirq(gsi, triggering); ++ ++ setup_gsi.gsi = gsi; ++ setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1); ++ setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 
0 : 1); ++ ++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi); ++ if (rc == -EEXIST) ++ printk(KERN_INFO "Already setup the GSI :%d\n", gsi); ++ else if (rc) { ++ printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n", ++ gsi, rc); ++ } ++ ++ return irq; ++} ++ ++#ifdef CONFIG_ACPI ++#define BAD_MADT_ENTRY(entry, end) ( \ ++ (!entry) || (unsigned long)entry + sizeof(*entry) > end || \ ++ ((struct acpi_subtable_header *)entry)->length < sizeof(*entry)) ++ ++ ++static int __init ++xen_acpi_parse_int_src_ovr(struct acpi_subtable_header * header, ++ const unsigned long end) ++{ ++ struct acpi_madt_interrupt_override *intsrc = NULL; ++ ++ intsrc = (struct acpi_madt_interrupt_override *)header; ++ ++ if (BAD_MADT_ENTRY(intsrc, end)) ++ return -EINVAL; ++ ++ acpi_table_print_madt_entry(header); ++ ++ if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) { ++ int gsi; ++ int trigger, polarity; ++ ++ trigger = intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK; ++ polarity = intsrc->inti_flags & ACPI_MADT_POLARITY_MASK; ++ ++ /* Command-line over-ride via acpi_sci= */ ++ if (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK) ++ trigger = acpi_sci_flags & ACPI_MADT_TRIGGER_MASK; ++ ++ if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK) ++ polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK; ++ ++ printk("xen: sci override: source_irq=%d global_irq=%d trigger=%x polarity=%x\n", ++ intsrc->source_irq, intsrc->global_irq, ++ trigger, polarity); ++ ++ switch (polarity) { ++ case ACPI_MADT_POLARITY_CONFORMS: ++ case ACPI_MADT_POLARITY_ACTIVE_LOW: ++ polarity = ACPI_ACTIVE_LOW; ++ break; ++ ++ case ACPI_MADT_POLARITY_ACTIVE_HIGH: ++ polarity = ACPI_ACTIVE_HIGH; ++ break; ++ ++ default: ++ return 0; ++ } ++ ++ switch (trigger) { ++ case ACPI_MADT_TRIGGER_CONFORMS: ++ case ACPI_MADT_TRIGGER_LEVEL: ++ trigger = ACPI_LEVEL_SENSITIVE; ++ break; ++ ++ case ACPI_MADT_TRIGGER_EDGE: ++ trigger = ACPI_EDGE_SENSITIVE; ++ break; ++ ++ default: ++ return 0; ++ } ++ ++ gsi = xen_register_gsi(intsrc->global_irq, ++ trigger, polarity); ++ /* ++ * stash over-ride to indicate we've been here ++ * and for later update of acpi_gbl_FADT ++ */ ++ acpi_sci_override_gsi = gsi; ++ ++ printk("xen: acpi sci %d\n", gsi); ++ } ++ ++ return 0; ++} ++ ++static __init void xen_setup_acpi_sci(void) ++{ ++ acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, ++ xen_acpi_parse_int_src_ovr, ++ nr_irqs); ++} ++#else ++static __init void xen_setup_acpi_sci(void) ++{ ++} ++#endif ++ ++void __init xen_setup_pirqs(void) ++{ ++ int irq; ++ ++ if (0 == nr_ioapics) { ++ for (irq = 0; irq < NR_IRQS_LEGACY; irq++) ++ xen_allocate_pirq(irq, 0, "xt-pic"); ++ return; ++ } ++ ++ /* Pre-allocate legacy irqs */ ++ for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { ++ int trigger, polarity; ++ ++ if (acpi_get_override_irq(irq, &trigger, &polarity) == -1) ++ continue; ++ ++ xen_register_pirq(irq, ++ trigger ? 
ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE); ++ } ++ ++ xen_setup_acpi_sci(); ++} ++ ++#ifdef CONFIG_PCI_MSI ++int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) ++{ ++ int irq, ret; ++ struct msi_desc *msidesc; ++ ++ list_for_each_entry(msidesc, &dev->msi_list, list) { ++ irq = xen_create_msi_irq(dev, msidesc, type); ++ if (irq < 0) ++ return -1; ++ ++ ret = set_irq_msi(irq, msidesc); ++ if (ret) ++ goto error; ++ } ++ return 0; ++ ++error: ++ xen_destroy_irq(irq); ++ return ret; ++} ++#endif ++ ++struct xen_device_domain_owner { ++ domid_t domain; ++ struct pci_dev *dev; ++ struct list_head list; ++}; ++ ++static DEFINE_SPINLOCK(dev_domain_list_spinlock); ++static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list); ++ ++static struct xen_device_domain_owner *find_device(struct pci_dev *dev) ++{ ++ struct xen_device_domain_owner *owner; ++ ++ list_for_each_entry(owner, &dev_domain_list, list) { ++ if (owner->dev == dev) ++ return owner; ++ } ++ return NULL; ++} ++ ++int xen_find_device_domain_owner(struct pci_dev *dev) ++{ ++ struct xen_device_domain_owner *owner; ++ int domain = -ENODEV; ++ ++ spin_lock(&dev_domain_list_spinlock); ++ owner = find_device(dev); ++ if (owner) ++ domain = owner->domain; ++ spin_unlock(&dev_domain_list_spinlock); ++ return domain; ++} ++EXPORT_SYMBOL(xen_find_device_domain_owner); ++ ++int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain) ++{ ++ struct xen_device_domain_owner *owner; ++ ++ owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL); ++ if (!owner) ++ return -ENODEV; ++ ++ spin_lock(&dev_domain_list_spinlock); ++ if (find_device(dev)) { ++ spin_unlock(&dev_domain_list_spinlock); ++ kfree(owner); ++ return -EEXIST; ++ } ++ owner->domain = domain; ++ owner->dev = dev; ++ list_add_tail(&owner->list, &dev_domain_list); ++ spin_unlock(&dev_domain_list_spinlock); ++ return 0; ++} ++EXPORT_SYMBOL(xen_register_device_domain_owner); ++ ++int xen_unregister_device_domain_owner(struct pci_dev *dev) ++{ ++ struct xen_device_domain_owner *owner; ++ ++ spin_lock(&dev_domain_list_spinlock); ++ owner = find_device(dev); ++ if (!owner) { ++ spin_unlock(&dev_domain_list_spinlock); ++ return -ENODEV; ++ } ++ list_del(&owner->list); ++ spin_unlock(&dev_domain_list_spinlock); ++ kfree(owner); ++ return 0; ++} ++EXPORT_SYMBOL(xen_unregister_device_domain_owner); +diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c +new file mode 100644 +index 0000000..0f45638 +--- /dev/null ++++ b/arch/x86/xen/platform-pci-unplug.c +@@ -0,0 +1,143 @@ ++/****************************************************************************** ++ * platform-pci-unplug.c ++ * ++ * Xen platform PCI device driver ++ * Copyright (c) 2010, Citrix ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms and conditions of the GNU General Public License, ++ * version 2, as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ * more details. ++ * ++ * You should have received a copy of the GNU General Public License along with ++ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++ * Place - Suite 330, Boston, MA 02111-1307 USA. 
++ * ++ */ ++ ++#include <linux/init.h> ++#include <linux/io.h> ++#include <linux/module.h> ++ ++#include <xen/platform_pci.h> ++ ++#define XEN_PLATFORM_ERR_MAGIC -1 ++#define XEN_PLATFORM_ERR_PROTOCOL -2 ++#define XEN_PLATFORM_ERR_BLACKLIST -3 ++ ++/* store the value of xen_emul_unplug after the unplug is done */ ++int xen_platform_pci_unplug; ++EXPORT_SYMBOL_GPL(xen_platform_pci_unplug); ++#ifdef CONFIG_XEN_PVHVM ++static int xen_emul_unplug; ++ ++static int __init check_platform_magic(void) ++{ ++ short magic; ++ char protocol; ++ ++ magic = inw(XEN_IOPORT_MAGIC); ++ if (magic != XEN_IOPORT_MAGIC_VAL) { ++ printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n"); ++ return XEN_PLATFORM_ERR_MAGIC; ++ } ++ ++ protocol = inb(XEN_IOPORT_PROTOVER); ++ ++ printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n", ++ protocol); ++ ++ switch (protocol) { ++ case 1: ++ outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM); ++ outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER); ++ if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) { ++ printk(KERN_ERR "Xen Platform: blacklisted by host\n"); ++ return XEN_PLATFORM_ERR_BLACKLIST; ++ } ++ break; ++ default: ++ printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version"); ++ return XEN_PLATFORM_ERR_PROTOCOL; ++ } ++ ++ return 0; ++} ++ ++void __init xen_unplug_emulated_devices(void) ++{ ++ int r; ++ ++ /* user explicitly requested no unplug */ ++ if (xen_emul_unplug & XEN_UNPLUG_NEVER) ++ return; ++ /* check the version of the xen platform PCI device */ ++ r = check_platform_magic(); ++ /* If the version matches enable the Xen platform PCI driver. ++ * Also enable the Xen platform PCI driver if the host does ++ * not support the unplug protocol (XEN_PLATFORM_ERR_MAGIC) ++ * but the user told us that unplugging is unnecessary. */ ++ if (r && !(r == XEN_PLATFORM_ERR_MAGIC && ++ (xen_emul_unplug & XEN_UNPLUG_UNNECESSARY))) ++ return; ++ /* Set the default value of xen_emul_unplug depending on whether or ++ * not the Xen PV frontends and the Xen platform PCI driver have ++ * been compiled for this kernel (modules or built-in are both OK). 
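
This policy is driven by the xen_emul_unplug= early parameter parsed at the bottom of this file. Typical kernel command-line uses, built only from the option names that parser accepts:

    xen_emul_unplug=ide-disks,nics   # unplug emulated IDE disks and NICs
    xen_emul_unplug=never            # keep every emulated device
    xen_emul_unplug=unnecessary      # assert that no unplug is needed
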
*/ ++ if (!xen_emul_unplug) { ++ if (xen_must_unplug_nics()) { ++ printk(KERN_INFO "Netfront and the Xen platform PCI driver have " ++ "been compiled for this kernel: unplug emulated NICs.\n"); ++ xen_emul_unplug |= XEN_UNPLUG_ALL_NICS; ++ } ++ if (xen_must_unplug_disks()) { ++ printk(KERN_INFO "Blkfront and the Xen platform PCI driver have " ++ "been compiled for this kernel: unplug emulated disks.\n" ++ "You might have to change the root device\n" ++ "from /dev/hd[a-d] to /dev/xvd[a-d]\n" ++ "in your root= kernel command line option\n"); ++ xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS; ++ } ++ } ++ /* Now unplug the emulated devices */ ++ if (!(xen_emul_unplug & XEN_UNPLUG_UNNECESSARY)) ++ outw(xen_emul_unplug, XEN_IOPORT_UNPLUG); ++ xen_platform_pci_unplug = xen_emul_unplug; ++} ++ ++static int __init parse_xen_emul_unplug(char *arg) ++{ ++ char *p, *q; ++ int l; ++ ++ for (p = arg; p; p = q) { ++ q = strchr(p, ','); ++ if (q) { ++ l = q - p; ++ q++; ++ } else { ++ l = strlen(p); ++ } ++ if (!strncmp(p, "all", l)) ++ xen_emul_unplug |= XEN_UNPLUG_ALL; ++ else if (!strncmp(p, "ide-disks", l)) ++ xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS; ++ else if (!strncmp(p, "aux-ide-disks", l)) ++ xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS; ++ else if (!strncmp(p, "nics", l)) ++ xen_emul_unplug |= XEN_UNPLUG_ALL_NICS; ++ else if (!strncmp(p, "unnecessary", l)) ++ xen_emul_unplug |= XEN_UNPLUG_UNNECESSARY; ++ else if (!strncmp(p, "never", l)) ++ xen_emul_unplug |= XEN_UNPLUG_NEVER; ++ else ++ printk(KERN_WARNING "unrecognised option '%s' " ++ "in parameter 'xen_emul_unplug'\n", p); ++ } ++ return 0; ++} ++early_param("xen_emul_unplug", parse_xen_emul_unplug); ++#endif +diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c +index ad0047f..1a1934a 100644 +--- a/arch/x86/xen/setup.c ++++ b/arch/x86/xen/setup.c +@@ -10,6 +10,7 @@ + #include <linux/pm.h> + + #include <asm/elf.h> ++#include <asm/hpet.h> + #include <asm/vdso.h> + #include <asm/e820.h> + #include <asm/setup.h> +@@ -19,7 +20,9 @@ + + #include <xen/page.h> + #include <xen/interface/callback.h> ++#include <xen/interface/memory.h> + #include <xen/interface/physdev.h> ++#include <xen/interface/memory.h> + #include <xen/features.h> + + #include "xen-ops.h" +@@ -32,25 +35,178 @@ extern void xen_sysenter_target(void); + extern void xen_syscall_target(void); + extern void xen_syscall32_target(void); + ++/* Amount of extra memory space we add to the e820 ranges */ ++phys_addr_t xen_extra_mem_start, xen_extra_mem_size; ++ ++/* ++ * The maximum amount of extra memory compared to the base size. The ++ * main scaling factor is the size of struct page. At extreme ratios ++ * of base:extra, all the base memory can be filled with page ++ * structures for the extra memory, leaving no space for anything ++ * else. ++ * ++ * 10x seems like a reasonable balance between scaling flexibility and ++ * leaving a practically usable system. 
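
Plugging numbers into this ratio, for a 64-bit domU started with 512 MiB (where the MAXMEM term in the clamp applied by xen_memory_setup() below never bites; the figures are illustrative):

    /* Worked example of the EXTRA_MEM_RATIO clamp:
     *
     *   base allocation           512 MiB -> max_pfn = 131072 pages
     *   extra_limit  = 10 * 131072        = 1310720 pages (5 GiB)
     *   extra_pages <= 1310720 - 131072   = 1179648 pages (4.5 GiB)
     *
     * i.e. the e820 gains at most 4.5 GiB of balloonable "extra" space,
     * so its struct page arrays cannot swallow the base memory.
     */
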
++ */ ++#define EXTRA_MEM_RATIO (10) ++ ++static __init void xen_add_extra_mem(unsigned long pages) ++{ ++ u64 size = (u64)pages * PAGE_SIZE; ++ u64 extra_start = xen_extra_mem_start + xen_extra_mem_size; ++ ++ if (!pages) ++ return; ++ ++ e820_add_region(extra_start, size, E820_RAM); ++ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); ++ ++ reserve_early(extra_start, extra_start + size, "XEN EXTRA"); ++ ++ xen_extra_mem_size += size; ++ ++ xen_max_p2m_pfn = PFN_DOWN(extra_start + size); ++} ++ ++static unsigned long __init xen_release_chunk(phys_addr_t start_addr, ++ phys_addr_t end_addr) ++{ ++ struct xen_memory_reservation reservation = { ++ .address_bits = 0, ++ .extent_order = 0, ++ .domid = DOMID_SELF ++ }; ++ unsigned long start, end; ++ unsigned long len = 0; ++ unsigned long pfn; ++ int ret; ++ ++ start = PFN_UP(start_addr); ++ end = PFN_DOWN(end_addr); ++ ++ if (end <= start) ++ return 0; ++ ++ printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ", ++ start, end); ++ for(pfn = start; pfn < end; pfn++) { ++ unsigned long mfn = pfn_to_mfn(pfn); ++ ++ /* Make sure pfn exists to start with */ ++ if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) ++ continue; ++ ++ set_xen_guest_handle(reservation.extent_start, &mfn); ++ reservation.nr_extents = 1; ++ ++ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, ++ &reservation); ++ WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", ++ start, end, ret); ++ if (ret == 1) { ++ set_phys_to_machine(pfn, INVALID_P2M_ENTRY); ++ len++; ++ } ++ } ++ printk(KERN_CONT "%ld pages freed\n", len); ++ ++ return len; ++} ++ ++static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, ++ const struct e820map *e820) ++{ ++ phys_addr_t max_addr = PFN_PHYS(max_pfn); ++ phys_addr_t last_end = ISA_END_ADDRESS; ++ unsigned long released = 0; ++ int i; ++ ++ /* Free any unused memory above the low 1Mbyte. */ ++ for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { ++ phys_addr_t end = e820->map[i].addr; ++ end = min(max_addr, end); ++ ++ if (last_end < end) ++ released += xen_release_chunk(last_end, end); ++ last_end = max(last_end, e820->map[i].addr + e820->map[i].size); ++ } ++ ++ if (last_end < max_addr) ++ released += xen_release_chunk(last_end, max_addr); ++ ++ printk(KERN_INFO "released %ld pages of unused memory\n", released); ++ return released; ++} + + /** + * machine_specific_memory_setup - Hook for machine specific memory setup. + **/ +- + char * __init xen_memory_setup(void) + { ++ static struct e820entry map[E820MAX] __initdata; ++ + unsigned long max_pfn = xen_start_info->nr_pages; ++ unsigned long long mem_end; ++ int rc; ++ struct xen_memory_map memmap; ++ unsigned long extra_pages = 0; ++ unsigned long extra_limit; ++ int op; ++ int i; + + max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); ++ mem_end = PFN_PHYS(max_pfn); ++ ++ memmap.nr_entries = E820MAX; ++ set_xen_guest_handle(memmap.buffer, map); ++ ++ op = xen_initial_domain() ? ++ XENMEM_machine_memory_map : ++ XENMEM_memory_map; ++ rc = HYPERVISOR_memory_op(op, &memmap); ++ if (rc == -ENOSYS) { ++ BUG_ON(xen_initial_domain()); ++ memmap.nr_entries = 1; ++ map[0].addr = 0ULL; ++ map[0].size = mem_end; ++ /* 8MB slack (to balance backend allocations). 
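
To see what xen_release_chunk()/xen_return_unused_memory() above actually free, consider a dom0 looking at a host-style machine map; the addresses here are invented for illustration:

    /* Simplified host e820 as seen by dom0:
     *
     *   [0x00000000, 0xcff00000)   E820_RAM
     *   [0xcff00000, 0xd0000000)   E820_RESERVED (ACPI etc.)
     *   no entry up to 4 GiB       (PCI/MMIO hole)
     *   [0x100000000, top)         E820_RAM
     *
     * The pfns that fall into the undescribed hole below 4 GiB are handed
     * back frame by frame via XENMEM_decrease_reservation, and each freed
     * pfn's p2m slot is set to INVALID_P2M_ENTRY.
     */
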
*/ ++ map[0].size += 8ULL << 20; ++ map[0].type = E820_RAM; ++ rc = 0; ++ } ++ BUG_ON(rc); + + e820.nr_map = 0; ++ xen_extra_mem_start = mem_end; ++ for (i = 0; i < memmap.nr_entries; i++) { ++ unsigned long long end = map[i].addr + map[i].size; ++ ++ if (map[i].type == E820_RAM && end > mem_end) { ++ /* RAM off the end - may be partially included */ ++ u64 delta = min(map[i].size, end - mem_end); + +- e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM); ++ map[i].size -= delta; ++ end -= delta; ++ ++ extra_pages += PFN_DOWN(delta); ++ } ++ ++ if (map[i].size > 0 && end > xen_extra_mem_start) ++ xen_extra_mem_start = end; ++ ++ /* Add region if any remains */ ++ if (map[i].size > 0) ++ e820_add_region(map[i].addr, map[i].size, map[i].type); ++ } + + /* +- * Even though this is normal, usable memory under Xen, reserve +- * ISA memory anyway because too many things think they can poke ++ * In domU, the ISA region is normal, usable memory, but we ++ * reserve ISA memory anyway because too many things poke + * about in there. ++ * ++ * In Dom0, the host E820 information can leave gaps in the ++ * ISA range, which would cause us to release those pages. To ++ * avoid this, we unconditionally reserve them here. + */ + e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, + E820_RESERVED); +@@ -67,21 +223,30 @@ char * __init xen_memory_setup(void) + + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + +- return "Xen"; +-} ++ extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); + +-static void xen_idle(void) +-{ +- local_irq_disable(); +- +- if (need_resched()) +- local_irq_enable(); +- else { +- current_thread_info()->status &= ~TS_POLLING; +- smp_mb__after_clear_bit(); +- safe_halt(); +- current_thread_info()->status |= TS_POLLING; +- } ++ /* ++ * Clamp the amount of extra memory to a EXTRA_MEM_RATIO ++ * factor the base size. On non-highmem systems, the base ++ * size is the full initial memory allocation; on highmem it ++ * is limited to the max size of lowmem, so that it doesn't ++ * get completely filled. ++ * ++ * In principle there could be a problem in lowmem systems if ++ * the initial memory is also very large with respect to ++ * lowmem, but we won't try to deal with that here. ++ */ ++ extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), ++ max_pfn + extra_pages); ++ ++ if (extra_limit >= max_pfn) ++ extra_pages = extra_limit - max_pfn; ++ else ++ extra_pages = 0; ++ ++ xen_add_extra_mem(extra_pages); ++ ++ return "Xen"; + } + + /* +@@ -156,6 +321,8 @@ void __init xen_arch_setup(void) + struct physdev_set_iopl set_iopl; + int rc; + ++ xen_panic_handler_init(); ++ + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); + +@@ -182,13 +349,21 @@ void __init xen_arch_setup(void) + } + #endif + ++ /* ++ * Xen hypervisor uses HPET to wakeup cpu from deep c-states, ++ * so the HPET usage in dom0 must be forbidden. ++ */ ++ disable_hpet(NULL); ++ + memcpy(boot_command_line, xen_start_info->cmd_line, + MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? 
+ COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); + +- pm_idle = xen_idle; +- +- paravirt_disable_iospace(); ++ /* Set up idle, making sure it calls safe_halt() pvop */ ++#ifdef CONFIG_X86_32 ++ boot_cpu_data.hlt_works_ok = 1; ++#endif ++ pm_idle = default_idle; + + fiddle_vdso(); + } +diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c +index ca5f56e..3e06a9e 100644 +--- a/arch/x86/xen/smp.c ++++ b/arch/x86/xen/smp.c +@@ -178,11 +178,18 @@ static void __init xen_smp_prepare_boot_cpu(void) + static void __init xen_smp_prepare_cpus(unsigned int max_cpus) + { + unsigned cpu; ++ unsigned int i; + + xen_init_lock_cpu(0); + + smp_store_cpu_info(0); + cpu_data(0).x86_max_cores = 1; ++ ++ for_each_possible_cpu(i) { ++ zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); ++ zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); ++ zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); ++ } + set_cpu_sibling_map(0); + + if (xen_smp_intr_init(0)) +@@ -299,6 +306,8 @@ static int __cpuinit xen_cpu_up(unsigned int cpu) + xen_setup_timer(cpu); + xen_init_lock_cpu(cpu); + ++ cpumask_set_cpu(cpu, cpu_callout_mask); ++ + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; + + /* make sure interrupts start blocked */ +@@ -392,6 +401,8 @@ static void stop_self(void *v) + load_cr3(swapper_pg_dir); + /* should set up a minimal gdt */ + ++ set_cpu_online(cpu, false); ++ + HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); + BUG(); + } +diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c +index a9c6611..1d789d5 100644 +--- a/arch/x86/xen/suspend.c ++++ b/arch/x86/xen/suspend.c +@@ -26,6 +26,18 @@ void xen_pre_suspend(void) + BUG(); + } + ++void xen_hvm_post_suspend(int suspend_cancelled) ++{ ++ int cpu; ++ xen_hvm_init_shared_info(); ++ xen_callback_vector(); ++ if (xen_feature(XENFEAT_hvm_safe_pvclock)) { ++ for_each_online_cpu(cpu) { ++ xen_setup_runstate_info(cpu); ++ } ++ } ++} ++ + void xen_post_suspend(int suspend_cancelled) + { + xen_build_mfn_list_list(); +diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c +index 8e04980..ab35140 100644 +--- a/arch/x86/xen/time.c ++++ b/arch/x86/xen/time.c +@@ -19,6 +19,7 @@ + #include <asm/xen/hypercall.h> + + #include <xen/events.h> ++#include <xen/features.h> + #include <xen/interface/xen.h> + #include <xen/interface/vcpu.h> + +@@ -155,7 +156,7 @@ static void do_stolen_accounting(void) + } + + /* Get the TSC speed from Xen */ +-unsigned long xen_tsc_khz(void) ++static unsigned long xen_tsc_khz(void) + { + struct pvclock_vcpu_time_info *info = + &HYPERVISOR_shared_info->vcpu_info[0].time; +@@ -190,7 +191,7 @@ static void xen_read_wallclock(struct timespec *ts) + put_cpu_var(xen_vcpu); + } + +-unsigned long xen_get_wallclock(void) ++static unsigned long xen_get_wallclock(void) + { + struct timespec ts; + +@@ -198,10 +199,24 @@ unsigned long xen_get_wallclock(void) + return ts.tv_sec; + } + +-int xen_set_wallclock(unsigned long now) ++static int xen_set_wallclock(unsigned long now) + { ++ struct xen_platform_op op; ++ int rc; ++ + /* do nothing for domU */ +- return -1; ++ if (!xen_initial_domain()) ++ return -1; ++ ++ op.cmd = XENPF_settime; ++ op.u.settime.secs = now; ++ op.u.settime.nsecs = 0; ++ op.u.settime.system_time = xen_clocksource_read(); ++ ++ rc = HYPERVISOR_dom0_op(&op); ++ WARN(rc != 0, "XENPF_settime failed: now=%ld\n", now); ++ ++ return rc; + } + + static struct clocksource xen_clocksource __read_mostly = { +@@ -403,6 +418,8 @@ void xen_setup_timer(int cpu) + + evt->cpumask = cpumask_of(cpu); + evt->irq = irq; ++ ++ xen_setup_runstate_info(cpu); 
+ } + + void xen_teardown_timer(int cpu) +@@ -424,6 +441,8 @@ void xen_timer_resume(void) + { + int cpu; + ++ pvclock_resume(); ++ + if (xen_clockevent != &xen_vcpuop_clockevent) + return; + +@@ -433,7 +452,7 @@ void xen_timer_resume(void) + } + } + +-__init void xen_time_init(void) ++static __init void xen_time_init(void) + { + int cpu = smp_processor_id(); + +@@ -457,3 +476,51 @@ __init void xen_time_init(void) + xen_setup_timer(cpu); + xen_setup_cpu_clockevents(); + } ++ ++static const struct pv_time_ops xen_time_ops __initdata = { ++ .sched_clock = xen_clocksource_read, ++}; ++ ++__init void xen_init_time_ops(void) ++{ ++ pv_time_ops = xen_time_ops; ++ ++ x86_init.timers.timer_init = xen_time_init; ++ x86_init.timers.setup_percpu_clockev = x86_init_noop; ++ x86_cpuinit.setup_percpu_clockev = x86_init_noop; ++ ++ x86_platform.calibrate_tsc = xen_tsc_khz; ++ x86_platform.get_wallclock = xen_get_wallclock; ++ x86_platform.set_wallclock = xen_set_wallclock; ++} ++ ++#ifdef CONFIG_XEN_PVHVM ++static void xen_hvm_setup_cpu_clockevents(void) ++{ ++ int cpu = smp_processor_id(); ++ xen_setup_runstate_info(cpu); ++ xen_setup_timer(cpu); ++ xen_setup_cpu_clockevents(); ++} ++ ++__init void xen_hvm_init_time_ops(void) ++{ ++ /* vector callback is needed otherwise we cannot receive interrupts ++ * on cpu > 0 */ ++ if (!xen_have_vector_callback && num_present_cpus() > 1) ++ return; ++ if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { ++ printk(KERN_INFO "Xen doesn't support pvclock on HVM," ++ "disable pv timer\n"); ++ return; ++ } ++ ++ pv_time_ops = xen_time_ops; ++ x86_init.timers.setup_percpu_clockev = xen_time_init; ++ x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents; ++ ++ x86_platform.calibrate_tsc = xen_tsc_khz; ++ x86_platform.get_wallclock = xen_get_wallclock; ++ x86_platform.set_wallclock = xen_set_wallclock; ++} ++#endif +diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c +new file mode 100644 +index 0000000..1cd7f4d +--- /dev/null ++++ b/arch/x86/xen/vga.c +@@ -0,0 +1,67 @@ ++#include <linux/screen_info.h> ++#include <linux/init.h> ++ ++#include <asm/bootparam.h> ++#include <asm/setup.h> ++ ++#include <xen/interface/xen.h> ++ ++#include "xen-ops.h" ++ ++void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) ++{ ++ struct screen_info *screen_info = &boot_params.screen_info; ++ ++ /* This is drawn from a dump from vgacon:startup in ++ * standard Linux. 
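
xen_init_time_ops() and xen_hvm_init_time_ops() above are instances of the x86 platform-hook pattern of this kernel generation: paravirt code swaps function pointers in x86_init/x86_platform before generic setup invokes them, so no call site needs patching. A sketch under hypothetical names (demo_calibrate_tsc is made up; x86_init_noop is the stock no-op):

    #include <linux/init.h>
    #include <asm/x86_init.h>

    static unsigned long demo_calibrate_tsc(void)
    {
            return 2000000; /* kHz: pretend the hypervisor reports 2 GHz */
    }

    void __init demo_init_time_ops(void)
    {
            x86_platform.calibrate_tsc = demo_calibrate_tsc;
            /* suppress the native per-cpu clockevent setup */
            x86_init.timers.setup_percpu_clockev = x86_init_noop;
            x86_cpuinit.setup_percpu_clockev = x86_init_noop;
    }
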
*/ ++ screen_info->orig_video_mode = 3; ++ screen_info->orig_video_isVGA = 1; ++ screen_info->orig_video_lines = 25; ++ screen_info->orig_video_cols = 80; ++ screen_info->orig_video_ega_bx = 3; ++ screen_info->orig_video_points = 16; ++ screen_info->orig_y = screen_info->orig_video_lines - 1; ++ ++ switch (info->video_type) { ++ case XEN_VGATYPE_TEXT_MODE_3: ++ if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3) ++ + sizeof(info->u.text_mode_3)) ++ break; ++ screen_info->orig_video_lines = info->u.text_mode_3.rows; ++ screen_info->orig_video_cols = info->u.text_mode_3.columns; ++ screen_info->orig_x = info->u.text_mode_3.cursor_x; ++ screen_info->orig_y = info->u.text_mode_3.cursor_y; ++ screen_info->orig_video_points = ++ info->u.text_mode_3.font_height; ++ break; ++ ++ case XEN_VGATYPE_VESA_LFB: ++ if (size < offsetof(struct dom0_vga_console_info, ++ u.vesa_lfb.gbl_caps)) ++ break; ++ screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB; ++ screen_info->lfb_width = info->u.vesa_lfb.width; ++ screen_info->lfb_height = info->u.vesa_lfb.height; ++ screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel; ++ screen_info->lfb_base = info->u.vesa_lfb.lfb_base; ++ screen_info->lfb_size = info->u.vesa_lfb.lfb_size; ++ screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line; ++ screen_info->red_size = info->u.vesa_lfb.red_size; ++ screen_info->red_pos = info->u.vesa_lfb.red_pos; ++ screen_info->green_size = info->u.vesa_lfb.green_size; ++ screen_info->green_pos = info->u.vesa_lfb.green_pos; ++ screen_info->blue_size = info->u.vesa_lfb.blue_size; ++ screen_info->blue_pos = info->u.vesa_lfb.blue_pos; ++ screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size; ++ screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos; ++ if (size >= offsetof(struct dom0_vga_console_info, ++ u.vesa_lfb.gbl_caps) ++ + sizeof(info->u.vesa_lfb.gbl_caps)) ++ screen_info->capabilities = info->u.vesa_lfb.gbl_caps; ++ if (size >= offsetof(struct dom0_vga_console_info, ++ u.vesa_lfb.mode_attrs) ++ + sizeof(info->u.vesa_lfb.mode_attrs)) ++ screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs; ++ break; ++ } ++} +diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h +index f9153a3..ebbee21 100644 +--- a/arch/x86/xen/xen-ops.h ++++ b/arch/x86/xen/xen-ops.h +@@ -30,6 +30,10 @@ void xen_setup_machphys_mapping(void); + pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); + void xen_ident_map_ISA(void); + void xen_reserve_top(void); ++void xen_ident_map_ISA(void); ++extern unsigned long xen_max_p2m_pfn; ++ ++void xen_set_pat(u64); + + char * __init xen_memory_setup(void); + void __init xen_arch_setup(void); +@@ -38,6 +42,10 @@ void xen_enable_sysenter(void); + void xen_enable_syscall(void); + void xen_vcpu_restore(void); + ++void xen_callback_vector(void); ++void xen_hvm_init_shared_info(void); ++void __init xen_unplug_emulated_devices(void); ++ + void __init xen_build_dynamic_phys_to_machine(void); + + void xen_init_irq_ops(void); +@@ -46,11 +54,8 @@ void xen_setup_runstate_info(int cpu); + void xen_teardown_timer(int cpu); + cycle_t xen_clocksource_read(void); + void xen_setup_cpu_clockevents(void); +-unsigned long xen_tsc_khz(void); +-void __init xen_time_init(void); +-unsigned long xen_get_wallclock(void); +-int xen_set_wallclock(unsigned long time); +-unsigned long long xen_sched_clock(void); ++void __init xen_init_time_ops(void); ++void __init xen_hvm_init_time_ops(void); + + irqreturn_t xen_debug_interrupt(int irq, void *dev_id); + +@@ -82,6 +87,23 @@ static inline void 
xen_uninit_lock_cpu(int cpu) + } + #endif + ++struct dom0_vga_console_info; ++ ++#ifdef CONFIG_XEN_DOM0 ++void xen_init_vga(const struct dom0_vga_console_info *, size_t size); ++#else ++static inline void xen_init_vga(const struct dom0_vga_console_info *info, ++ size_t size) ++{ ++} ++#endif ++ ++#ifdef CONFIG_XEN_DOM0 ++void xen_init_apic(void); ++#else ++static inline void xen_init_apic(void) {} ++#endif ++ + /* Declare an asm function, along with symbols needed to make it + inlineable */ + #define DECL_ASM(ret, name, ...) \ +@@ -101,4 +123,6 @@ void xen_sysret32(void); + void xen_sysret64(void); + void xen_adjust_exception_frame(void); + ++extern int xen_panic_handler_init(void); ++ + #endif /* XEN_OPS_H */ +diff --git a/block/blk-core.c b/block/blk-core.c +index 71da511..32d305c 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -439,6 +439,7 @@ void blk_put_queue(struct request_queue *q) + { + kobject_put(&q->kobj); + } ++EXPORT_SYMBOL_GPL(blk_put_queue); + + void blk_cleanup_queue(struct request_queue *q) + { +@@ -612,6 +613,7 @@ int blk_get_queue(struct request_queue *q) + + return 1; + } ++EXPORT_SYMBOL_GPL(blk_get_queue); + + static inline void blk_free_request(struct request_queue *q, struct request *rq) + { +diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile +index 7702118..1be123c 100644 +--- a/drivers/acpi/Makefile ++++ b/drivers/acpi/Makefile +@@ -61,6 +61,7 @@ obj-$(CONFIG_ACPI_POWER_METER) += power_meter.o + # processor has its own "processor." module_param namespace + processor-y := processor_core.o processor_throttling.o + processor-y += processor_idle.o processor_thermal.o ++processor-y += processor_xen.o + processor-$(CONFIG_CPU_FREQ) += processor_perflib.o + + obj-$(CONFIG_ACPI_PROCESSOR_AGGREGATOR) += acpi_pad.o +diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c +index 28ccdbc..b0f9ed6 100644 +--- a/drivers/acpi/acpi_memhotplug.c ++++ b/drivers/acpi/acpi_memhotplug.c +@@ -31,6 +31,7 @@ + #include <linux/types.h> + #include <linux/memory_hotplug.h> + #include <acpi/acpi_drivers.h> ++#include <xen/acpi.h> + + #define ACPI_MEMORY_DEVICE_CLASS "memory" + #define ACPI_MEMORY_DEVICE_HID "PNP0C80" +@@ -70,21 +71,6 @@ static struct acpi_driver acpi_memory_device_driver = { + }, + }; + +-struct acpi_memory_info { +- struct list_head list; +- u64 start_addr; /* Memory Range start physical addr */ +- u64 length; /* Memory Range length */ +- unsigned short caching; /* memory cache attribute */ +- unsigned short write_protect; /* memory read/write attribute */ +- unsigned int enabled:1; +-}; +- +-struct acpi_memory_device { +- struct acpi_device * device; +- unsigned int state; /* State of the memory device */ +- struct list_head res_list; +-}; +- + static int acpi_hotmem_initialized; + + static acpi_status +@@ -228,6 +214,9 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) + return result; + } + ++ if (xen_initial_domain()) ++ return xen_hotadd_memory(mem_device); ++ + node = acpi_get_node(mem_device->device->handle); + /* + * Tell the VM there is more memory here... 
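
xen_init_vga(), added further up, copes with hypervisors of different vintages by size-gating every optional field: a member of dom0_vga_console_info is read only if the caller-supplied size proves the hypervisor actually filled it in. The same idiom in isolation, with hypothetical stand-in types (this compiles and runs in user space):

    #include <stddef.h>

    struct demo_info_v2 {
            int rows, cols;         /* present since the first revision */
            int cursor_x;           /* added in a later ABI revision */
    };

    static int demo_cursor_x(const struct demo_info_v2 *info, size_t size)
    {
            if (size >= offsetof(struct demo_info_v2, cursor_x) +
                        sizeof(info->cursor_x))
                    return info->cursor_x;
            return 0;       /* caller predates the field; use a default */
    }
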
+diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c +index cc22f9a..747d96f 100644 +--- a/drivers/acpi/acpica/hwsleep.c ++++ b/drivers/acpi/acpica/hwsleep.c +@@ -47,6 +47,9 @@ + #include "actables.h" + #include <linux/tboot.h> + ++#include <xen/acpi.h> ++#include <asm/xen/hypervisor.h> ++ + #define _COMPONENT ACPI_HARDWARE + ACPI_MODULE_NAME("hwsleep") + +@@ -346,6 +349,19 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state) + tboot_sleep(sleep_state, pm1a_control, pm1b_control); + + /* Write #2: Write both SLP_TYP + SLP_EN */ ++ if (xen_pv_acpi()) { ++ int err; ++ ++ err = acpi_notify_hypervisor_state(sleep_state, ++ pm1a_control, pm1b_control); ++ if (err) { ++ ACPI_DEBUG_PRINT((ACPI_DB_INIT, ++ "Hypervisor failure [%d]\n", err)); ++ return_ACPI_STATUS(AE_ERROR); ++ } ++ ++ return_ACPI_STATUS(AE_OK); ++ } + + status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control); + if (ACPI_FAILURE(status)) { +diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c +index 7102474..2428cc0 100644 +--- a/drivers/acpi/processor_core.c ++++ b/drivers/acpi/processor_core.c +@@ -58,6 +58,7 @@ + #include <acpi/acpi_bus.h> + #include <acpi/acpi_drivers.h> + #include <acpi/processor.h> ++#include <xen/acpi.h> + + #define PREFIX "ACPI: " + +@@ -81,11 +82,9 @@ MODULE_DESCRIPTION("ACPI Processor Driver"); + MODULE_LICENSE("GPL"); + + static int acpi_processor_add(struct acpi_device *device); +-static int acpi_processor_remove(struct acpi_device *device, int type); + #ifdef CONFIG_ACPI_PROCFS + static int acpi_processor_info_open_fs(struct inode *inode, struct file *file); + #endif +-static void acpi_processor_notify(struct acpi_device *device, u32 event); + static acpi_status acpi_processor_hotadd_init(acpi_handle handle, int *p_cpu); + static int acpi_processor_handle_eject(struct acpi_processor *pr); + +@@ -247,7 +246,7 @@ static int acpi_processor_errata_piix4(struct pci_dev *dev) + return 0; + } + +-static int acpi_processor_errata(struct acpi_processor *pr) ++int acpi_processor_errata(struct acpi_processor *pr) + { + int result = 0; + struct pci_dev *dev = NULL; +@@ -278,7 +277,7 @@ static int acpi_processor_errata(struct acpi_processor *pr) + * _PDC is required for a BIOS-OS handshake for most of the newer + * ACPI processor features. 
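
The hwsleep.c hunk above reroutes the final S-state entry: under PV ACPI, dom0 must not write SLP_TYP/SLP_EN into PM1 control itself, so it asks Xen to perform the write via acpi_notify_hypervisor_state(). Condensed shape of that change, debug printout trimmed; a sketch, not a drop-in replacement:

    if (xen_pv_acpi()) {
            if (acpi_notify_hypervisor_state(sleep_state,
                                             pm1a_control, pm1b_control))
                    return_ACPI_STATUS(AE_ERROR);
            return_ACPI_STATUS(AE_OK);      /* Xen performed the write */
    }
    status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control);
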
+ */ +-static int acpi_processor_set_pdc(struct acpi_processor *pr) ++int acpi_processor_set_pdc(struct acpi_processor *pr) + { + struct acpi_object_list *pdc_in = pr->pdc; + acpi_status status = AE_OK; +@@ -347,7 +346,7 @@ static int acpi_processor_info_open_fs(struct inode *inode, struct file *file) + PDE(inode)->data); + } + +-static int acpi_processor_add_fs(struct acpi_device *device) ++int acpi_processor_add_fs(struct acpi_device *device) + { + struct proc_dir_entry *entry = NULL; + +@@ -386,7 +385,7 @@ static int acpi_processor_add_fs(struct acpi_device *device) + return -EIO; + return 0; + } +-static int acpi_processor_remove_fs(struct acpi_device *device) ++int acpi_processor_remove_fs(struct acpi_device *device) + { + + if (acpi_device_dir(device)) { +@@ -402,15 +401,6 @@ static int acpi_processor_remove_fs(struct acpi_device *device) + + return 0; + } +-#else +-static inline int acpi_processor_add_fs(struct acpi_device *device) +-{ +- return 0; +-} +-static inline int acpi_processor_remove_fs(struct acpi_device *device) +-{ +- return 0; +-} + #endif + + /* Use the acpiid in MADT to map cpus in case of SMP */ +@@ -705,7 +695,7 @@ static int acpi_processor_get_info(struct acpi_device *device) + + static DEFINE_PER_CPU(void *, processor_device_array); + +-static void acpi_processor_notify(struct acpi_device *device, u32 event) ++void acpi_processor_notify(struct acpi_device *device, u32 event) + { + struct acpi_processor *pr = acpi_driver_data(device); + int saved; +@@ -873,7 +863,7 @@ err_free_cpumask: + return result; + } + +-static int acpi_processor_remove(struct acpi_device *device, int type) ++int acpi_processor_remove(struct acpi_device *device, int type) + { + struct acpi_processor *pr = NULL; + +@@ -1148,7 +1138,11 @@ static int __init acpi_processor_init(void) + if (result < 0) + goto out_proc; + +- result = acpi_bus_register_driver(&acpi_processor_driver); ++ if (xen_initial_domain()) ++ result = xen_acpi_processor_init(); ++ else ++ result = acpi_bus_register_driver(&acpi_processor_driver); ++ + if (result < 0) + goto out_cpuidle; + +@@ -1184,7 +1178,10 @@ static void __exit acpi_processor_exit(void) + + acpi_processor_uninstall_hotplug_notify(); + +- acpi_bus_unregister_driver(&acpi_processor_driver); ++ if (xen_initial_domain()) ++ xen_acpi_processor_exit(); ++ else ++ acpi_bus_unregister_driver(&acpi_processor_driver); + + cpuidle_unregister_driver(&acpi_idle_driver); + +diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c +index a6ad608..3c32e87 100644 +--- a/drivers/acpi/processor_idle.c ++++ b/drivers/acpi/processor_idle.c +@@ -58,6 +58,7 @@ + + #include <acpi/acpi_bus.h> + #include <acpi/processor.h> ++#include <xen/acpi.h> + #include <asm/processor.h> + + #define PREFIX "ACPI: " +@@ -439,7 +440,8 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) + cx.entry_method = ACPI_CSTATE_HALT; + snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); + } else { +- continue; ++ if (!xen_initial_domain()) ++ continue; + } + if (cx.type == ACPI_STATE_C1 && + (idle_halt || idle_nomwait)) { +@@ -477,6 +479,9 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) + + cx.power = obj->integer.value; + ++ /* cache control methods to notify xen*/ ++ processor_cntl_xen_power_cache(pr->acpi_id, i, reg); ++ + current_count++; + memcpy(&(pr->power.states[current_count]), &cx, sizeof(cx)); + +@@ -653,7 +658,7 @@ static int acpi_processor_power_verify(struct acpi_processor *pr) + return (working); + } + +-static int 
acpi_processor_get_power_info(struct acpi_processor *pr) ++int acpi_processor_get_power_info(struct acpi_processor *pr) + { + unsigned int i; + int result; +@@ -1223,9 +1228,14 @@ int __cpuinit acpi_processor_power_init(struct acpi_processor *pr, + * platforms that only support C1. + */ + if (pr->flags.power) { +- acpi_processor_setup_cpuidle(pr); +- if (cpuidle_register_device(&pr->power.dev)) +- return -EIO; ++ if (xen_initial_domain()) { ++ processor_cntl_xen_notify(pr, ++ PROCESSOR_PM_INIT, PM_TYPE_IDLE); ++ } else { ++ acpi_processor_setup_cpuidle(pr); ++ if (cpuidle_register_device(&pr->power.dev)) ++ return -EIO; ++ } + } + #ifdef CONFIG_ACPI_PROCFS + /* 'power' [R] */ +diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c +index 40d395e..7ba143d 100644 +--- a/drivers/acpi/processor_perflib.c ++++ b/drivers/acpi/processor_perflib.c +@@ -332,7 +332,7 @@ static int acpi_processor_get_performance_states(struct acpi_processor *pr) + return result; + } + +-static int acpi_processor_get_performance_info(struct acpi_processor *pr) ++int acpi_processor_get_performance_info(struct acpi_processor *pr) + { + int result = 0; + acpi_status status = AE_OK; +@@ -438,7 +438,7 @@ int acpi_processor_notify_smm(struct module *calling_module) + + EXPORT_SYMBOL(acpi_processor_notify_smm); + +-static int acpi_processor_get_psd(struct acpi_processor *pr) ++int acpi_processor_get_psd(struct acpi_processor *pr) + { + int result = 0; + acpi_status status = AE_OK; +diff --git a/drivers/acpi/processor_xen.c b/drivers/acpi/processor_xen.c +new file mode 100644 +index 0000000..305398d +--- /dev/null ++++ b/drivers/acpi/processor_xen.c +@@ -0,0 +1,651 @@ ++/* ++ * processor_xen.c - ACPI Processor Driver for xen ++ * ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or (at ++ * your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with this program; if not, write to the Free Software Foundation, Inc., ++ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
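
The processor_idle.c hunk above is the dispatch shape this patch repeats again in acpi_processor_init()/acpi_processor_exit(): in the initial domain, power-management decisions are handed to the hypervisor through processor_cntl_xen_notify(), while everywhere else the kernel's own cpuidle path is kept. In isolation:

    if (xen_initial_domain()) {
            processor_cntl_xen_notify(pr, PROCESSOR_PM_INIT,
                                      PM_TYPE_IDLE);
    } else {
            acpi_processor_setup_cpuidle(pr);
            if (cpuidle_register_device(&pr->power.dev))
                    return -EIO;
    }
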
++ * ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/types.h> ++#include <linux/pci.h> ++#include <linux/pm.h> ++#include <linux/cpufreq.h> ++#include <linux/cpu.h> ++#include <linux/proc_fs.h> ++#include <linux/seq_file.h> ++#include <linux/dmi.h> ++#include <linux/moduleparam.h> ++#include <linux/cpuidle.h> ++#include <linux/acpi.h> ++ ++#include <acpi/acpi_bus.h> ++#include <acpi/acpi_drivers.h> ++#include <acpi/processor.h> ++#include <xen/acpi.h> ++#include <xen/pcpu.h> ++ ++#define PREFIX "ACPI: " ++ ++#define ACPI_PROCESSOR_CLASS "processor" ++#define ACPI_PROCESSOR_DEVICE_NAME "Processor" ++#define ACPI_PROCESSOR_FILE_INFO "info" ++#define ACPI_PROCESSOR_FILE_THROTTLING "throttling" ++#define ACPI_PROCESSOR_FILE_LIMIT "limit" ++#define ACPI_PROCESSOR_NOTIFY_PERFORMANCE 0x80 ++#define ACPI_PROCESSOR_NOTIFY_POWER 0x81 ++#define ACPI_PROCESSOR_NOTIFY_THROTTLING 0x82 ++ ++#define _COMPONENT ACPI_PROCESSOR_COMPONENT ++ACPI_MODULE_NAME("processor_xen"); ++ ++static const struct acpi_device_id processor_device_ids[] = { ++ {ACPI_PROCESSOR_OBJECT_HID, 0}, ++ {"ACPI0007", 0}, ++ {"", 0}, ++}; ++ ++/* ++ * Xen ACPI processor driver ++ */ ++ ++/* from processor_core.c */ ++ ++static int xen_acpi_processor_add(struct acpi_device *device); ++static void xen_acpi_processor_notify(struct acpi_device *device, u32 event); ++ ++struct acpi_driver xen_acpi_processor_driver = { ++ .name = "processor", ++ .class = ACPI_PROCESSOR_CLASS, ++ .ids = processor_device_ids, ++ .ops = { ++ .add = xen_acpi_processor_add, ++ .remove = acpi_processor_remove, ++ .suspend = acpi_processor_suspend, ++ .resume = acpi_processor_resume, ++ .notify = xen_acpi_processor_notify, ++ }, ++}; ++ ++static int is_processor_present(acpi_handle handle) ++{ ++ acpi_status status; ++ unsigned long long sta = 0; ++ ++ ++ status = acpi_evaluate_integer(handle, "_STA", NULL, &sta); ++ ++ if (ACPI_SUCCESS(status) && (sta & ACPI_STA_DEVICE_PRESENT)) ++ return 1; ++ ++ /* ++ * _STA is mandatory for a processor that supports hot plug ++ */ ++ if (status == AE_NOT_FOUND) ++ ACPI_DEBUG_PRINT((ACPI_DB_INFO, ++ "Processor does not support hot plug\n")); ++ else ++ ACPI_EXCEPTION((AE_INFO, status, ++ "Processor Device is not present")); ++ return 0; ++} ++ ++static acpi_status ++xen_acpi_processor_hotadd_init(struct acpi_processor *pr, int *p_cpu) ++{ ++ if (!is_processor_present(pr->handle)) ++ return AE_ERROR; ++ ++ if (processor_cntl_xen_notify(pr, ++ PROCESSOR_HOTPLUG, HOTPLUG_TYPE_ADD)) ++ return AE_ERROR; ++ ++ return AE_OK; ++} ++ ++static int xen_acpi_processor_get_info(struct acpi_device *device) ++{ ++ acpi_status status = 0; ++ union acpi_object object = { 0 }; ++ struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; ++ struct acpi_processor *pr; ++ int cpu_index, device_declaration = 0; ++ static int cpu0_initialized; ++ ++ pr = acpi_driver_data(device); ++ if (!pr) ++ return -EINVAL; ++ ++ if (num_online_cpus() > 1) ++ errata.smp = TRUE; ++ ++ acpi_processor_errata(pr); ++ ++ /* ++ * Check to see if we have bus mastering arbitration control. This ++ * is required for proper C3 usage (to maintain cache coherency). 
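
is_processor_present() above inverts the usual ACPI default: the spec lets _STA be absent, normally meaning "present and functional", but since _STA is mandatory for hot-pluggable processors, this driver treats a missing _STA as "not present". The core test, stripped of logging (fragment, names as in the hunk):

    unsigned long long sta = 0;
    acpi_status status = acpi_evaluate_integer(handle, "_STA",
                                               NULL, &sta);

    if (ACPI_SUCCESS(status) && (sta & ACPI_STA_DEVICE_PRESENT))
            return 1;       /* present */
    return 0;               /* absent, or no _STA: cannot hot-plug */
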
++ */ ++ if (acpi_gbl_FADT.pm2_control_block && ++ acpi_gbl_FADT.pm2_control_length) { ++ pr->flags.bm_control = 1; ++ ACPI_DEBUG_PRINT((ACPI_DB_INFO, ++ "Bus mastering arbitration control present\n" ++ )); ++ } else ++ ACPI_DEBUG_PRINT((ACPI_DB_INFO, ++ "No bus mastering arbitration control\n")); ++ ++ if (!strcmp(acpi_device_hid(device), ACPI_PROCESSOR_OBJECT_HID)) { ++ /* Declared with "Processor" statement; match ProcessorID */ ++ status = acpi_evaluate_object(pr->handle, NULL, NULL, &buffer); ++ if (ACPI_FAILURE(status)) { ++ printk(KERN_ERR PREFIX "Evaluating processor object\n"); ++ return -ENODEV; ++ } ++ ++ /* ++ * TBD: Synch processor ID (via LAPIC/LSAPIC structures) on SMP. ++ * >>> 'acpi_get_processor_id(acpi_id, &id)' in ++ * arch/xxx/acpi.c ++ */ ++ pr->acpi_id = object.processor.proc_id; ++ } else { ++ /* ++ * Declared with "Device" statement; match _UID. ++ * Note that we don't handle string _UIDs yet. ++ */ ++ unsigned long long value; ++ status = acpi_evaluate_integer(pr->handle, METHOD_NAME__UID, ++ NULL, &value); ++ if (ACPI_FAILURE(status)) { ++ printk(KERN_ERR PREFIX ++ "Evaluating processor _UID [%#x]\n", status); ++ return -ENODEV; ++ } ++ device_declaration = 1; ++ pr->acpi_id = value; ++ } ++ ++ /* TBD: add Xen specific code to query cpu_index */ ++ cpu_index = -1; ++ ++ /* Handle UP system running SMP kernel, with no LAPIC in MADT */ ++ if (!cpu0_initialized && (cpu_index == -1) && ++ (num_online_cpus() == 1)) { ++ cpu_index = 0; ++ } ++ ++ cpu0_initialized = 1; ++ ++ pr->id = cpu_index; ++ ++ /* ++ * Extra Processor objects may be enumerated on MP systems with ++ * less than the max # of CPUs, or Xen vCPU < pCPU. ++ * They should be ignored _iff they are physically not present. ++ * ++ */ ++ if (xen_pcpu_index(pr->acpi_id, 1) == -1) { ++ if (ACPI_FAILURE ++ (xen_acpi_processor_hotadd_init(pr, &pr->id))) { ++ return -ENODEV; ++ } ++ } ++ ++ /* ++ * On some boxes several processors use the same processor bus id. ++ * But they are located in different scope. For example: ++ * \_SB.SCK0.CPU0 ++ * \_SB.SCK1.CPU0 ++ * Rename the processor device bus id. And the new bus id will be ++ * generated as the following format: ++ * CPU+CPU ID. ++ */ ++ sprintf(acpi_device_bid(device), "CPU%X", pr->id); ++ ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Processor [%d:%d]\n", pr->id, ++ pr->acpi_id)); ++ ++ if (!object.processor.pblk_address) ++ ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No PBLK (NULL address)\n")); ++ else if (object.processor.pblk_length != 6) ++ printk(KERN_ERR PREFIX "Invalid PBLK length [%d]\n", ++ object.processor.pblk_length); ++ else { ++ pr->throttling.address = object.processor.pblk_address; ++ pr->throttling.duty_offset = acpi_gbl_FADT.duty_offset; ++ pr->throttling.duty_width = acpi_gbl_FADT.duty_width; ++ ++ pr->pblk = object.processor.pblk_address; ++ ++ /* ++ * We don't care about error returns - we just try to mark ++ * these reserved so that nobody else is confused into thinking ++ * that this region might be unused.. 
++ * ++ * (In particular, allocating the IO range for Cardbus) ++ */ ++ request_region(pr->throttling.address, 6, "ACPI CPU throttle"); ++ } ++ ++ /* ++ * If ACPI describes a slot number for this CPU, we can use it ++ * ensure we get the right value in the "physical id" field ++ * of /proc/cpuinfo ++ */ ++ status = acpi_evaluate_object(pr->handle, "_SUN", NULL, &buffer); ++ if (ACPI_SUCCESS(status)) ++ arch_fix_phys_package_id(pr->id, object.integer.value); ++ ++ return 0; ++} ++ ++static struct acpi_device *processor_device_array[XEN_MAX_ACPI_ID + 1]; ++ ++static int __cpuinit xen_acpi_processor_add(struct acpi_device *device) ++{ ++ struct acpi_processor *pr = NULL; ++ int result = 0; ++ struct sys_device *sysdev; ++ ++ pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); ++ if (!pr) ++ return -ENOMEM; ++ ++ if (!zalloc_cpumask_var(&pr->throttling.shared_cpu_map, GFP_KERNEL)) { ++ kfree(pr); ++ return -ENOMEM; ++ } ++ ++ pr->handle = device->handle; ++ strcpy(acpi_device_name(device), ACPI_PROCESSOR_DEVICE_NAME); ++ strcpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS); ++ device->driver_data = pr; ++ ++ result = xen_acpi_processor_get_info(device); ++ if (result) { ++ /* Processor is physically not present */ ++ return 0; ++ } ++ ++ /* ++ * Buggy BIOS check ++ * ACPI id of processors can be reported wrongly by the BIOS. ++ * Don't trust it blindly ++ */ ++ if (pr->acpi_id > XEN_MAX_ACPI_ID || ++ (processor_device_array[pr->acpi_id] != NULL && ++ processor_device_array[pr->acpi_id] != device)) { ++ printk(KERN_WARNING "BIOS reported wrong ACPI id " ++ "for the processor\n"); ++ result = -ENODEV; ++ goto err_free_cpumask; ++ } ++ ++ processor_device_array[pr->acpi_id] = device; ++ ++ if (pr->id != -1) { ++ per_cpu(processors, pr->id) = pr; ++ ++ result = acpi_processor_add_fs(device); ++ if (result) ++ goto err_free_cpumask; ++ ++ sysdev = get_cpu_sysdev(pr->id); ++ if (sysdev != NULL && sysfs_create_link(&device->dev.kobj, ++ &sysdev->kobj, "sysdev")) { ++ result = -EFAULT; ++ goto err_remove_fs; ++ } ++ } ++ ++ /* _PDC call should be done before doing anything else (if reqd.). 
*/ ++ xen_arch_acpi_processor_init_pdc(pr); ++ acpi_processor_set_pdc(pr); ++ arch_acpi_processor_cleanup_pdc(pr); ++ ++#ifdef CONFIG_CPU_FREQ ++ xen_acpi_processor_ppc_has_changed(pr); ++ result = xen_acpi_processor_get_performance(pr); ++ if (result) ++ goto err_remove_fs; ++#endif ++ ++ if (pr->id != -1) { ++ acpi_processor_get_throttling_info(pr); ++ acpi_processor_get_limit_info(pr); ++ } ++ ++ xen_acpi_processor_power_init(pr, device); ++ ++ if (pr->id != -1) { ++ pr->cdev = thermal_cooling_device_register("Processor", device, ++ &processor_cooling_ops); ++ if (IS_ERR(pr->cdev)) { ++ result = PTR_ERR(pr->cdev); ++ goto err_power_exit; ++ } ++ ++ dev_info(&device->dev, "registered as cooling_device%d\n", ++ pr->cdev->id); ++ ++ result = sysfs_create_link(&device->dev.kobj, ++ &pr->cdev->device.kobj, ++ "thermal_cooling"); ++ if (result) { ++ printk(KERN_ERR PREFIX "Create sysfs link\n"); ++ goto err_thermal_unregister; ++ } ++ result = sysfs_create_link(&pr->cdev->device.kobj, ++ &device->dev.kobj, ++ "device"); ++ if (result) { ++ printk(KERN_ERR PREFIX "Create sysfs link\n"); ++ goto err_remove_sysfs; ++ } ++ } ++ ++ return 0; ++ ++err_remove_sysfs: ++ sysfs_remove_link(&device->dev.kobj, "thermal_cooling"); ++err_thermal_unregister: ++ thermal_cooling_device_unregister(pr->cdev); ++err_power_exit: ++ acpi_processor_power_exit(pr, device); ++err_remove_fs: ++ acpi_processor_remove_fs(device); ++err_free_cpumask: ++ free_cpumask_var(pr->throttling.shared_cpu_map); ++ ++ return result; ++} ++ ++static void xen_acpi_processor_notify(struct acpi_device *device, u32 event) ++{ ++ struct acpi_processor *pr = acpi_driver_data(device); ++ int saved; ++ ++ if (!pr) ++ return; ++ ++ switch (event) { ++ case ACPI_PROCESSOR_NOTIFY_PERFORMANCE: ++ saved = pr->performance_platform_limit; ++ xen_acpi_processor_ppc_has_changed(pr); ++ if (saved == pr->performance_platform_limit) ++ break; ++ acpi_bus_generate_proc_event(device, event, ++ pr->performance_platform_limit); ++ acpi_bus_generate_netlink_event(device->pnp.device_class, ++ dev_name(&device->dev), event, ++ pr->performance_platform_limit); ++ break; ++ case ACPI_PROCESSOR_NOTIFY_POWER: ++ xen_acpi_processor_cst_has_changed(pr); ++ acpi_bus_generate_proc_event(device, event, 0); ++ acpi_bus_generate_netlink_event(device->pnp.device_class, ++ dev_name(&device->dev), event, 0); ++ break; ++ case ACPI_PROCESSOR_NOTIFY_THROTTLING: ++ acpi_processor_tstate_has_changed(pr); ++ acpi_bus_generate_proc_event(device, event, 0); ++ acpi_bus_generate_netlink_event(device->pnp.device_class, ++ dev_name(&device->dev), event, 0); ++ default: ++ ACPI_DEBUG_PRINT((ACPI_DB_INFO, ++ "Unsupported event [0x%x]\n", event)); ++ break; ++ } ++ ++ return; ++} ++ ++/* from processor_idle.c */ ++ ++static int xen_acpi_processor_get_power_info(struct acpi_processor *pr) ++{ ++ int ret; ++ int invalid_pr_id = 0; ++ ++ /* ++ * acpi_processor_get_power_info need valid pr->id ++ * so set pr->id=0 temporarily ++ */ ++ if (pr->id == -1) { ++ invalid_pr_id = 1; ++ pr->id = 0; ++ } ++ ++ ret = acpi_processor_get_power_info(pr); ++ ++ if (invalid_pr_id) ++ pr->id = -1; ++ ++ return ret; ++} ++ ++int xen_acpi_processor_cst_has_changed(struct acpi_processor *pr) ++{ ++ if (!pr) ++ return -EINVAL; ++ ++ if (!pr->flags.power_setup_done) ++ return -ENODEV; ++ ++ xen_acpi_processor_get_power_info(pr); ++ ++ processor_cntl_xen_notify(pr, ++ PROCESSOR_PM_CHANGE, PM_TYPE_IDLE); ++ ++ return 0; ++} ++ ++ ++int __cpuinit xen_acpi_processor_power_init(struct acpi_processor *pr, ++ struct 
acpi_device *device) ++{ ++ acpi_status status = 0; ++ unsigned int i; ++ ++ if (!pr) ++ return -EINVAL; ++ ++ if (acpi_gbl_FADT.cst_control) { ++ status = acpi_os_write_port(acpi_gbl_FADT.smi_command, ++ acpi_gbl_FADT.cst_control, 8); ++ if (ACPI_FAILURE(status)) { ++ ACPI_EXCEPTION((AE_INFO, status, ++ "Notifying BIOS of _CST ability failed")); ++ } ++ } ++ ++ xen_acpi_processor_get_power_info(pr); ++ ++ pr->flags.power_setup_done = 1; ++ ++ if (pr->flags.power) { ++ processor_cntl_xen_notify(pr, ++ PROCESSOR_PM_INIT, PM_TYPE_IDLE); ++ ++ printk(KERN_INFO PREFIX "CPU%d (power states:", pr->id); ++ for (i = 1; i <= pr->power.count; i++) ++ if (pr->power.states[i].valid) ++ printk(" C%d[C%d]", i, ++ pr->power.states[i].type); ++ printk(")\n"); ++ } ++ ++ return 0; ++} ++ ++/* from processor_perflib.c */ ++ ++#ifdef CONFIG_CPU_FREQ ++static int xen_processor_notify_smm(void) ++{ ++ acpi_status status; ++ static int is_done; ++ ++ /* only need successfully notify BIOS once */ ++ /* avoid double notification which may lead to unexpected result */ ++ if (is_done) ++ return 0; ++ ++ /* Can't write pstate_cnt to smi_cmd if either value is zero */ ++ if ((!acpi_gbl_FADT.smi_command) || (!acpi_gbl_FADT.pstate_control)) { ++ ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No SMI port or pstate_cnt\n")); ++ return 0; ++ } ++ ++ ACPI_DEBUG_PRINT((ACPI_DB_INFO, ++ "Writing pstate_cnt [0x%x] to smi_cmd [0x%x]\n", ++ acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command)); ++ ++ status = acpi_os_write_port(acpi_gbl_FADT.smi_command, ++ (u32) acpi_gbl_FADT.pstate_control, 8); ++ if (ACPI_FAILURE(status)) ++ return status; ++ ++ is_done = 1; ++ ++ return 0; ++} ++ ++static int xen_acpi_processor_get_platform_limit(struct acpi_processor *pr) ++{ ++ acpi_status status = 0; ++ unsigned long long ppc = 0; ++ ++ if (!pr) ++ return -EINVAL; ++ ++ /* ++ * _PPC indicates the maximum state currently supported by the platform ++ * (e.g. 0 = states 0..n; 1 = states 1..n; etc. ++ */ ++ status = acpi_evaluate_integer(pr->handle, "_PPC", NULL, &ppc); ++ ++ if (ACPI_FAILURE(status) && status != AE_NOT_FOUND) { ++ ACPI_EXCEPTION((AE_INFO, status, "Evaluating _PPC")); ++ return -ENODEV; ++ } ++ ++ pr->performance_platform_limit = (int)ppc; ++ ++ return 0; ++} ++ ++int xen_acpi_processor_ppc_has_changed(struct acpi_processor *pr) ++{ ++ int ret; ++ ++ ret = xen_acpi_processor_get_platform_limit(pr); ++ ++ if (ret < 0) ++ return ret; ++ else ++ return processor_cntl_xen_notify(pr, ++ PROCESSOR_PM_CHANGE, PM_TYPE_PERF); ++} ++ ++/* ++ * Existing ACPI module does parse performance states at some point, ++ * when acpi-cpufreq driver is loaded which however is something ++ * we'd like to disable to avoid confliction with xen PM ++ * logic. So we have to collect raw performance information here ++ * when ACPI processor object is found and started. ++ */ ++int xen_acpi_processor_get_performance(struct acpi_processor *pr) ++{ ++ int ret; ++ struct acpi_processor_performance *perf; ++ struct acpi_psd_package *pdomain; ++ ++ if (pr->performance) ++ return -EBUSY; ++ ++ perf = kzalloc(sizeof(struct acpi_processor_performance), GFP_KERNEL); ++ if (!perf) ++ return -ENOMEM; ++ ++ pr->performance = perf; ++ /* Get basic performance state information */ ++ ret = acpi_processor_get_performance_info(pr); ++ if (ret < 0) ++ goto err_out; ++ ++ /* ++ * Well, here we need retrieve performance dependency information ++ * from _PSD object. 
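
xen_acpi_processor_get_platform_limit() above reads _PPC, which names the maximum performance state the platform currently permits (0 = states 0..n usable, 1 = states 1..n, and so on); AE_NOT_FOUND simply means the firmware imposes no limit. The evaluation idiom on its own (fragment, names as in the hunk):

    unsigned long long ppc = 0;
    acpi_status status = acpi_evaluate_integer(pr->handle, "_PPC",
                                               NULL, &ppc);

    if (ACPI_FAILURE(status) && status != AE_NOT_FOUND)
            return -ENODEV;                         /* real error */
    pr->performance_platform_limit = (int)ppc;      /* 0 if no _PPC */
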
The reason why existing interface is not used ++ * is due to the reason that existing interface sticks to Linux cpu ++ * id to construct some bitmap, however we want to split ACPI ++ * processor objects from Linux cpu id logic. For example, even ++ * when Linux is configured as UP, we still want to parse all ACPI ++ * processor objects to xen. In this case, it's preferred ++ * to use ACPI ID instead. ++ */ ++ pdomain = &pr->performance->domain_info; ++ pdomain->num_processors = 0; ++ ret = acpi_processor_get_psd(pr); ++ if (ret < 0) { ++ /* ++ * _PSD is optional - assume no coordination if absent (or ++ * broken), matching native kernels' behavior. ++ */ ++ pdomain->num_entries = ACPI_PSD_REV0_ENTRIES; ++ pdomain->revision = ACPI_PSD_REV0_REVISION; ++ pdomain->domain = pr->acpi_id; ++ pdomain->coord_type = DOMAIN_COORD_TYPE_SW_ALL; ++ pdomain->num_processors = 1; ++ } ++ ++ /* Some sanity check */ ++ if ((pdomain->revision != ACPI_PSD_REV0_REVISION) || ++ (pdomain->num_entries != ACPI_PSD_REV0_ENTRIES) || ++ ((pdomain->coord_type != DOMAIN_COORD_TYPE_SW_ALL) && ++ (pdomain->coord_type != DOMAIN_COORD_TYPE_SW_ANY) && ++ (pdomain->coord_type != DOMAIN_COORD_TYPE_HW_ALL))) { ++ ret = -EINVAL; ++ goto err_out; ++ } ++ ++ /* Last step is to notify BIOS that xen exists */ ++ xen_processor_notify_smm(); ++ ++ processor_cntl_xen_notify(pr, PROCESSOR_PM_INIT, PM_TYPE_PERF); ++ ++ return 0; ++err_out: ++ pr->performance = NULL; ++ kfree(perf); ++ return ret; ++} ++#endif /* CONFIG_CPU_FREQ */ ++ ++/* init and exit */ ++ ++int xen_acpi_processor_init(void) ++{ ++ return acpi_bus_register_driver(&xen_acpi_processor_driver); ++} ++ ++void xen_acpi_processor_exit(void) ++{ ++ acpi_bus_unregister_driver(&xen_acpi_processor_driver); ++} +diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c +index 0458094..85a1308 100644 +--- a/drivers/acpi/sleep.c ++++ b/drivers/acpi/sleep.c +@@ -19,6 +19,8 @@ + + #include <asm/io.h> + ++#include <xen/acpi.h> ++ + #include <acpi/acpi_bus.h> + #include <acpi/acpi_drivers.h> + +@@ -200,6 +202,21 @@ static int acpi_suspend_begin(suspend_state_t pm_state) + return error; + } + ++static void do_suspend(void) ++{ ++ if (!xen_pv_acpi()) { ++ do_suspend_lowlevel(); ++ return; ++ } ++ ++ /* ++ * Xen will save and restore CPU context, so ++ * we can skip that and just go straight to ++ * the suspend. ++ */ ++ acpi_enter_sleep_state(ACPI_STATE_S3); ++} ++ + /** + * acpi_suspend_enter - Actually enter a sleep state. + * @pm_state: ignored +@@ -233,7 +250,7 @@ static int acpi_suspend_enter(suspend_state_t pm_state) + break; + + case ACPI_STATE_S3: +- do_suspend_lowlevel(); ++ do_suspend(); + break; + } + +diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig +index 1d886e0..f4a2b10 100644 +--- a/drivers/block/Kconfig ++++ b/drivers/block/Kconfig +@@ -462,6 +462,7 @@ config XEN_BLKDEV_FRONTEND + tristate "Xen virtual block device support" + depends on XEN + default y ++ select XEN_XENBUS_FRONTEND + help + This driver implements the front-end of the Xen virtual + block device driver. 
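
do_suspend() in the sleep.c hunk above short-circuits the S3 path for PV guests: Xen saves and restores CPU context itself, so the kernel skips do_suspend_lowlevel()'s register save/restore trampoline and enters the sleep state directly. Condensed:

    if (xen_pv_acpi())
            acpi_enter_sleep_state(ACPI_STATE_S3);  /* Xen keeps context */
    else
            do_suspend_lowlevel();                  /* native save/restore */
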
It communicates with a back-end driver +diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c +index b8578bb..9679ffa 100644 +--- a/drivers/block/xen-blkfront.c ++++ b/drivers/block/xen-blkfront.c +@@ -42,10 +42,12 @@ + #include <linux/module.h> + #include <linux/scatterlist.h> + ++#include <xen/xen.h> + #include <xen/xenbus.h> + #include <xen/grant_table.h> + #include <xen/events.h> + #include <xen/page.h> ++#include <xen/platform_pci.h> + + #include <xen/interface/grant_table.h> + #include <xen/interface/io/blkif.h> +@@ -67,7 +69,7 @@ struct blk_shadow { + + static const struct block_device_operations xlvbd_block_fops; + +-#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE) ++#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) + + /* + * We have one of these per vbd, whether ide, scsi or 'other'. They +@@ -76,6 +78,7 @@ static const struct block_device_operations xlvbd_block_fops; + */ + struct blkfront_info + { ++ struct mutex mutex; + struct xenbus_device *xbdev; + struct gendisk *gd; + int vdevice; +@@ -85,6 +88,7 @@ struct blkfront_info + struct blkif_front_ring ring; + struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + unsigned int evtchn, irq; ++ struct tasklet_struct tasklet; + struct request_queue *rq; + struct work_struct work; + struct gnttab_free_callback callback; +@@ -93,14 +97,12 @@ struct blkfront_info + int feature_barrier; + int is_ready; + +- /** +- * The number of people holding this device open. We won't allow a +- * hot-unplug unless this is 0. +- */ +- int users; ++ spinlock_t io_lock; + }; + +-static DEFINE_SPINLOCK(blkif_io_lock); ++static unsigned int nr_minors; ++static unsigned long *minors; ++static DEFINE_SPINLOCK(minor_lock); + + #define MAXIMUM_OUTSTANDING_BLOCK_REQS \ + (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE) +@@ -116,6 +118,10 @@ static DEFINE_SPINLOCK(blkif_io_lock); + #define EXTENDED (1<<EXT_SHIFT) + #define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED)) + #define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED)) ++#define EMULATED_HD_DISK_MINOR_OFFSET (0) ++#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256) ++#define EMULATED_SD_DISK_MINOR_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET + (4 * 16)) ++#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_HD_DISK_NAME_OFFSET + 4) + + #define DEV_NAME "xvd" /* name in /dev */ + +@@ -136,6 +142,55 @@ static void add_id_to_freelist(struct blkfront_info *info, + info->shadow_free = id; + } + ++static int xlbd_reserve_minors(unsigned int minor, unsigned int nr) ++{ ++ unsigned int end = minor + nr; ++ int rc; ++ ++ if (end > nr_minors) { ++ unsigned long *bitmap, *old; ++ ++ bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap), ++ GFP_KERNEL); ++ if (bitmap == NULL) ++ return -ENOMEM; ++ ++ spin_lock(&minor_lock); ++ if (end > nr_minors) { ++ old = minors; ++ memcpy(bitmap, minors, ++ BITS_TO_LONGS(nr_minors) * sizeof(*bitmap)); ++ minors = bitmap; ++ nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG; ++ } else ++ old = bitmap; ++ spin_unlock(&minor_lock); ++ kfree(old); ++ } ++ ++ spin_lock(&minor_lock); ++ if (find_next_bit(minors, end, minor) >= end) { ++ for (; minor < end; ++minor) ++ __set_bit(minor, minors); ++ rc = 0; ++ } else ++ rc = -EBUSY; ++ spin_unlock(&minor_lock); ++ ++ return rc; ++} ++ ++static void xlbd_release_minors(unsigned int minor, unsigned int nr) ++{ ++ unsigned int end = minor + nr; ++ ++ BUG_ON(end > nr_minors); ++ spin_lock(&minor_lock); ++ for (; minor < end; ++minor) ++ __clear_bit(minor, minors); ++ spin_unlock(&minor_lock); ++} 
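
xlbd_reserve_minors()/xlbd_release_minors() above replace the old per-device "users" count with a global minor-number bitmap, grown on demand with the classic allocate-outside, swap-inside-the-lock dance. A user-space rendition, a sketch only: one byte per minor instead of a real bitmap, and the spinlock reduced to comments:

    #include <stdlib.h>
    #include <string.h>

    static unsigned char *taken;
    static unsigned int taken_len;

    /* Claim [minor, minor + nr); 0 on success, -1 if any slot is busy. */
    static int reserve_minors(unsigned int minor, unsigned int nr)
    {
            unsigned int end = minor + nr, i;

            if (end > taken_len) {
                    unsigned char *bigger = calloc(end, 1);
                    if (!bigger)
                            return -1;
                    /* lock; the real code re-checks end > taken_len here
                     * in case another grower won the race */
                    if (taken_len)
                            memcpy(bigger, taken, taken_len);
                    free(taken);
                    taken = bigger;
                    taken_len = end;
                    /* unlock */
            }
            /* lock */
            for (i = minor; i < end; i++)
                    if (taken[i])
                            return -1;      /* unlock on the real path */
            for (i = minor; i < end; i++)
                    taken[i] = 1;
            /* unlock */
            return 0;
    }
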
++ + static void blkif_restart_queue_callback(void *arg) + { + struct blkfront_info *info = (struct blkfront_info *)arg; +@@ -333,11 +388,12 @@ wait: + flush_requests(info); + } + +-static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) ++static int xlvbd_init_blk_queue(struct blkfront_info *info, ++ struct gendisk *gd, u16 sector_size) + { + struct request_queue *rq; + +- rq = blk_init_queue(do_blkif_request, &blkif_io_lock); ++ rq = blk_init_queue(do_blkif_request, &info->io_lock); + if (rq == NULL) + return -1; + +@@ -370,20 +426,84 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) + static int xlvbd_barrier(struct blkfront_info *info) + { + int err; ++ const char *barrier; ++ ++ switch (info->feature_barrier) { ++ case QUEUE_ORDERED_DRAIN: barrier = "enabled (drain)"; break; ++ case QUEUE_ORDERED_TAG: barrier = "enabled (tag)"; break; ++ case QUEUE_ORDERED_NONE: barrier = "disabled"; break; ++ default: return -EINVAL; ++ } + +- err = blk_queue_ordered(info->rq, +- info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE, +- NULL); ++ err = blk_queue_ordered(info->rq, info->feature_barrier, NULL); + + if (err) + return err; + + printk(KERN_INFO "blkfront: %s: barriers %s\n", +- info->gd->disk_name, +- info->feature_barrier ? "enabled" : "disabled"); ++ info->gd->disk_name, barrier); + return 0; + } + ++static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) ++{ ++ int major; ++ major = BLKIF_MAJOR(vdevice); ++ *minor = BLKIF_MINOR(vdevice); ++ switch (major) { ++ case XEN_IDE0_MAJOR: ++ *offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET; ++ *minor = ((*minor / 64) * PARTS_PER_DISK) + ++ EMULATED_HD_DISK_MINOR_OFFSET; ++ break; ++ case XEN_IDE1_MAJOR: ++ *offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET; ++ *minor = (((*minor / 64) + 2) * PARTS_PER_DISK) + ++ EMULATED_HD_DISK_MINOR_OFFSET; ++ break; ++ case XEN_SCSI_DISK0_MAJOR: ++ *offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET; ++ *minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET; ++ break; ++ case XEN_SCSI_DISK1_MAJOR: ++ case XEN_SCSI_DISK2_MAJOR: ++ case XEN_SCSI_DISK3_MAJOR: ++ case XEN_SCSI_DISK4_MAJOR: ++ case XEN_SCSI_DISK5_MAJOR: ++ case XEN_SCSI_DISK6_MAJOR: ++ case XEN_SCSI_DISK7_MAJOR: ++ *offset = (*minor / PARTS_PER_DISK) + ++ ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) + ++ EMULATED_SD_DISK_NAME_OFFSET; ++ *minor = *minor + ++ ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) + ++ EMULATED_SD_DISK_MINOR_OFFSET; ++ break; ++ case XEN_SCSI_DISK8_MAJOR: ++ case XEN_SCSI_DISK9_MAJOR: ++ case XEN_SCSI_DISK10_MAJOR: ++ case XEN_SCSI_DISK11_MAJOR: ++ case XEN_SCSI_DISK12_MAJOR: ++ case XEN_SCSI_DISK13_MAJOR: ++ case XEN_SCSI_DISK14_MAJOR: ++ case XEN_SCSI_DISK15_MAJOR: ++ *offset = (*minor / PARTS_PER_DISK) + ++ ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) + ++ EMULATED_SD_DISK_NAME_OFFSET; ++ *minor = *minor + ++ ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) + ++ EMULATED_SD_DISK_MINOR_OFFSET; ++ break; ++ case XENVBD_MAJOR: ++ *offset = *minor / PARTS_PER_DISK; ++ break; ++ default: ++ printk(KERN_WARNING "blkfront: your disk configuration is " ++ "incorrect, please use an xvd device instead\n"); ++ return -ENODEV; ++ } ++ return 0; ++} + + static int xlvbd_alloc_gendisk(blkif_sector_t capacity, + struct blkfront_info *info, +@@ -391,7 +511,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, + { + struct gendisk *gd; + int nr_minors = 1; +- int err = -ENODEV; ++ int err; + unsigned int offset; + int minor; + 
int nr_parts; +@@ -406,21 +526,33 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, + } + + if (!VDEV_IS_EXTENDED(info->vdevice)) { +- minor = BLKIF_MINOR(info->vdevice); +- nr_parts = PARTS_PER_DISK; ++ err = xen_translate_vdev(info->vdevice, &minor, &offset); ++ if (err) ++ return err; ++ nr_parts = PARTS_PER_DISK; + } else { + minor = BLKIF_MINOR_EXT(info->vdevice); + nr_parts = PARTS_PER_EXT_DISK; ++ offset = minor / nr_parts; ++ if (xen_hvm_domain() && minor >= EMULATED_HD_DISK_MINOR_OFFSET) { ++ printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with " ++ "emulated IDE and SCSI disks; ignoring", info->vdevice); ++ return -ENODEV; ++ } + } ++ err = -ENODEV; + + if ((minor % nr_parts) == 0) + nr_minors = nr_parts; + +- gd = alloc_disk(nr_minors); +- if (gd == NULL) ++ err = xlbd_reserve_minors(minor, nr_minors); ++ if (err) + goto out; ++ err = -ENODEV; + +- offset = minor / nr_parts; ++ gd = alloc_disk(nr_minors); ++ if (gd == NULL) ++ goto release; + + if (nr_minors > 1) { + if (offset < 26) +@@ -447,16 +579,15 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, + gd->driverfs_dev = &(info->xbdev->dev); + set_capacity(gd, capacity); + +- if (xlvbd_init_blk_queue(gd, sector_size)) { ++ if (xlvbd_init_blk_queue(info, gd, sector_size)) { + del_gendisk(gd); +- goto out; ++ goto release; + } + + info->rq = gd->queue; + info->gd = gd; + +- if (info->feature_barrier) +- xlvbd_barrier(info); ++ xlvbd_barrier(info); + + if (vdisk_info & VDISK_READONLY) + set_disk_ro(gd, 1); +@@ -469,10 +600,45 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, + + return 0; + ++ release: ++ xlbd_release_minors(minor, nr_minors); + out: + return err; + } + ++static void xlvbd_release_gendisk(struct blkfront_info *info) ++{ ++ unsigned int minor, nr_minors; ++ unsigned long flags; ++ ++ if (info->rq == NULL) ++ return; ++ ++ spin_lock_irqsave(&info->io_lock, flags); ++ ++ /* No more blkif_request(). */ ++ blk_stop_queue(info->rq); ++ ++ /* No more gnttab callback work. */ ++ gnttab_cancel_free_callback(&info->callback); ++ spin_unlock_irqrestore(&info->io_lock, flags); ++ ++ /* Flush gnttab callback work. Must be done with no locks held. */ ++ flush_scheduled_work(); ++ ++ del_gendisk(info->gd); ++ ++ minor = info->gd->first_minor; ++ nr_minors = info->gd->minors; ++ xlbd_release_minors(minor, nr_minors); ++ ++ blk_cleanup_queue(info->rq); ++ info->rq = NULL; ++ ++ put_disk(info->gd); ++ info->gd = NULL; ++} ++ + static void kick_pending_request_queues(struct blkfront_info *info) + { + if (!RING_FULL(&info->ring)) { +@@ -487,16 +653,16 @@ static void blkif_restart_queue(struct work_struct *work) + { + struct blkfront_info *info = container_of(work, struct blkfront_info, work); + +- spin_lock_irq(&blkif_io_lock); ++ spin_lock_irq(&info->io_lock); + if (info->connected == BLKIF_STATE_CONNECTED) + kick_pending_request_queues(info); +- spin_unlock_irq(&blkif_io_lock); ++ spin_unlock_irq(&info->io_lock); + } + + static void blkif_free(struct blkfront_info *info, int suspend) + { + /* Prevent new requests being issued until we fix things up. */ +- spin_lock_irq(&blkif_io_lock); ++ spin_lock_irq(&info->io_lock); + info->connected = suspend ? + BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; + /* No more blkif_request(). */ +@@ -504,7 +670,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) + blk_stop_queue(info->rq); + /* No more gnttab callback work. 
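
xen_translate_vdev() further up maps emulated IDE/SCSI device numbers into the xvd minor space so a disk keeps a stable identity whether it shows up emulated or paravirtual. A worked example for hdb (IDE0 major, minor 64; PARTS_PER_DISK is 16 in this driver, and the HD offsets are 0), runnable stand-alone:

    #include <stdio.h>

    int main(void)
    {
            int minor = 64;                 /* hdb: second disk on IDE0 */
            int disk = minor / 64;          /* legacy hd: 64 minors/disk */
            int xvd_minor = disk * 16;      /* one PARTS_PER_DISK window */
            int name_off = disk;            /* disk-name offset -> "xvdb" */

            printf("disk=%d xvd_minor=%d name_off=%d\n",
                   disk, xvd_minor, name_off);
            return 0;
    }
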
*/ + gnttab_cancel_free_callback(&info->callback); +- spin_unlock_irq(&blkif_io_lock); ++ spin_unlock_irq(&info->io_lock); + + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_scheduled_work(); +@@ -529,21 +695,20 @@ static void blkif_completion(struct blk_shadow *s) + gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL); + } + +-static irqreturn_t blkif_interrupt(int irq, void *dev_id) ++static void ++blkif_do_interrupt(unsigned long data) + { ++ struct blkfront_info *info = (struct blkfront_info *)data; + struct request *req; + struct blkif_response *bret; + RING_IDX i, rp; + unsigned long flags; +- struct blkfront_info *info = (struct blkfront_info *)dev_id; + int error; + +- spin_lock_irqsave(&blkif_io_lock, flags); ++ spin_lock_irqsave(&info->io_lock, flags); + +- if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { +- spin_unlock_irqrestore(&blkif_io_lock, flags); +- return IRQ_HANDLED; +- } ++ if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) ++ goto out; + + again: + rp = info->ring.sring->rsp_prod; +@@ -567,7 +732,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) + printk(KERN_WARNING "blkfront: %s: write barrier op failed\n", + info->gd->disk_name); + error = -EOPNOTSUPP; +- info->feature_barrier = 0; ++ info->feature_barrier = QUEUE_ORDERED_NONE; + xlvbd_barrier(info); + } + /* fall through */ +@@ -596,7 +761,17 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) + + kick_pending_request_queues(info); + +- spin_unlock_irqrestore(&blkif_io_lock, flags); ++out: ++ spin_unlock_irqrestore(&info->io_lock, flags); ++} ++ ++ ++static irqreturn_t ++blkif_interrupt(int irq, void *dev_id) ++{ ++ struct blkfront_info *info = (struct blkfront_info *)dev_id; ++ ++ tasklet_schedule(&info->tasklet); + + return IRQ_HANDLED; + } +@@ -650,7 +825,7 @@ fail: + + + /* Common code used when first setting up, and when resuming. */ +-static int talk_to_backend(struct xenbus_device *dev, ++static int talk_to_blkback(struct xenbus_device *dev, + struct blkfront_info *info) + { + const char *message = NULL; +@@ -710,7 +885,6 @@ again: + return err; + } + +- + /** + * Entry point to this code when a new device is created. 
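
blkif_interrupt()/blkif_do_interrupt() above split response handling in two: the hard irq handler only schedules, and the ring is drained later in softirq context, which keeps the time spent in hard interrupt context short. The same shape with hypothetical names:

    #include <linux/interrupt.h>

    static void demo_drain_ring(unsigned long data)
    {
            /* walk the response ring under the per-device lock,
             * as blkif_do_interrupt() does */
    }

    static DECLARE_TASKLET(demo_tasklet, demo_drain_ring, 0);

    static irqreturn_t demo_irq(int irq, void *dev_id)
    {
            tasklet_schedule(&demo_tasklet);
            return IRQ_HANDLED;
    }
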
Allocate the basic + * structures and the ring buffer for communication with the backend, and +@@ -736,16 +910,48 @@ static int blkfront_probe(struct xenbus_device *dev, + } + } + ++ if (xen_hvm_domain()) { ++ char *type; ++ int len; ++ /* no unplug has been done: do not hook devices != xen vbds */ ++ if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) { ++ int major; ++ ++ if (!VDEV_IS_EXTENDED(vdevice)) ++ major = BLKIF_MAJOR(vdevice); ++ else ++ major = XENVBD_MAJOR; ++ ++ if (major != XENVBD_MAJOR) { ++ printk(KERN_INFO ++ "%s: HVM does not support vbd %d as xen block device\n", ++ __FUNCTION__, vdevice); ++ return -ENODEV; ++ } ++ } ++ /* do not create a PV cdrom device if we are an HVM guest */ ++ type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len); ++ if (IS_ERR(type)) ++ return -ENODEV; ++ if (strncmp(type, "cdrom", 5) == 0) { ++ kfree(type); ++ return -ENODEV; ++ } ++ kfree(type); ++ } + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure"); + return -ENOMEM; + } + ++ mutex_init(&info->mutex); + info->xbdev = dev; + info->vdevice = vdevice; + info->connected = BLKIF_STATE_DISCONNECTED; + INIT_WORK(&info->work, blkif_restart_queue); ++ spin_lock_init(&info->io_lock); ++ tasklet_init(&info->tasklet, blkif_do_interrupt, (unsigned long)info); + + for (i = 0; i < BLK_RING_SIZE; i++) + info->shadow[i].req.id = i+1; +@@ -755,7 +961,7 @@ static int blkfront_probe(struct xenbus_device *dev, + info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); + dev_set_drvdata(&dev->dev, info); + +- err = talk_to_backend(dev, info); ++ err = talk_to_blkback(dev, info); + if (err) { + kfree(info); + dev_set_drvdata(&dev->dev, NULL); +@@ -819,7 +1025,7 @@ static int blkif_recover(struct blkfront_info *info) + + xenbus_switch_state(info->xbdev, XenbusStateConnected); + +- spin_lock_irq(&blkif_io_lock); ++ spin_lock_irq(&info->io_lock); + + /* Now safe for us to use the shared ring */ + info->connected = BLKIF_STATE_CONNECTED; +@@ -830,7 +1036,7 @@ static int blkif_recover(struct blkfront_info *info) + /* Kick any other new requests queued since we resumed */ + kick_pending_request_queues(info); + +- spin_unlock_irq(&blkif_io_lock); ++ spin_unlock_irq(&info->io_lock); + + return 0; + } +@@ -850,13 +1056,50 @@ static int blkfront_resume(struct xenbus_device *dev) + + blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); + +- err = talk_to_backend(dev, info); ++ err = talk_to_blkback(dev, info); + if (info->connected == BLKIF_STATE_SUSPENDED && !err) + err = blkif_recover(info); + + return err; + } + ++static void ++blkfront_closing(struct blkfront_info *info) ++{ ++ struct xenbus_device *xbdev = info->xbdev; ++ struct block_device *bdev = NULL; ++ ++ mutex_lock(&info->mutex); ++ ++ if (xbdev->state == XenbusStateClosing) { ++ mutex_unlock(&info->mutex); ++ return; ++ } ++ ++ if (info->gd) ++ bdev = bdget_disk(info->gd, 0); ++ ++ mutex_unlock(&info->mutex); ++ ++ if (!bdev) { ++ xenbus_frontend_closed(xbdev); ++ return; ++ } ++ ++ mutex_lock(&bdev->bd_mutex); ++ ++ if (bdev->bd_openers) { ++ xenbus_dev_error(xbdev, -EBUSY, ++ "Device in use; refusing to close"); ++ xenbus_switch_state(xbdev, XenbusStateClosing); ++ } else { ++ xlvbd_release_gendisk(info); ++ xenbus_frontend_closed(xbdev); ++ } ++ ++ mutex_unlock(&bdev->bd_mutex); ++ bdput(bdev); ++} + + /* + * Invoked when the backend is finally 'ready' (and has told produced +@@ -868,11 +1111,31 @@ static void blkfront_connect(struct blkfront_info *info) + 
unsigned long sector_size; + unsigned int binfo; + int err; +- +- if ((info->connected == BLKIF_STATE_CONNECTED) || +- (info->connected == BLKIF_STATE_SUSPENDED) ) ++ int barrier; ++ ++ switch (info->connected) { ++ case BLKIF_STATE_CONNECTED: ++ /* ++ * Potentially, the back-end may be signalling ++ * a capacity change; update the capacity. ++ */ ++ err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, ++ "sectors", "%Lu", &sectors); ++ if (XENBUS_EXIST_ERR(err)) ++ return; ++ printk(KERN_INFO "Setting capacity to %Lu\n", ++ sectors); ++ set_capacity(info->gd, sectors); ++ revalidate_disk(info->gd); ++ ++ /* fall through */ ++ case BLKIF_STATE_SUSPENDED: + return; + ++ default: ++ break; ++ } ++ + dev_dbg(&info->xbdev->dev, "%s:%s.\n", + __func__, info->xbdev->otherend); + +@@ -889,10 +1152,26 @@ static void blkfront_connect(struct blkfront_info *info) + } + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, +- "feature-barrier", "%lu", &info->feature_barrier, ++ "feature-barrier", "%lu", &barrier, + NULL); ++ ++ /* ++ * If there's no "feature-barrier" defined, then it means ++ * we're dealing with a very old backend which writes ++ * synchronously; draining will do what needs to get done. ++ * ++ * If there are barriers, then we can do full queued writes ++ * with tagged barriers. ++ * ++ * If barriers are not supported, then there's no much we can ++ * do, so just set ordering to NONE. ++ */ + if (err) +- info->feature_barrier = 0; ++ info->feature_barrier = QUEUE_ORDERED_DRAIN; ++ else if (barrier) ++ info->feature_barrier = QUEUE_ORDERED_TAG; ++ else ++ info->feature_barrier = QUEUE_ORDERED_NONE; + + err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); + if (err) { +@@ -904,10 +1183,10 @@ static void blkfront_connect(struct blkfront_info *info) + xenbus_switch_state(info->xbdev, XenbusStateConnected); + + /* Kick pending requests. */ +- spin_lock_irq(&blkif_io_lock); ++ spin_lock_irq(&info->io_lock); + info->connected = BLKIF_STATE_CONNECTED; + kick_pending_request_queues(info); +- spin_unlock_irq(&blkif_io_lock); ++ spin_unlock_irq(&info->io_lock); + + add_disk(info->gd); + +@@ -915,57 +1194,21 @@ static void blkfront_connect(struct blkfront_info *info) + } + + /** +- * Handle the change of state of the backend to Closing. We must delete our +- * device-layer structures now, to ensure that writes are flushed through to +- * the backend. Once is this done, we can switch to Closed in +- * acknowledgement. +- */ +-static void blkfront_closing(struct xenbus_device *dev) +-{ +- struct blkfront_info *info = dev_get_drvdata(&dev->dev); +- unsigned long flags; +- +- dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename); +- +- if (info->rq == NULL) +- goto out; +- +- spin_lock_irqsave(&blkif_io_lock, flags); +- +- /* No more blkif_request(). */ +- blk_stop_queue(info->rq); +- +- /* No more gnttab callback work. */ +- gnttab_cancel_free_callback(&info->callback); +- spin_unlock_irqrestore(&blkif_io_lock, flags); +- +- /* Flush gnttab callback work. Must be done with no locks held. */ +- flush_scheduled_work(); +- +- blk_cleanup_queue(info->rq); +- info->rq = NULL; +- +- del_gendisk(info->gd); +- +- out: +- xenbus_frontend_closed(dev); +-} +- +-/** + * Callback received when the backend's state changes. 
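
The feature-barrier probing in blkfront_connect() above boils down to a three-way mapping; restated as one expression (equivalent in effect, not a substitute for the hunk):

    info->feature_barrier =
            err     ? QUEUE_ORDERED_DRAIN : /* key absent: very old backend,
                                               writes are synchronous */
            barrier ? QUEUE_ORDERED_TAG   : /* barriers work: tagged writes */
                      QUEUE_ORDERED_NONE;   /* explicitly unsupported */
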
+ */ +-static void backend_changed(struct xenbus_device *dev, ++static void blkback_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) + { + struct blkfront_info *info = dev_get_drvdata(&dev->dev); +- struct block_device *bd; + +- dev_dbg(&dev->dev, "blkfront:backend_changed.\n"); ++ dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state); + + switch (backend_state) { + case XenbusStateInitialising: + case XenbusStateInitWait: + case XenbusStateInitialised: ++ case XenbusStateReconfiguring: ++ case XenbusStateReconfigured: + case XenbusStateUnknown: + case XenbusStateClosed: + break; +@@ -975,35 +1218,56 @@ static void backend_changed(struct xenbus_device *dev, + break; + + case XenbusStateClosing: +- if (info->gd == NULL) { +- xenbus_frontend_closed(dev); +- break; +- } +- bd = bdget_disk(info->gd, 0); +- if (bd == NULL) +- xenbus_dev_fatal(dev, -ENODEV, "bdget failed"); +- +- mutex_lock(&bd->bd_mutex); +- if (info->users > 0) +- xenbus_dev_error(dev, -EBUSY, +- "Device in use; refusing to close"); +- else +- blkfront_closing(dev); +- mutex_unlock(&bd->bd_mutex); +- bdput(bd); ++ blkfront_closing(info); + break; + } + } + +-static int blkfront_remove(struct xenbus_device *dev) ++static int blkfront_remove(struct xenbus_device *xbdev) + { +- struct blkfront_info *info = dev_get_drvdata(&dev->dev); ++ struct blkfront_info *info = dev_get_drvdata(&xbdev->dev); ++ struct block_device *bdev = NULL; ++ struct gendisk *disk; + +- dev_dbg(&dev->dev, "blkfront_remove: %s removed\n", dev->nodename); ++ dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename); + + blkif_free(info, 0); + +- kfree(info); ++ mutex_lock(&info->mutex); ++ ++ disk = info->gd; ++ if (disk) ++ bdev = bdget_disk(disk, 0); ++ ++ info->xbdev = NULL; ++ mutex_unlock(&info->mutex); ++ ++ if (!bdev) { ++ kfree(info); ++ return 0; ++ } ++ ++ /* ++ * The xbdev was removed before we reached the Closed ++ * state. See if it's safe to remove the disk. If the bdev ++ * isn't closed yet, we let release take care of it. ++ */ ++ ++ mutex_lock(&bdev->bd_mutex); ++ info = disk->private_data; ++ ++ dev_warn(disk_to_dev(disk), ++ "%s was hot-unplugged, %d stale handles\n", ++ xbdev->nodename, bdev->bd_openers); ++ ++ if (info && !bdev->bd_openers) { ++ xlvbd_release_gendisk(info); ++ disk->private_data = NULL; ++ kfree(info); ++ } ++ ++ mutex_unlock(&bdev->bd_mutex); ++ bdput(bdev); + + return 0; + } +@@ -1012,30 +1276,68 @@ static int blkfront_is_ready(struct xenbus_device *dev) + { + struct blkfront_info *info = dev_get_drvdata(&dev->dev); + +- return info->is_ready; ++ return info->is_ready && info->xbdev; + } + + static int blkif_open(struct block_device *bdev, fmode_t mode) + { +- struct blkfront_info *info = bdev->bd_disk->private_data; +- info->users++; +- return 0; ++ struct gendisk *disk = bdev->bd_disk; ++ struct blkfront_info *info; ++ int err = 0; ++ ++ info = disk->private_data; ++ if (!info) ++ /* xbdev gone */ ++ return -ERESTARTSYS; ++ ++ mutex_lock(&info->mutex); ++ ++ if (!info->gd) ++ /* xbdev is closed */ ++ err = -ERESTARTSYS; ++ ++ mutex_unlock(&info->mutex); ++ ++ return err; + } + + static int blkif_release(struct gendisk *disk, fmode_t mode) + { + struct blkfront_info *info = disk->private_data; +- info->users--; +- if (info->users == 0) { +- /* Check whether we have been instructed to close. We will +- have ignored this request initially, as the device was +- still mounted. 
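
blkfront_remove() above may run while the block device is still open; the ownership rule threaded through it and blkif_release() is that whichever side last observes bd_openers at zero frees the gendisk and the info structure, with disk->private_data set to NULL marking the hand-off. The test both sides perform (fragment, names as in the hunks):

    if (info && !bdev->bd_openers) {
            xlvbd_release_gendisk(info);
            disk->private_data = NULL;      /* nothing left to release */
            kfree(info);
    }
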
*/ +- struct xenbus_device *dev = info->xbdev; +- enum xenbus_state state = xenbus_read_driver_state(dev->otherend); +- +- if (state == XenbusStateClosing && info->is_ready) +- blkfront_closing(dev); ++ struct block_device *bdev; ++ struct xenbus_device *xbdev; ++ ++ bdev = bdget_disk(disk, 0); ++ bdput(bdev); ++ ++ if (bdev->bd_openers) ++ return 0; ++ ++ /* ++ * Check if we have been instructed to close. We will have ++ * deferred this request, because the bdev was still open. ++ */ ++ ++ mutex_lock(&info->mutex); ++ xbdev = info->xbdev; ++ ++ if (xbdev && xbdev->state == XenbusStateClosing) { ++ /* pending switch to state closed */ ++ dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n"); ++ xlvbd_release_gendisk(info); ++ xenbus_frontend_closed(info->xbdev); + } ++ ++ mutex_unlock(&info->mutex); ++ ++ if (!xbdev) { ++ /* sudden device removal */ ++ dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n"); ++ xlvbd_release_gendisk(info); ++ disk->private_data = NULL; ++ kfree(info); ++ } ++ + return 0; + } + +@@ -1061,7 +1363,7 @@ static struct xenbus_driver blkfront = { + .probe = blkfront_probe, + .remove = blkfront_remove, + .resume = blkfront_resume, +- .otherend_changed = backend_changed, ++ .otherend_changed = blkback_changed, + .is_ready = blkfront_is_ready, + }; + +diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c +index c496c8a..4064d95 100644 +--- a/drivers/char/agp/amd64-agp.c ++++ b/drivers/char/agp/amd64-agp.c +@@ -18,6 +18,8 @@ + #include <asm/k8.h> + #include <asm/gart.h> + #include "agp.h" ++#include <xen/page.h> ++#include <asm/xen/page.h> + + /* NVIDIA K8 registers */ + #define NVIDIA_X86_64_0_APBASE 0x10 +@@ -78,8 +80,21 @@ static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type) + } + + for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { ++ phys_addr_t phys = page_to_phys(mem->pages[i]); ++ if (xen_pv_domain()) { ++ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn( ++ page_to_pfn(mem->pages[i]))); ++ if (phys != xen_phys) { ++ printk(KERN_ERR "Fixing up GART: (0x%lx->0x%lx)." \ ++ " CODE UNTESTED!\n", ++ (unsigned long)phys, ++ (unsigned long)xen_phys); ++ WARN_ON_ONCE(phys != xen_phys); ++ phys = xen_phys; ++ } ++ } + tmp = agp_bridge->driver->mask_memory(agp_bridge, +- page_to_phys(mem->pages[i]), ++ phys, + mask_type); + + BUG_ON(tmp & 0xffffff0000000ffcULL); +@@ -181,6 +196,20 @@ static int amd_8151_configure(void) + unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real); + int i; + ++ if (xen_pv_domain()) { ++ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn( ++ virt_to_pfn(agp_bridge->gatt_table_real))); ++ /* Future thoughts: Perhaps use the gatt_table_bus that ++ * agp_generic_create_gatt_table has setup instead of ++ * doing the virt_to_phys once more? */ ++ if (gatt_bus != xen_phys) { ++ printk(KERN_ERR "Fixing up GATT: (0x%lx->0x%lx)." \ ++ " CODE UNTESTED!\n", gatt_bus, ++ (unsigned long)xen_phys); ++ WARN_ON_ONCE(gatt_bus != xen_phys); ++ gatt_bus = xen_phys; ++ } ++ } + /* Configure AGP regs in each x86-64 host bridge. */ + for (i = 0; i < num_k8_northbridges; i++) { + agp_bridge->gart_bus_addr = +diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c +index a56ca08..30fc4b6 100644 +--- a/drivers/char/agp/backend.c ++++ b/drivers/char/agp/backend.c +@@ -38,6 +38,8 @@ + #include <linux/vmalloc.h> + #include <asm/io.h> + #include "agp.h" ++#include <xen/page.h> ++#include <asm/xen/page.h> + + /* Due to XFree86 brain-damage, we can't go to 1.0 until they + * fix some real stupidity. 
It's only by chance we can bump +@@ -160,8 +162,13 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge) + } + } else { + bridge->scratch_page_dma = page_to_phys(page); ++ if (xen_pv_domain()) { ++ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn( ++ page_to_pfn(page))); ++ if (bridge->scratch_page_dma != xen_phys) ++ bridge->scratch_page_dma = xen_phys; ++ } + } +- + bridge->scratch_page = bridge->driver->mask_memory(bridge, + bridge->scratch_page_dma, 0); + } +diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c +index c505439..2434c91 100644 +--- a/drivers/char/agp/generic.c ++++ b/drivers/char/agp/generic.c +@@ -42,6 +42,8 @@ + #include <asm/cacheflush.h> + #include <asm/pgtable.h> + #include "agp.h" ++#include <xen/page.h> ++#include <asm/xen/page.h> + + __u32 *agp_gatt_table; + int agp_memory_reserved; +@@ -1002,6 +1004,14 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge) + return -ENOMEM; + } + bridge->gatt_bus_addr = virt_to_phys(bridge->gatt_table_real); ++ /* KRW: virt_to_phys under Xen is not safe. */ ++ if (xen_pv_domain()) { ++ /* Use back-door to get the "real" PFN. */ ++ phys_addr_t pfn = virt_to_pfn(bridge->gatt_table_real); ++ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(pfn)); ++ if (bridge->gatt_bus_addr != xen_phys) ++ bridge->gatt_bus_addr = xen_phys; ++ } + + /* AK: bogus, should encode addresses > 4GB */ + for (i = 0; i < num_entries; i++) { +@@ -1141,8 +1151,17 @@ int agp_generic_insert_memory(struct agp_memory * mem, off_t pg_start, int type) + } + + for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { ++ phys_addr_t phys = page_to_phys(mem->pages[i]); ++ ++ /* HACK: Via a back-door we get the bus address. */ ++ if (xen_pv_domain()) { ++ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn( ++ page_to_pfn(mem->pages[i]))); ++ if (phys != xen_phys) ++ phys = xen_phys; ++ } + writel(bridge->driver->mask_memory(bridge, +- page_to_phys(mem->pages[i]), ++ phys, + mask_type), + bridge->gatt_table+j); + } +@@ -1235,7 +1254,16 @@ int agp_generic_alloc_pages(struct agp_bridge_data *bridge, struct agp_memory *m + int i, ret = -ENOMEM; + + for (i = 0; i < num_pages; i++) { +- page = alloc_page(GFP_KERNEL | GFP_DMA32 | __GFP_ZERO); ++ if (xen_pv_domain()) { ++ void *addr; ++ dma_addr_t _d; ++ ++ addr = dma_alloc_coherent(NULL, PAGE_SIZE, &_d, GFP_KERNEL); ++ if (!addr) ++ goto out; ++ page = virt_to_page(addr); ++ } else ++ page = alloc_page(GFP_KERNEL | GFP_DMA32 | __GFP_ZERO); + /* agp_free_memory() needs gart address */ + if (page == NULL) + goto out; +@@ -1263,7 +1291,17 @@ struct page *agp_generic_alloc_page(struct agp_bridge_data *bridge) + { + struct page * page; + +- page = alloc_page(GFP_KERNEL | GFP_DMA32 | __GFP_ZERO); ++ if (xen_pv_domain()) { ++ void *addr; ++ dma_addr_t _d; ++ ++ addr = dma_alloc_coherent(NULL, PAGE_SIZE, &_d, GFP_KERNEL); ++ if (!addr) ++ return NULL; ++ page = virt_to_page(addr); ++ } else ++ page = alloc_page(GFP_KERNEL | GFP_DMA32 | __GFP_ZERO); ++ + if (page == NULL) + return NULL; + +@@ -1294,7 +1332,12 @@ void agp_generic_destroy_pages(struct agp_memory *mem) + unmap_page_from_agp(page); + #endif + put_page(page); +- __free_page(page); ++ if (xen_pv_domain()) { ++ void *addr = page_address(page); ++ dma_free_coherent(NULL, PAGE_SIZE, addr, ++ virt_to_bus(addr)); ++ } else ++ __free_page(page); + atomic_dec(&agp_bridge->current_memory_agp); + mem->pages[i] = NULL; + } +@@ -1311,7 +1354,12 @@ void agp_generic_destroy_page(struct page *page, int flags) + + if (flags & AGP_PAGE_DESTROY_FREE) { + 
put_page(page); +- __free_page(page); ++ if (xen_pv_domain()) { ++ void *addr = page_address(page); ++ dma_free_coherent(NULL, PAGE_SIZE, addr, ++ virt_to_bus(addr)); ++ } else ++ __free_page(page); + atomic_dec(&agp_bridge->current_memory_agp); + } + } +diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c +index b8e0219..7a62c3c 100644 +--- a/drivers/char/agp/intel-agp.c ++++ b/drivers/char/agp/intel-agp.c +@@ -10,14 +10,20 @@ + #include <linux/agp_backend.h> + #include <asm/smp.h> + #include "agp.h" ++#include <xen/page.h> ++#include <asm/xen/page.h> + + /* + * If we have Intel graphics, we're not going to have anything other than + * an Intel IOMMU. So make the correct use of the PCI DMA API contingent + * on the Intel IOMMU support (CONFIG_DMAR). + * Only newer chipsets need to bother with this, of course. ++ * ++ * Xen guests accessing graphics hardware also need proper translation ++ * between pseudo-physical addresses and real machine addresses, which ++ * is also achieved by using the DMA API. + */ +-#ifdef CONFIG_DMAR ++#if defined(CONFIG_DMAR) || defined(CONFIG_XEN) + #define USE_PCI_DMA_API 1 + #endif + +@@ -296,8 +302,20 @@ static void intel_agp_insert_sg_entries(struct agp_memory *mem, + int i, j; + + for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { ++ phys_addr_t phys = page_to_phys(mem->pages[i]); ++ if (xen_pv_domain()) { ++ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn( ++ page_to_pfn(mem->pages[i]))); ++ if (xen_phys != phys) { ++ printk(KERN_ERR "Compile kernel with " \ ++ "CONFIG_DMAR to get rid of this " \ ++ "warning!\n"); ++ WARN_ON_ONCE(xen_phys != phys); ++ /* Fixup: */ ++ phys = xen_phys; ++ } + writel(agp_bridge->driver->mask_memory(agp_bridge, +- page_to_phys(mem->pages[i]), mask_type), ++ phys, mask_type), + intel_private.gtt+j); + } + +@@ -395,15 +413,19 @@ static void intel_i810_agp_enable(struct agp_bridge_data *bridge, u32 mode) + /* Exists to support ARGB cursors */ + static struct page *i8xx_alloc_pages(void) + { ++ void *addr; ++ dma_addr_t _d; + struct page *page; + +- page = alloc_pages(GFP_KERNEL | GFP_DMA32, 2); +- if (page == NULL) ++ addr = dma_alloc_coherent(NULL, 4 * PAGE_SIZE, &_d, GFP_KERNEL); ++ if (addr == NULL) + return NULL; + ++ page = virt_to_page(addr); ++ + if (set_pages_uc(page, 4) < 0) { + set_pages_wb(page, 4); +- __free_pages(page, 2); ++ dma_free_coherent(NULL, 4 * PAGE_SIZE, addr, _d); + return NULL; + } + get_page(page); +@@ -413,12 +435,17 @@ static struct page *i8xx_alloc_pages(void) + + static void i8xx_destroy_pages(struct page *page) + { ++ void *addr; ++ + if (page == NULL) + return; + + set_pages_wb(page, 4); + put_page(page); +- __free_pages(page, 2); ++ ++ addr = page_address(page); ++ ++ dma_free_coherent(NULL, 4 * PAGE_SIZE, addr, virt_to_bus(addr)); + atomic_dec(&agp_bridge->current_memory_agp); + } + +@@ -478,8 +505,16 @@ static int intel_i810_insert_entries(struct agp_memory *mem, off_t pg_start, + if (!mem->is_flushed) + global_cache_flush(); + for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { ++ phys_addr_t phys = page_to_phys(mem->pages[i]); ++ if (xen_pv_domain()) { ++ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn( ++ page_to_pfn(mem->pages[i]))); ++ /* Fixup: */ ++ if (xen_phys != phys) ++ phys = xen_phys; ++ } + writel(agp_bridge->driver->mask_memory(agp_bridge, +- page_to_phys(mem->pages[i]), mask_type), ++ phys, mask_type), + intel_private.registers+I810_PTE_BASE+(j*4)); + } + readl(intel_private.registers+I810_PTE_BASE+((j-1)*4)); +@@ -552,6 +587,12 @@ static struct 
agp_memory *alloc_agpphysmem_i8xx(size_t pg_count, int type) + new->num_scratch_pages = pg_count; + new->type = AGP_PHYS_MEMORY; + new->physical = page_to_phys(new->pages[0]); ++ if (xen_pv_domain()) { ++ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn( ++ page_to_pfn(new->pages[0]))); ++ if (xen_phys != new->physical) ++ new->physical = xen_phys; ++ } + return new; + } + +@@ -992,8 +1033,16 @@ static int intel_i830_insert_entries(struct agp_memory *mem, off_t pg_start, + global_cache_flush(); + + for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { ++ phys_addr_t phys = page_to_phys(mem->pages[i]); ++ if (xen_pv_domain()) { ++ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn( ++ page_to_pfn(mem->pages[i]))); ++ /* Fixup: */ ++ if (xen_phys != phys) ++ phys = xen_phys; ++ } + writel(agp_bridge->driver->mask_memory(agp_bridge, +- page_to_phys(mem->pages[i]), mask_type), ++ phys, mask_type), + intel_private.registers+I810_PTE_BASE+(j*4)); + } + readl(intel_private.registers+I810_PTE_BASE+((j-1)*4)); +diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c +index a6ee32b..a7c6529 100644 +--- a/drivers/char/hvc_xen.c ++++ b/drivers/char/hvc_xen.c +@@ -25,6 +25,8 @@ + #include <linux/types.h> + + #include <asm/xen/hypervisor.h> ++ ++#include <xen/xen.h> + #include <xen/page.h> + #include <xen/events.h> + #include <xen/interface/io/console.h> +@@ -72,11 +74,12 @@ static int __write_console(const char *data, int len) + wmb(); /* write ring before updating pointer */ + intf->out_prod = prod; + +- notify_daemon(); ++ if (sent) ++ notify_daemon(); + return sent; + } + +-static int write_console(uint32_t vtermno, const char *data, int len) ++static int domU_write_console(uint32_t vtermno, const char *data, int len) + { + int ret = len; + +@@ -99,7 +102,7 @@ static int write_console(uint32_t vtermno, const char *data, int len) + return ret; + } + +-static int read_console(uint32_t vtermno, char *buf, int len) ++static int domU_read_console(uint32_t vtermno, char *buf, int len) + { + struct xencons_interface *intf = xencons_interface(); + XENCONS_RING_IDX cons, prod; +@@ -120,28 +123,63 @@ static int read_console(uint32_t vtermno, char *buf, int len) + return recv; + } + +-static struct hv_ops hvc_ops = { +- .get_chars = read_console, +- .put_chars = write_console, ++static struct hv_ops domU_hvc_ops = { ++ .get_chars = domU_read_console, ++ .put_chars = domU_write_console, + .notifier_add = notifier_add_irq, + .notifier_del = notifier_del_irq, + .notifier_hangup = notifier_hangup_irq, + }; + +-static int __init xen_init(void) ++static int dom0_read_console(uint32_t vtermno, char *buf, int len) ++{ ++ return HYPERVISOR_console_io(CONSOLEIO_read, len, buf); ++} ++ ++/* ++ * Either for a dom0 to write to the system console, or a domU with a ++ * debug version of Xen ++ */ ++static int dom0_write_console(uint32_t vtermno, const char *str, int len) ++{ ++ int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str); ++ if (rc < 0) ++ return 0; ++ ++ return len; ++} ++ ++static struct hv_ops dom0_hvc_ops = { ++ .get_chars = dom0_read_console, ++ .put_chars = dom0_write_console, ++ .notifier_add = notifier_add_irq, ++ .notifier_del = notifier_del_irq, ++ .notifier_hangup = notifier_hangup_irq, ++}; ++ ++static int __init xen_hvc_init(void) + { + struct hvc_struct *hp; ++ struct hv_ops *ops; + +- if (!xen_pv_domain() || +- xen_initial_domain() || +- !xen_start_info->console.domU.evtchn) ++ if (!xen_pv_domain()) + return -ENODEV; + +- xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn); 
++ if (xen_initial_domain()) { ++ ops = &dom0_hvc_ops; ++ xencons_irq = bind_virq_to_irq(VIRQ_CONSOLE, 0); ++ } else { ++ if (!xen_start_info->console.domU.evtchn) ++ return -ENODEV; ++ ++ ops = &domU_hvc_ops; ++ xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn); ++ } ++ + if (xencons_irq < 0) + xencons_irq = 0; /* NO_IRQ */ + +- hp = hvc_alloc(HVC_COOKIE, xencons_irq, &hvc_ops, 256); ++ hp = hvc_alloc(HVC_COOKIE, xencons_irq, ops, 256); + if (IS_ERR(hp)) + return PTR_ERR(hp); + +@@ -158,7 +196,7 @@ void xen_console_resume(void) + rebind_evtchn_irq(xen_start_info->console.domU.evtchn, xencons_irq); + } + +-static void __exit xen_fini(void) ++static void __exit xen_hvc_fini(void) + { + if (hvc) + hvc_remove(hvc); +@@ -166,29 +204,24 @@ static void __exit xen_fini(void) + + static int xen_cons_init(void) + { ++ struct hv_ops *ops; ++ + if (!xen_pv_domain()) + return 0; + +- hvc_instantiate(HVC_COOKIE, 0, &hvc_ops); ++ ops = &domU_hvc_ops; ++ if (xen_initial_domain()) ++ ops = &dom0_hvc_ops; ++ ++ hvc_instantiate(HVC_COOKIE, 0, ops); ++ + return 0; + } + +-module_init(xen_init); +-module_exit(xen_fini); ++module_init(xen_hvc_init); ++module_exit(xen_hvc_fini); + console_initcall(xen_cons_init); + +-static void raw_console_write(const char *str, int len) +-{ +- while(len > 0) { +- int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str); +- if (rc <= 0) +- break; +- +- str += rc; +- len -= rc; +- } +-} +- + #ifdef CONFIG_EARLY_PRINTK + static void xenboot_write_console(struct console *console, const char *string, + unsigned len) +@@ -196,19 +229,22 @@ static void xenboot_write_console(struct console *console, const char *string, + unsigned int linelen, off = 0; + const char *pos; + +- raw_console_write(string, len); ++ dom0_write_console(0, string, len); ++ ++ if (xen_initial_domain()) ++ return; + +- write_console(0, "(early) ", 8); ++ domU_write_console(0, "(early) ", 8); + while (off < len && NULL != (pos = strchr(string+off, '\n'))) { + linelen = pos-string+off; + if (off + linelen > len) + break; +- write_console(0, string+off, linelen); +- write_console(0, "\r\n", 2); ++ domU_write_console(0, string+off, linelen); ++ domU_write_console(0, "\r\n", 2); + off += linelen + 1; + } + if (off < len) +- write_console(0, string+off, len-off); ++ domU_write_console(0, string+off, len-off); + } + + struct console xenboot_console = { +@@ -220,7 +256,7 @@ struct console xenboot_console = { + + void xen_raw_console_write(const char *str) + { +- raw_console_write(str, strlen(str)); ++ dom0_write_console(0, str, strlen(str)); + } + + void xen_raw_printk(const char *fmt, ...) 
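The hvc_xen changes above split the single read_console/write_console pair into two hv_ops tables chosen once at init time: dom0 talks straight to the hypervisor's emergency console through the HYPERVISOR_console_io hypercall, while a domU keeps the shared-ring/event-channel path. The stand-alone C sketch below models only that dispatch decision; the stub put_chars functions and the main() driver are invented for illustration and are not part of the patch:

#include <stdio.h>

/* Toy model of the dom0/domU console split made in xen_hvc_init().
 * The real driver installs dom0_hvc_ops or domU_hvc_ops here. */
struct hv_ops {
	int (*put_chars)(unsigned int vtermno, const char *buf, int len);
};

static int dom0_put_chars(unsigned int vtermno, const char *buf, int len)
{
	/* stands in for HYPERVISOR_console_io(CONSOLEIO_write, len, buf) */
	printf("[console_io hypercall] %.*s", len, buf);
	return len;
}

static int domU_put_chars(unsigned int vtermno, const char *buf, int len)
{
	/* stands in for the shared-ring write plus event-channel notify */
	printf("[xencons ring] %.*s", len, buf);
	return len;
}

static const struct hv_ops dom0_hvc_ops = { dom0_put_chars };
static const struct hv_ops domU_hvc_ops = { domU_put_chars };

int main(void)
{
	int initial_domain = 1; /* pretend xen_initial_domain() returned true */
	const struct hv_ops *ops = initial_domain ? &dom0_hvc_ops : &domU_hvc_ops;

	ops->put_chars(0, "hello from hvc\n", 15);
	return 0;
}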
+diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c +index cbaf420..163459d 100644 +--- a/drivers/firewire/net.c ++++ b/drivers/firewire/net.c +@@ -8,7 +8,6 @@ + + #include <linux/bug.h> + #include <linux/device.h> +-#include <linux/ethtool.h> + #include <linux/firewire.h> + #include <linux/firewire-constants.h> + #include <linux/highmem.h> +@@ -1333,17 +1332,6 @@ static int fwnet_change_mtu(struct net_device *net, int new_mtu) + return 0; + } + +-static void fwnet_get_drvinfo(struct net_device *net, +- struct ethtool_drvinfo *info) +-{ +- strcpy(info->driver, KBUILD_MODNAME); +- strcpy(info->bus_info, "ieee1394"); +-} +- +-static const struct ethtool_ops fwnet_ethtool_ops = { +- .get_drvinfo = fwnet_get_drvinfo, +-}; +- + static const struct net_device_ops fwnet_netdev_ops = { + .ndo_open = fwnet_open, + .ndo_stop = fwnet_stop, +@@ -1362,7 +1350,6 @@ static void fwnet_init_dev(struct net_device *net) + net->hard_header_len = FWNET_HLEN; + net->type = ARPHRD_IEEE1394; + net->tx_queue_len = 10; +- SET_ETHTOOL_OPS(net, &fwnet_ethtool_ops); + } + + /* caller must hold fwnet_device_mutex */ +diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c +index 0e27d98..f5e2572 100644 +--- a/drivers/gpu/drm/drm_drv.c ++++ b/drivers/gpu/drm/drm_drv.c +@@ -201,7 +201,7 @@ int drm_lastclose(struct drm_device * dev) + } + if (drm_core_check_feature(dev, DRIVER_SG) && dev->sg && + !drm_core_check_feature(dev, DRIVER_MODESET)) { +- drm_sg_cleanup(dev->sg); ++ drm_sg_cleanup(dev, dev->sg); + dev->sg = NULL; + } + +diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c +index 8bf3770..dde5f66 100644 +--- a/drivers/gpu/drm/drm_gem.c ++++ b/drivers/gpu/drm/drm_gem.c +@@ -539,7 +539,7 @@ int drm_gem_mmap(struct file *filp, struct vm_area_struct *vma) + vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND; + vma->vm_ops = obj->dev->driver->gem_vm_ops; + vma->vm_private_data = map->handle; +- vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); ++ vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); + + /* Take a ref for this mapping of the object, so that the fault + * handler can dereference the mmap offset's pointer to the object. 
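One idiom recurs in every AGP hunk earlier in this patch: in a Xen PV domain, page_to_phys() yields a guest pseudo-physical address, but GART/GTT entries must hold the machine address, so the patch recomputes PFN_PHYS(pfn_to_mfn(page_to_pfn(page))) and warns when the two differ. Below is a self-contained mock of that translation; the four-entry p2m[] table is invented purely to demonstrate the mismatch, the real lookup being the kernel's pfn_to_mfn():

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

/* Invented guest-pfn -> machine-frame table; in a PV guest this
 * mapping is maintained by Xen and consulted via pfn_to_mfn(). */
static const uint64_t p2m[4] = { 0x100, 0x042, 0x901, 0x007 };

static uint64_t pfn_to_mfn(uint64_t pfn)
{
	return p2m[pfn];
}

int main(void)
{
	uint64_t pfn = 2;
	uint64_t pseudo = pfn << PAGE_SHIFT;              /* page_to_phys() */
	uint64_t machine = pfn_to_mfn(pfn) << PAGE_SHIFT; /* what the GART needs */

	if (pseudo != machine)
		printf("fixing up GART entry: 0x%llx -> 0x%llx\n",
		       (unsigned long long)pseudo,
		       (unsigned long long)machine);
	return 0;
}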
+diff --git a/drivers/gpu/drm/drm_scatter.c b/drivers/gpu/drm/drm_scatter.c +index c7823c8..95ffb8a 100644 +--- a/drivers/gpu/drm/drm_scatter.c ++++ b/drivers/gpu/drm/drm_scatter.c +@@ -32,20 +32,73 @@ + */ + + #include <linux/vmalloc.h> ++#include <linux/mm.h> + #include "drmP.h" + + #define DEBUG_SCATTER 0 + +-static inline void *drm_vmalloc_dma(unsigned long size) ++static void *drm_vmalloc_dma(struct drm_device *drmdev, unsigned long size) + { + #if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE) + return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL | _PAGE_NO_CACHE); + #else +- return vmalloc_32(size); ++ struct device *dev = &drmdev->pdev->dev; ++ struct page **pages; ++ void *addr; ++ const int npages = PFN_UP(size); ++ int i; ++ ++ pages = kmalloc(npages * sizeof(*pages), GFP_KERNEL); ++ if (!pages) ++ goto fail; ++ ++ for (i = 0; i < npages; i++) { ++ dma_addr_t phys; ++ void *addr; ++ addr = dma_alloc_coherent(dev, PAGE_SIZE, &phys, GFP_KERNEL); ++ if (addr == NULL) ++ goto out_free_pages; ++ ++ pages[i] = virt_to_page(addr); ++ } ++ ++ addr = vmap(pages, npages, VM_MAP | VM_IOREMAP, PAGE_KERNEL); ++ ++ kfree(pages); ++ ++ return addr; ++ ++out_free_pages: ++ while (i > 0) { ++ void *addr = page_address(pages[--i]); ++ dma_free_coherent(dev, PAGE_SIZE, addr, virt_to_bus(addr)); ++ } ++ ++ kfree(pages); ++ ++fail: ++ return NULL; ++#endif ++} ++ ++static void drm_vfree_dma(struct drm_device *drmdev, void *addr, int npages, ++ struct page **pages) ++{ ++#if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE) ++ vfree(addr); ++#else ++ struct device *dev = &drmdev->pdev->dev; ++ int i; ++ ++ for (i = 0; i < npages; i++) { ++ void *addr = page_address(pages[i]); ++ dma_free_coherent(dev, PAGE_SIZE, addr, virt_to_bus(addr)); ++ } ++ vunmap(addr); + #endif + } + +-void drm_sg_cleanup(struct drm_sg_mem * entry) ++void drm_sg_cleanup(struct drm_device *drmdev, struct drm_sg_mem * entry) + { + struct page *page; + int i; +@@ -56,7 +109,7 @@ void drm_sg_cleanup(struct drm_sg_mem * entry) + ClearPageReserved(page); + } + +- vfree(entry->virtual); ++ drm_vfree_dma(drmdev, entry->virtual, entry->pages, entry->pagelist); + + kfree(entry->busaddr); + kfree(entry->pagelist); +@@ -107,7 +160,7 @@ int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request) + } + memset((void *)entry->busaddr, 0, pages * sizeof(*entry->busaddr)); + +- entry->virtual = drm_vmalloc_dma(pages << PAGE_SHIFT); ++ entry->virtual = drm_vmalloc_dma(dev, pages << PAGE_SHIFT); + if (!entry->virtual) { + kfree(entry->busaddr); + kfree(entry->pagelist); +@@ -180,7 +233,7 @@ int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request) + return 0; + + failed: +- drm_sg_cleanup(entry); ++ drm_sg_cleanup(dev, entry); + return -ENOMEM; + } + EXPORT_SYMBOL(drm_sg_alloc); +@@ -212,7 +265,7 @@ int drm_sg_free(struct drm_device *dev, void *data, + + DRM_DEBUG("virtual = %p\n", entry->virtual); + +- drm_sg_cleanup(entry); ++ drm_sg_cleanup(dev, entry); + + return 0; + } +diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c +index 1c040d0..e3555bf 100644 +--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c ++++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c +@@ -87,6 +87,9 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) + bool is_iomem; + unsigned long address = (unsigned long)vmf->virtual_address; + int retval = VM_FAULT_NOPAGE; ++ bool vm_io = (vma->vm_flags & VM_IO) && VM_IO; ++ bool pte_iomap = (pgprot_val(vma->vm_page_prot) & _PAGE_IOMAP) ++ && 
_PAGE_IOMAP; + + /* + * Work around locking order reversal in fault / nopfn +@@ -158,11 +161,30 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) + if (is_iomem) { + vma->vm_page_prot = ttm_io_prot(bo->mem.placement, + vma->vm_page_prot); ++ if (!vm_io || !pte_iomap) { ++ vma->vm_flags |= VM_IO; ++ pgprot_val(vma->vm_page_prot) |= _PAGE_IOMAP; ++ } + } else { + ttm = bo->ttm; + vma->vm_page_prot = (bo->mem.placement & TTM_PL_FLAG_CACHED) ? + vm_get_page_prot(vma->vm_flags) : + ttm_io_prot(bo->mem.placement, vma->vm_page_prot); ++ /* ++ * During PCI suspend the graphic cards purge their VRAM and ++ * move their graphic objects to the TT. They also unmap all ++ * of the objects, meaning that when a user application is ++ * unfrozen it will re-fault and call here. ++ * ++ * What this means is that the VMA for the graphic object might ++ * have been set for VRAM TTM but now it is with the TT ++ * (normal RAM) meaning that the vma->vm_flags could be ++ * inappropriate (say, VM_IO on TT - no good). ++ */ ++ if (vm_io || pte_iomap) { ++ vma->vm_flags &= ~VM_IO; ++ pgprot_val(vma->vm_page_prot) &= ~_PAGE_IOMAP; ++ } + } + + /* +@@ -239,6 +261,7 @@ int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma, + { + struct ttm_bo_driver *driver; + struct ttm_buffer_object *bo; ++ struct ttm_mem_type_manager *man; + int ret; + + read_lock(&bdev->vm_lock); +@@ -271,7 +294,11 @@ int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma, + */ + + vma->vm_private_data = bo; +- vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND; ++ vma->vm_flags |= VM_RESERVED | VM_MIXEDMAP | VM_DONTEXPAND; ++ man = &bdev->man[bo->mem.mem_type]; ++ if (man->flags & TTM_MEMTYPE_FLAG_NEEDS_IOREMAP) ++ vma->vm_flags |= VM_IO; ++ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); + return 0; + out_unref: + ttm_bo_unref(&bo); +diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c +index 3d5b8b0..8b05e38 100644 +--- a/drivers/gpu/drm/ttm/ttm_tt.c ++++ b/drivers/gpu/drm/ttm/ttm_tt.c +@@ -38,7 +38,8 @@ + #include "ttm/ttm_module.h" + #include "ttm/ttm_bo_driver.h" + #include "ttm/ttm_placement.h" +- ++#include <linux/dma-mapping.h> ++#include <xen/xen.h> + static int ttm_tt_swapin(struct ttm_tt *ttm); + + /** +@@ -84,6 +85,16 @@ static struct page *ttm_tt_alloc_page(unsigned page_flags) + else + gfp_flags |= __GFP_HIGHMEM; + ++ if ((page_flags & TTM_PAGE_FLAG_DMA32) && xen_pv_domain()) ++ { ++ void *addr; ++ dma_addr_t _d; ++ ++ addr = dma_alloc_coherent(NULL, PAGE_SIZE, &_d, GFP_KERNEL); ++ if (addr == NULL) ++ return NULL; ++ return virt_to_page(addr); ++ } + return alloc_page(gfp_flags); + } + +@@ -286,6 +297,7 @@ static void ttm_tt_free_alloced_pages(struct ttm_tt *ttm) + int i; + struct page *cur_page; + struct ttm_backend *be = ttm->be; ++ void *addr; + + if (be) + be->func->clear(be); +@@ -300,7 +312,16 @@ static void ttm_tt_free_alloced_pages(struct ttm_tt *ttm) + "Leaking pages.\n"); + ttm_mem_global_free_page(ttm->glob->mem_glob, + cur_page); +- __free_page(cur_page); ++ ++ if ((ttm->page_flags & TTM_PAGE_FLAG_DMA32) && ++ xen_pv_domain()) { ++ addr = page_address(cur_page); ++ WARN_ON(!addr); ++ if (addr) ++ dma_free_coherent(NULL, PAGE_SIZE, addr, ++ virt_to_bus(addr)); ++ } else ++ __free_page(cur_page); + } + } + ttm->state = tt_unpopulated; +diff --git a/drivers/ieee1394/eth1394.c b/drivers/ieee1394/eth1394.c +index a4e9dcb..62ab09e 100644 +--- a/drivers/ieee1394/eth1394.c ++++ b/drivers/ieee1394/eth1394.c +@@ -58,7 +58,6 @@ + #include
<linux/tcp.h> + #include <linux/skbuff.h> + #include <linux/bitops.h> +- #include <linux/ethtool.h> + #include <asm/uaccess.h> + #include <asm/delay.h> + #include <asm/unaligned.h> +@@ -173,8 +172,6 @@ static netdev_tx_t ether1394_tx(struct sk_buff *skb, + struct net_device *dev); + static void ether1394_iso(struct hpsb_iso *iso); + +-static const struct ethtool_ops ethtool_ops; +- + static int ether1394_write(struct hpsb_host *host, int srcid, int destid, + quadlet_t *data, u64 addr, size_t len, u16 flags); + static void ether1394_add_host(struct hpsb_host *host); +@@ -525,8 +522,6 @@ static void ether1394_init_dev(struct net_device *dev) + dev->header_ops = &ether1394_header_ops; + dev->netdev_ops = &ether1394_netdev_ops; + +- SET_ETHTOOL_OPS(dev, &ethtool_ops); +- + dev->watchdog_timeo = ETHER1394_TIMEOUT; + dev->flags = IFF_BROADCAST | IFF_MULTICAST; + dev->features = NETIF_F_HIGHDMA; +@@ -1698,17 +1693,6 @@ fail: + return NETDEV_TX_OK; + } + +-static void ether1394_get_drvinfo(struct net_device *dev, +- struct ethtool_drvinfo *info) +-{ +- strcpy(info->driver, driver_name); +- strcpy(info->bus_info, "ieee1394"); /* FIXME provide more detail? */ +-} +- +-static const struct ethtool_ops ethtool_ops = { +- .get_drvinfo = ether1394_get_drvinfo +-}; +- + static int __init ether1394_init_module(void) + { + int err; +diff --git a/drivers/input/xen-kbdfront.c b/drivers/input/xen-kbdfront.c +index b115726..80a072e 100644 +--- a/drivers/input/xen-kbdfront.c ++++ b/drivers/input/xen-kbdfront.c +@@ -21,7 +21,10 @@ + #include <linux/errno.h> + #include <linux/module.h> + #include <linux/input.h> ++ + #include <asm/xen/hypervisor.h> ++ ++#include <xen/xen.h> + #include <xen/events.h> + #include <xen/page.h> + #include <xen/interface/io/fbif.h> +@@ -272,6 +275,8 @@ static void xenkbd_backend_changed(struct xenbus_device *dev, + switch (backend_state) { + case XenbusStateInitialising: + case XenbusStateInitialised: ++ case XenbusStateReconfiguring: ++ case XenbusStateReconfigured: + case XenbusStateUnknown: + case XenbusStateClosed: + break; +@@ -335,7 +340,7 @@ static struct xenbus_driver xenkbd_driver = { + + static int __init xenkbd_init(void) + { +- if (!xen_domain()) ++ if (!xen_domain() || xen_hvm_domain()) + return -ENODEV; + + /* Nothing to do if running in dom0.
*/ +diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig +index b2f71f7..b7feb84 100644 +--- a/drivers/net/Kconfig ++++ b/drivers/net/Kconfig +@@ -2787,6 +2787,7 @@ source "drivers/s390/net/Kconfig" + config XEN_NETDEV_FRONTEND + tristate "Xen network device frontend driver" + depends on XEN ++ select XEN_XENBUS_FRONTEND + default y + help + The network device frontend driver allows the kernel to +diff --git a/drivers/net/bmac.c b/drivers/net/bmac.c +index 406f064..c063b53 100644 +--- a/drivers/net/bmac.c ++++ b/drivers/net/bmac.c +@@ -1236,15 +1236,8 @@ static void bmac_reset_and_enable(struct net_device *dev) + } + spin_unlock_irqrestore(&bp->lock, flags); + } +-static void bmac_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) +-{ +- struct bmac_data *bp = netdev_priv(dev); +- strcpy(info->driver, "bmac"); +- strcpy(info->bus_info, dev_name(&bp->mdev->ofdev.dev)); +-} + + static const struct ethtool_ops bmac_ethtool_ops = { +- .get_drvinfo = bmac_get_drvinfo, + .get_link = ethtool_op_get_link, + }; + +diff --git a/drivers/net/fec_mpc52xx.c b/drivers/net/fec_mpc52xx.c +index 66dace6..8238fa2 100644 +--- a/drivers/net/fec_mpc52xx.c ++++ b/drivers/net/fec_mpc52xx.c +@@ -772,11 +772,6 @@ static void mpc52xx_fec_reset(struct net_device *dev) + + + /* ethtool interface */ +-static void mpc52xx_fec_get_drvinfo(struct net_device *dev, +- struct ethtool_drvinfo *info) +-{ +- strcpy(info->driver, DRIVER_NAME); +-} + + static int mpc52xx_fec_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) + { +@@ -811,7 +806,6 @@ static void mpc52xx_fec_set_msglevel(struct net_device *dev, u32 level) + } + + static const struct ethtool_ops mpc52xx_fec_ethtool_ops = { +- .get_drvinfo = mpc52xx_fec_get_drvinfo, + .get_settings = mpc52xx_fec_get_settings, + .set_settings = mpc52xx_fec_set_settings, + .get_link = ethtool_op_get_link, +diff --git a/drivers/net/pasemi_mac_ethtool.c b/drivers/net/pasemi_mac_ethtool.c +index 28a8622..29ff9ad 100644 +--- a/drivers/net/pasemi_mac_ethtool.c ++++ b/drivers/net/pasemi_mac_ethtool.c +@@ -77,21 +77,6 @@ pasemi_mac_ethtool_get_settings(struct net_device *netdev, + return phy_ethtool_gset(phydev, cmd); + } + +-static void +-pasemi_mac_ethtool_get_drvinfo(struct net_device *netdev, +- struct ethtool_drvinfo *drvinfo) +-{ +- struct pasemi_mac *mac; +- mac = netdev_priv(netdev); +- +- /* clear and fill out info */ +- memset(drvinfo, 0, sizeof(struct ethtool_drvinfo)); +- strncpy(drvinfo->driver, "pasemi_mac", 12); +- strcpy(drvinfo->version, "N/A"); +- strcpy(drvinfo->fw_version, "N/A"); +- strncpy(drvinfo->bus_info, pci_name(mac->pdev), 32); +-} +- + static u32 + pasemi_mac_ethtool_get_msglevel(struct net_device *netdev) + { +@@ -150,7 +135,6 @@ static void pasemi_mac_get_strings(struct net_device *netdev, u32 stringset, + + const struct ethtool_ops pasemi_mac_ethtool_ops = { + .get_settings = pasemi_mac_ethtool_get_settings, +- .get_drvinfo = pasemi_mac_ethtool_get_drvinfo, + .get_msglevel = pasemi_mac_ethtool_get_msglevel, + .set_msglevel = pasemi_mac_ethtool_set_msglevel, + .get_link = ethtool_op_get_link, +diff --git a/drivers/net/pcmcia/3c574_cs.c b/drivers/net/pcmcia/3c574_cs.c +index b58965a..7f9a4f4 100644 +--- a/drivers/net/pcmcia/3c574_cs.c ++++ b/drivers/net/pcmcia/3c574_cs.c +@@ -83,7 +83,6 @@ earlier 3Com products. 
+ #include <linux/skbuff.h> + #include <linux/if_arp.h> + #include <linux/ioport.h> +-#include <linux/ethtool.h> + #include <linux/bitops.h> + #include <linux/mii.h> + +@@ -249,7 +248,6 @@ static int el3_rx(struct net_device *dev, int worklimit); + static int el3_close(struct net_device *dev); + static void el3_tx_timeout(struct net_device *dev); + static int el3_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); +-static const struct ethtool_ops netdev_ethtool_ops; + static void set_rx_mode(struct net_device *dev); + static void set_multicast_list(struct net_device *dev); + +@@ -300,7 +298,6 @@ static int tc574_probe(struct pcmcia_device *link) + link->conf.ConfigIndex = 1; + + dev->netdev_ops = &el3_netdev_ops; +- SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops); + dev->watchdog_timeo = TX_TIMEOUT; + + return tc574_config(link); +@@ -1083,16 +1080,6 @@ static int el3_rx(struct net_device *dev, int worklimit) + return worklimit; + } + +-static void netdev_get_drvinfo(struct net_device *dev, +- struct ethtool_drvinfo *info) +-{ +- strcpy(info->driver, "3c574_cs"); +-} +- +-static const struct ethtool_ops netdev_ethtool_ops = { +- .get_drvinfo = netdev_get_drvinfo, +-}; +- + /* Provide ioctl() calls to examine the MII xcvr state. */ + static int el3_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) + { +diff --git a/drivers/net/pcmcia/axnet_cs.c b/drivers/net/pcmcia/axnet_cs.c +index 3131a59..40e5e7c 100644 +--- a/drivers/net/pcmcia/axnet_cs.c ++++ b/drivers/net/pcmcia/axnet_cs.c +@@ -33,7 +33,6 @@ + #include <linux/timer.h> + #include <linux/delay.h> + #include <linux/spinlock.h> +-#include <linux/ethtool.h> + #include <linux/netdevice.h> + #include <linux/etherdevice.h> + #include <linux/crc32.h> +@@ -98,7 +97,6 @@ static netdev_tx_t axnet_start_xmit(struct sk_buff *skb, + static struct net_device_stats *get_stats(struct net_device *dev); + static void set_multicast_list(struct net_device *dev); + static void axnet_tx_timeout(struct net_device *dev); +-static const struct ethtool_ops netdev_ethtool_ops; + static irqreturn_t ei_irq_wrapper(int irq, void *dev_id); + static void ei_watchdog(u_long arg); + static void axnet_reset_8390(struct net_device *dev); +@@ -186,7 +184,6 @@ static int axnet_probe(struct pcmcia_device *link) + + dev->netdev_ops = &axnet_netdev_ops; + +- SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops); + dev->watchdog_timeo = TX_TIMEOUT; + + return axnet_config(link); +@@ -683,16 +680,6 @@ reschedule: + add_timer(&info->watchdog); + } + +-static void netdev_get_drvinfo(struct net_device *dev, +- struct ethtool_drvinfo *info) +-{ +- strcpy(info->driver, "axnet_cs"); +-} +- +-static const struct ethtool_ops netdev_ethtool_ops = { +- .get_drvinfo = netdev_get_drvinfo, +-}; +- + /*====================================================================*/ + + static int axnet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) +diff --git a/drivers/net/pcmcia/ibmtr_cs.c b/drivers/net/pcmcia/ibmtr_cs.c +index 06618af..db0c890 100644 +--- a/drivers/net/pcmcia/ibmtr_cs.c ++++ b/drivers/net/pcmcia/ibmtr_cs.c +@@ -52,7 +52,6 @@ + #include <linux/string.h> + #include <linux/timer.h> + #include <linux/module.h> +-#include <linux/ethtool.h> + #include <linux/netdevice.h> + #include <linux/trdevice.h> + #include <linux/ibmtr.h> +@@ -120,16 +119,6 @@ typedef struct ibmtr_dev_t { + struct tok_info *ti; + } ibmtr_dev_t; + +-static void netdev_get_drvinfo(struct net_device *dev, +- struct ethtool_drvinfo *info) +-{ +- strcpy(info->driver, "ibmtr_cs"); +-} +- +-static const struct 
ethtool_ops netdev_ethtool_ops = { +- .get_drvinfo = netdev_get_drvinfo, +-}; +- + /*====================================================================== + + ibmtr_attach() creates an "instance" of the driver, allocating +@@ -170,8 +159,6 @@ static int __devinit ibmtr_attach(struct pcmcia_device *link) + + link->irq.Instance = info->dev = dev; + +- SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops); +- + return ibmtr_config(link); + } /* ibmtr_attach */ + +diff --git a/drivers/net/pcmcia/pcnet_cs.c b/drivers/net/pcmcia/pcnet_cs.c +index 94c9ad2..1b673b0 100644 +--- a/drivers/net/pcmcia/pcnet_cs.c ++++ b/drivers/net/pcmcia/pcnet_cs.c +@@ -36,7 +36,6 @@ + #include <linux/string.h> + #include <linux/timer.h> + #include <linux/delay.h> +-#include <linux/ethtool.h> + #include <linux/netdevice.h> + #include <linux/log2.h> + #include <linux/etherdevice.h> +@@ -111,7 +110,6 @@ static void pcnet_release(struct pcmcia_device *link); + static int pcnet_open(struct net_device *dev); + static int pcnet_close(struct net_device *dev); + static int ei_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); +-static const struct ethtool_ops netdev_ethtool_ops; + static irqreturn_t ei_irq_wrapper(int irq, void *dev_id); + static void ei_watchdog(u_long arg); + static void pcnet_reset_8390(struct net_device *dev); +@@ -654,8 +652,6 @@ static int pcnet_config(struct pcmcia_device *link) + ei_status.word16 = 1; + ei_status.reset_8390 = &pcnet_reset_8390; + +- SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops); +- + if (info->flags & (IS_DL10019|IS_DL10022)) + mii_phy_probe(dev); + +@@ -1175,18 +1171,6 @@ reschedule: + + /*====================================================================*/ + +-static void netdev_get_drvinfo(struct net_device *dev, +- struct ethtool_drvinfo *info) +-{ +- strcpy(info->driver, "pcnet_cs"); +-} +- +-static const struct ethtool_ops netdev_ethtool_ops = { +- .get_drvinfo = netdev_get_drvinfo, +-}; +- +-/*====================================================================*/ +- + + static int ei_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) + { +diff --git a/drivers/net/sc92031.c b/drivers/net/sc92031.c +index 8d60300..0926832 100644 +--- a/drivers/net/sc92031.c ++++ b/drivers/net/sc92031.c +@@ -1255,16 +1255,6 @@ static int sc92031_ethtool_set_settings(struct net_device *dev, + return 0; + } + +-static void sc92031_ethtool_get_drvinfo(struct net_device *dev, +- struct ethtool_drvinfo *drvinfo) +-{ +- struct sc92031_priv *priv = netdev_priv(dev); +- struct pci_dev *pdev = priv->pdev; +- +- strcpy(drvinfo->driver, SC92031_NAME); +- strcpy(drvinfo->bus_info, pci_name(pdev)); +-} +- + static void sc92031_ethtool_get_wol(struct net_device *dev, + struct ethtool_wolinfo *wolinfo) + { +@@ -1386,7 +1376,6 @@ static void sc92031_ethtool_get_ethtool_stats(struct net_device *dev, + static const struct ethtool_ops sc92031_ethtool_ops = { + .get_settings = sc92031_ethtool_get_settings, + .set_settings = sc92031_ethtool_set_settings, +- .get_drvinfo = sc92031_ethtool_get_drvinfo, + .get_wol = sc92031_ethtool_get_wol, + .set_wol = sc92031_ethtool_set_wol, + .nway_reset = sc92031_ethtool_nway_reset, +diff --git a/drivers/net/tulip/xircom_cb.c b/drivers/net/tulip/xircom_cb.c +index 0f2ca598..44159be 100644 +--- a/drivers/net/tulip/xircom_cb.c ++++ b/drivers/net/tulip/xircom_cb.c +@@ -27,7 +27,6 @@ + #include <linux/skbuff.h> + #include <linux/delay.h> + #include <linux/init.h> +-#include <linux/ethtool.h> + #include <linux/bitops.h> + + #include <asm/uaccess.h> +@@ -179,19 +178,6 @@ static void 
print_binary(unsigned int number) + } + #endif + +-static void netdev_get_drvinfo(struct net_device *dev, +- struct ethtool_drvinfo *info) +-{ +- struct xircom_private *private = netdev_priv(dev); +- +- strcpy(info->driver, "xircom_cb"); +- strcpy(info->bus_info, pci_name(private->pdev)); +-} +- +-static const struct ethtool_ops netdev_ethtool_ops = { +- .get_drvinfo = netdev_get_drvinfo, +-}; +- + static const struct net_device_ops netdev_ops = { + .ndo_open = xircom_open, + .ndo_stop = xircom_close, +@@ -277,7 +263,6 @@ static int __devinit xircom_probe(struct pci_dev *pdev, const struct pci_device_ + setup_descriptors(private); + + dev->netdev_ops = &netdev_ops; +- SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops); + pci_set_drvdata(pdev, dev); + + if (register_netdev(dev)) { +diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c +index f450bc9..2109514 100644 +--- a/drivers/net/usb/hso.c ++++ b/drivers/net/usb/hso.c +@@ -820,17 +820,7 @@ static netdev_tx_t hso_net_start_xmit(struct sk_buff *skb, + return NETDEV_TX_OK; + } + +-static void hso_get_drvinfo(struct net_device *net, struct ethtool_drvinfo *info) +-{ +- struct hso_net *odev = netdev_priv(net); +- +- strncpy(info->driver, driver_name, ETHTOOL_BUSINFO_LEN); +- strncpy(info->version, DRIVER_VERSION, ETHTOOL_BUSINFO_LEN); +- usb_make_path(odev->parent->usb, info->bus_info, sizeof info->bus_info); +-} +- + static const struct ethtool_ops ops = { +- .get_drvinfo = hso_get_drvinfo, + .get_link = ethtool_op_get_link + }; + +diff --git a/drivers/net/usb/kaweth.c b/drivers/net/usb/kaweth.c +index e391ef9..47d1926 100644 +--- a/drivers/net/usb/kaweth.c ++++ b/drivers/net/usb/kaweth.c +@@ -767,14 +767,6 @@ static int kaweth_close(struct net_device *net) + return 0; + } + +-static void kaweth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) +-{ +- struct kaweth_device *kaweth = netdev_priv(dev); +- +- strlcpy(info->driver, driver_name, sizeof(info->driver)); +- usb_make_path(kaweth->dev, info->bus_info, sizeof (info->bus_info)); +-} +- + static u32 kaweth_get_link(struct net_device *dev) + { + struct kaweth_device *kaweth = netdev_priv(dev); +@@ -783,7 +775,6 @@ static u32 kaweth_get_link(struct net_device *dev) + } + + static const struct ethtool_ops ops = { +- .get_drvinfo = kaweth_get_drvinfo, + .get_link = kaweth_get_link + }; + +diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c +index 1c88c2e..2e65100 100644 +--- a/drivers/net/wireless/ray_cs.c ++++ b/drivers/net/wireless/ray_cs.c +@@ -44,7 +44,6 @@ + #include <linux/if_arp.h> + #include <linux/ioport.h> + #include <linux/skbuff.h> +-#include <linux/ethtool.h> + #include <linux/ieee80211.h> + + #include <pcmcia/cs_types.h> +@@ -101,8 +100,6 @@ static int ray_dev_config(struct net_device *dev, struct ifmap *map); + static struct net_device_stats *ray_get_stats(struct net_device *dev); + static int ray_dev_init(struct net_device *dev); + +-static const struct ethtool_ops netdev_ethtool_ops; +- + static int ray_open(struct net_device *dev); + static netdev_tx_t ray_dev_start_xmit(struct sk_buff *skb, + struct net_device *dev); +@@ -362,7 +359,6 @@ static int ray_probe(struct pcmcia_device *p_dev) + + /* Raylink entries in the device structure */ + dev->netdev_ops = &ray_netdev_ops; +- SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops); + dev->wireless_handlers = &ray_handler_def; + #ifdef WIRELESS_SPY + local->wireless_data.spy_data = &local->spy_data; +@@ -1106,18 +1102,6 @@ AP to AP 1 1 dest AP src AP dest source + } + } /* end encapsulate_frame */ + 
+-/*===========================================================================*/ +- +-static void netdev_get_drvinfo(struct net_device *dev, +- struct ethtool_drvinfo *info) +-{ +- strcpy(info->driver, "ray_cs"); +-} +- +-static const struct ethtool_ops netdev_ethtool_ops = { +- .get_drvinfo = netdev_get_drvinfo, +-}; +- + /*====================================================================*/ + + /*------------------------------------------------------------------*/ +diff --git a/drivers/net/wireless/wl3501_cs.c b/drivers/net/wireless/wl3501_cs.c +index 4f1e0cf..22b2b43 100644 +--- a/drivers/net/wireless/wl3501_cs.c ++++ b/drivers/net/wireless/wl3501_cs.c +@@ -29,7 +29,6 @@ + + #include <linux/delay.h> + #include <linux/types.h> +-#include <linux/ethtool.h> + #include <linux/init.h> + #include <linux/interrupt.h> + #include <linux/in.h> +@@ -1436,15 +1435,6 @@ static struct iw_statistics *wl3501_get_wireless_stats(struct net_device *dev) + return wstats; + } + +-static void wl3501_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) +-{ +- strlcpy(info->driver, wl3501_dev_info, sizeof(info->driver)); +-} +- +-static const struct ethtool_ops ops = { +- .get_drvinfo = wl3501_get_drvinfo +-}; +- + /** + * wl3501_detach - deletes a driver "instance" + * @link - FILL_IN +@@ -1936,7 +1926,6 @@ static int wl3501_probe(struct pcmcia_device *p_dev) + this->p_dev = p_dev; + dev->wireless_data = &this->wireless_data; + dev->wireless_handlers = &wl3501_handler_def; +- SET_ETHTOOL_OPS(dev, &ops); + netif_stop_queue(dev); + p_dev->priv = p_dev->irq.Instance = dev; + +diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c +index 1a11d95..3f71199 100644 +--- a/drivers/net/xen-netfront.c ++++ b/drivers/net/xen-netfront.c +@@ -42,6 +42,7 @@ + #include <linux/mm.h> + #include <net/ip.h> + ++#include <xen/xen.h> + #include <xen/xenbus.h> + #include <xen/events.h> + #include <xen/page.h> +@@ -53,19 +54,36 @@ + + static const struct ethtool_ops xennet_ethtool_ops; + ++static int use_smartpoll = 0; ++module_param(use_smartpoll, int, 0600); ++MODULE_PARM_DESC (use_smartpoll, "Use smartpoll mechanism if available"); ++ + struct netfront_cb { + struct page *page; + unsigned offset; + }; + ++#define MICRO_SECOND 1000000UL ++#define NANO_SECOND 1000000000UL ++#define DEFAULT_SMART_POLL_FREQ 1000UL ++ ++struct netfront_smart_poll { ++ struct hrtimer timer; ++ struct net_device *netdev; ++ unsigned int smart_poll_freq; ++ unsigned int feature_smart_poll; ++ unsigned int active; ++ unsigned long counter; ++}; ++ + #define NETFRONT_SKB_CB(skb) ((struct netfront_cb *)((skb)->cb)) + + #define RX_COPY_THRESHOLD 256 + + #define GRANT_INVALID_REF 0 + +-#define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE) +-#define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE) ++#define NET_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE) ++#define NET_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE) + #define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256) + + struct netfront_info { +@@ -104,7 +122,7 @@ struct netfront_info { + + /* Receive-ring batched refills. 
*/ + #define RX_MIN_TARGET 8 +-#define RX_DFL_MIN_TARGET 64 ++#define RX_DFL_MIN_TARGET 80 + #define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256) + unsigned rx_min_target, rx_max_target, rx_target; + struct sk_buff_head rx_batch; +@@ -118,6 +136,8 @@ struct netfront_info { + unsigned long rx_pfn_array[NET_RX_RING_SIZE]; + struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1]; + struct mmu_update rx_mmu[NET_RX_RING_SIZE]; ++ ++ struct netfront_smart_poll smart_poll; + }; + + struct netfront_rx_info { +@@ -337,15 +357,17 @@ static int xennet_open(struct net_device *dev) + return 0; + } + +-static void xennet_tx_buf_gc(struct net_device *dev) ++static int xennet_tx_buf_gc(struct net_device *dev) + { + RING_IDX cons, prod; ++ RING_IDX cons_begin, cons_end; + unsigned short id; + struct netfront_info *np = netdev_priv(dev); + struct sk_buff *skb; + + BUG_ON(!netif_carrier_ok(dev)); + ++ cons_begin = np->tx.rsp_cons; + do { + prod = np->tx.sring->rsp_prod; + rmb(); /* Ensure we see responses up to 'rp'. */ +@@ -390,7 +412,11 @@ static void xennet_tx_buf_gc(struct net_device *dev) + mb(); /* update shared area */ + } while ((cons == prod) && (prod != np->tx.sring->rsp_prod)); + ++ cons_end = np->tx.rsp_cons; ++ + xennet_maybe_wake_tx(dev); ++ ++ return (cons_begin == cons_end); + } + + static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev, +@@ -1267,6 +1293,14 @@ static void xennet_disconnect_backend(struct netfront_info *info) + info->rx.sring = NULL; + } + ++static int netfront_suspend(struct xenbus_device *dev, pm_message_t state) ++{ ++ struct netfront_info *info = dev_get_drvdata(&dev->dev); ++ struct hrtimer *timer = &info->smart_poll.timer; ++ hrtimer_cancel(timer); ++ return 0; ++} ++ + /** + * We are reconnecting to the backend, due to a suspend/resume, or a backend + * driver restart. We tear down our netif structure and recreate it, but +@@ -1305,6 +1339,59 @@ static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[]) + return 0; + } + ++static enum hrtimer_restart smart_poll_function(struct hrtimer *timer) ++{ ++ struct netfront_smart_poll *psmart_poll; ++ struct net_device *dev; ++ struct netfront_info *np; ++ unsigned long flags; ++ unsigned int tx_active = 0, rx_active = 0; ++ ++ psmart_poll = container_of(timer, struct netfront_smart_poll, timer); ++ dev = psmart_poll->netdev; ++ np = netdev_priv(dev); ++ ++ spin_lock_irqsave(&np->tx_lock, flags); ++ ++ if (!np->rx.sring) ++ goto end; ++ ++ np->smart_poll.counter++; ++ ++ if (likely(netif_carrier_ok(dev))) { ++ tx_active = !(xennet_tx_buf_gc(dev)); ++ /* Under tx_lock: protects access to rx shared-ring indexes. 
*/ ++ if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) { ++ rx_active = 1; ++ napi_schedule(&np->napi); ++ } ++ } ++ ++ np->smart_poll.active |= (tx_active || rx_active); ++ if (np->smart_poll.counter % ++ (np->smart_poll.smart_poll_freq / 10) == 0) { ++ if (!np->smart_poll.active) { ++ np->rx.sring->private.netif.smartpoll_active = 0; ++ goto end; ++ } ++ np->smart_poll.active = 0; ++ } ++ ++ if (np->rx.sring->private.netif.smartpoll_active) { ++ if ( hrtimer_start(timer, ++ ktime_set(0, NANO_SECOND/psmart_poll->smart_poll_freq), ++ HRTIMER_MODE_REL) ) { ++ printk(KERN_DEBUG "Failed to start hrtimer," ++ "use interrupt mode for this packet\n"); ++ np->rx.sring->private.netif.smartpoll_active = 0; ++ } ++ } ++ ++end: ++ spin_unlock_irqrestore(&np->tx_lock, flags); ++ return HRTIMER_NORESTART; ++} ++ + static irqreturn_t xennet_interrupt(int irq, void *dev_id) + { + struct net_device *dev = dev_id; +@@ -1320,6 +1407,16 @@ static irqreturn_t xennet_interrupt(int irq, void *dev_id) + napi_schedule(&np->napi); + } + ++ if (np->smart_poll.feature_smart_poll) { ++ if ( hrtimer_start(&np->smart_poll.timer, ++ ktime_set(0,NANO_SECOND/np->smart_poll.smart_poll_freq), ++ HRTIMER_MODE_REL) ) { ++ printk(KERN_DEBUG "Failed to start hrtimer," ++ "use interrupt mode for this packet\n"); ++ np->rx.sring->private.netif.smartpoll_active = 0; ++ } ++ } ++ + spin_unlock_irqrestore(&np->tx_lock, flags); + + return IRQ_HANDLED; +@@ -1393,7 +1490,7 @@ static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info) + } + + /* Common code used when first setting up, and when resuming. */ +-static int talk_to_backend(struct xenbus_device *dev, ++static int talk_to_netback(struct xenbus_device *dev, + struct netfront_info *info) + { + const char *message; +@@ -1456,6 +1553,12 @@ again: + goto abort_transaction; + } + ++ err = xenbus_printf(xbt, dev->nodename, "feature-smart-poll", "%d", use_smartpoll); ++ if (err) { ++ message = "writing feature-smart-poll"; ++ goto abort_transaction; ++ } ++ + err = xenbus_transaction_end(xbt, 0); + if (err) { + if (err == -EAGAIN) +@@ -1543,7 +1646,26 @@ static int xennet_connect(struct net_device *dev) + return -ENODEV; + } + +- err = talk_to_backend(np->xbdev, np); ++ np->smart_poll.feature_smart_poll = 0; ++ if (use_smartpoll) { ++ err = xenbus_scanf(XBT_NIL, np->xbdev->otherend, ++ "feature-smart-poll", "%u", ++ &np->smart_poll.feature_smart_poll); ++ if (err != 1) ++ np->smart_poll.feature_smart_poll = 0; ++ } ++ ++ hrtimer_init(&np->smart_poll.timer, CLOCK_MONOTONIC, ++ HRTIMER_MODE_REL); ++ if (np->smart_poll.feature_smart_poll) { ++ np->smart_poll.timer.function = smart_poll_function; ++ np->smart_poll.netdev = dev; ++ np->smart_poll.smart_poll_freq = DEFAULT_SMART_POLL_FREQ; ++ np->smart_poll.active = 0; ++ np->smart_poll.counter = 0; ++ } ++ ++ err = talk_to_netback(np->xbdev, np); + if (err) + return err; + +@@ -1597,7 +1719,7 @@ static int xennet_connect(struct net_device *dev) + /** + * Callback received when the backend's state changes. 
+ */ +-static void backend_changed(struct xenbus_device *dev, ++static void netback_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) + { + struct netfront_info *np = dev_get_drvdata(&dev->dev); +@@ -1608,6 +1730,8 @@ static void backend_changed(struct xenbus_device *dev, + switch (backend_state) { + case XenbusStateInitialising: + case XenbusStateInitialised: ++ case XenbusStateReconfiguring: ++ case XenbusStateReconfigured: + case XenbusStateConnected: + case XenbusStateUnknown: + case XenbusStateClosed: +@@ -1628,12 +1752,30 @@ static void backend_changed(struct xenbus_device *dev, + } + } + ++static int xennet_get_coalesce(struct net_device *netdev, ++ struct ethtool_coalesce *ec) ++{ ++ struct netfront_info *np = netdev_priv(netdev); ++ ec->rx_coalesce_usecs = MICRO_SECOND / np->smart_poll.smart_poll_freq; ++ return 0; ++} ++ ++static int xennet_set_coalesce(struct net_device *netdev, ++ struct ethtool_coalesce *ec) ++{ ++ struct netfront_info *np = netdev_priv(netdev); ++ np->smart_poll.smart_poll_freq = MICRO_SECOND / ec->rx_coalesce_usecs; ++ return 0; ++} ++ + static const struct ethtool_ops xennet_ethtool_ops = + { + .set_tx_csum = ethtool_op_set_tx_csum, + .set_sg = xennet_set_sg, + .set_tso = xennet_set_tso, + .get_link = ethtool_op_get_link, ++ .get_coalesce = xennet_get_coalesce, ++ .set_coalesce = xennet_set_coalesce, + }; + + #ifdef CONFIG_SYSFS +@@ -1798,8 +1940,9 @@ static struct xenbus_driver netfront_driver = { + .ids = netfront_ids, + .probe = netfront_probe, + .remove = __devexit_p(xennet_remove), ++ .suspend = netfront_suspend, + .resume = netfront_resume, +- .otherend_changed = backend_changed, ++ .otherend_changed = netback_changed, + }; + + static int __init netif_init(void) +diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig +index fdc864f..7802fcd 100644 +--- a/drivers/pci/Kconfig ++++ b/drivers/pci/Kconfig +@@ -51,6 +51,16 @@ config PCI_STUB + + When in doubt, say N. + ++config XEN_PCIDEV_FRONTEND ++ tristate "Xen PCI Frontend" ++ depends on XEN && PCI && X86 ++ select HOTPLUG ++ select XEN_XENBUS_FRONTEND ++ default y ++ help ++ The PCI device frontend driver allows the kernel to import arbitrary ++ PCI devices from a PCI backend to support PCI driver domains. 
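The xennet_get_coalesce()/xennet_set_coalesce() hooks above expose the smart-poll timer through the standard ethtool coalescing knob by treating smart_poll_freq as a rate in timer fires per second, so the period round-trips as usecs = 1000000 / freq. The sketch below just replays that arithmetic; note that, like the patch, it performs the set-side division without guarding against a zero rx_coalesce_usecs from userspace:

#include <stdio.h>

#define MICRO_SECOND 1000000UL

int main(void)
{
	unsigned long freq = 1000UL; /* DEFAULT_SMART_POLL_FREQ, i.e. 1 kHz */

	unsigned long usecs = MICRO_SECOND / freq;  /* get_coalesce: 1000 us */
	unsigned long freq2 = MICRO_SECOND / usecs; /* set_coalesce */

	printf("%lu Hz -> %lu us -> %lu Hz\n", freq, usecs, freq2);
	return 0;
}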
++ + config HT_IRQ + bool "Interrupts on hypertransport devices" + default y +diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile +index 4a7f11d..b70aa4d 100644 +--- a/drivers/pci/Makefile ++++ b/drivers/pci/Makefile +@@ -31,6 +31,8 @@ obj-$(CONFIG_HT_IRQ) += htirq.o + # Build Intel IOMMU support + obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o + ++# Build Xen IOMMU support ++obj-$(CONFIG_PCI_XEN) += xen-iommu.o + obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o + + obj-$(CONFIG_PCI_IOV) += iov.o +@@ -60,6 +62,8 @@ obj-$(CONFIG_PCI_SYSCALL) += syscall.o + + obj-$(CONFIG_PCI_STUB) += pci-stub.o + ++obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o ++ + ifeq ($(CONFIG_PCI_DEBUG),y) + EXTRA_CFLAGS += -DDEBUG + endif +diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c +index cef28a7..1940183 100644 +--- a/drivers/pci/bus.c ++++ b/drivers/pci/bus.c +@@ -249,6 +249,7 @@ void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), + up_read(&pci_bus_sem); + } + ++EXPORT_SYMBOL_GPL(pci_walk_bus); + EXPORT_SYMBOL(pci_bus_alloc_resource); + EXPORT_SYMBOL_GPL(pci_bus_add_device); + EXPORT_SYMBOL(pci_bus_add_devices); +diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c +index 91d0390..24f6f28 100644 +--- a/drivers/pci/dmar.c ++++ b/drivers/pci/dmar.c +@@ -673,10 +673,13 @@ void __init detect_intel_iommu(void) + "x2apic and Intr-remapping.\n"); + #endif + #ifdef CONFIG_DMAR +- if (ret && !no_iommu && !iommu_detected && !swiotlb && +- !dmar_disabled) ++ if (ret && !no_iommu && !iommu_detected && !dmar_disabled) + iommu_detected = 1; + #endif ++#ifdef CONFIG_X86 ++ if (ret) ++ x86_init.iommu.iommu_init = intel_iommu_init; ++#endif + } + early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size); + dmar_tbl = NULL; +diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c +index ba83495..1506d4a 100644 +--- a/drivers/pci/intel-iommu.c ++++ b/drivers/pci/intel-iommu.c +@@ -3278,7 +3278,7 @@ int __init intel_iommu_init(void) + * Check the need for DMA-remapping initialization now. + * Above initialization will also be used by Interrupt-remapping. + */ +- if (no_iommu || swiotlb || dmar_disabled) ++ if (no_iommu || dmar_disabled) + return -ENODEV; + + iommu_init_mempool(); +@@ -3299,7 +3299,9 @@ int __init intel_iommu_init(void) + "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n"); + + init_timer(&unmap_timer); +- force_iommu = 1; ++#ifdef CONFIG_SWIOTLB ++ swiotlb = 0; ++#endif + dma_ops = &intel_dma_ops; + + init_iommu_sysfs(); +diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c +index e03fe98..f9db891 100644 +--- a/drivers/pci/iov.c ++++ b/drivers/pci/iov.c +@@ -706,6 +706,21 @@ irqreturn_t pci_sriov_migration(struct pci_dev *dev) + } + EXPORT_SYMBOL_GPL(pci_sriov_migration); + ++/** ++ * pci_num_vf - return number of VFs associated with a PF device_release_driver ++ * @dev: the PCI device ++ * ++ * Returns number of VFs, or 0 if SR-IOV is not enabled. 
++ */ ++int pci_num_vf(struct pci_dev *dev) ++{ ++ if (!dev || !dev->is_physfn) ++ return 0; ++ else ++ return dev->sriov->nr_virtfn; ++} ++EXPORT_SYMBOL_GPL(pci_num_vf); ++ + static int ats_alloc_one(struct pci_dev *dev, int ps) + { + int pos; +diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c +index 0fb1d05..c7e8a69 100644 +--- a/drivers/pci/msi.c ++++ b/drivers/pci/msi.c +@@ -19,6 +19,9 @@ + #include <linux/errno.h> + #include <linux/io.h> + ++#include <asm/xen/hypercall.h> ++#include <asm/xen/hypervisor.h> ++ + #include "pci.h" + #include "msi.h" + +@@ -391,6 +394,20 @@ static void __pci_restore_msix_state(struct pci_dev *dev) + + void pci_restore_msi_state(struct pci_dev *dev) + { ++ if (xen_initial_domain()) { ++ struct physdev_restore_msi physdev; ++ ++ if (!dev->msi_enabled && !dev->msix_enabled) ++ return; ++ ++ pci_intx_for_msi(dev, 0); ++ ++ physdev.bus = dev->bus->number; ++ physdev.devfn = dev->devfn; ++ HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &physdev); ++ ++ return; ++ } + __pci_restore_msi_state(dev); + __pci_restore_msix_state(dev); + } +diff --git a/drivers/pci/xen-iommu.c b/drivers/pci/xen-iommu.c +new file mode 100644 +index 0000000..ac6bcdb +--- /dev/null ++++ b/drivers/pci/xen-iommu.c +@@ -0,0 +1,271 @@ ++#include <linux/types.h> ++#include <linux/mm.h> ++#include <linux/string.h> ++#include <linux/pci.h> ++#include <linux/module.h> ++#include <linux/version.h> ++#include <linux/scatterlist.h> ++#include <linux/io.h> ++#include <linux/bug.h> ++ ++#include <xen/interface/xen.h> ++#include <xen/grant_table.h> ++#include <xen/page.h> ++#include <xen/xen-ops.h> ++ ++#include <asm/iommu.h> ++#include <asm/swiotlb.h> ++#include <asm/tlbflush.h> ++ ++#define IOMMU_BUG_ON(test) \ ++do { \ ++ if (unlikely(test)) { \ ++ printk(KERN_ALERT "Fatal DMA error! " \ ++ "Please use 'swiotlb=force'\n"); \ ++ BUG(); \ ++ } \ ++} while (0) ++ ++/* Print address range with message */ ++#define PAR(msg, addr, size) \ ++do { \ ++ printk(msg "[%#llx - %#llx]\n", \ ++ (unsigned long long)addr, \ ++ (unsigned long long)addr + size); \ ++} while (0) ++ ++static inline int address_needs_mapping(struct device *hwdev, ++ dma_addr_t addr) ++{ ++ dma_addr_t mask = DMA_BIT_MASK(32); ++ int ret; ++ ++ /* If the device has a mask, use it, otherwise default to 32 bits */ ++ if (hwdev) ++ mask = *hwdev->dma_mask; ++ ++ ret = (addr & ~mask) != 0; ++ ++ if (ret) { ++ printk(KERN_ERR "dma address needs mapping\n"); ++ printk(KERN_ERR "mask: %#llx\n address: [%#llx]\n", mask, addr); ++ } ++ return ret; ++} ++ ++static int check_pages_physically_contiguous(unsigned long pfn, ++ unsigned int offset, ++ size_t length) ++{ ++ unsigned long next_mfn; ++ int i; ++ int nr_pages; ++ ++ next_mfn = pfn_to_mfn(pfn); ++ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT; ++ ++ for (i = 1; i < nr_pages; i++) { ++ if (pfn_to_mfn(++pfn) != ++next_mfn) ++ return 0; ++ } ++ return 1; ++} ++ ++static int range_straddles_page_boundary(phys_addr_t p, size_t size) ++{ ++ unsigned long pfn = PFN_DOWN(p); ++ unsigned int offset = p & ~PAGE_MASK; ++ ++ if (offset + size <= PAGE_SIZE) ++ return 0; ++ if (check_pages_physically_contiguous(pfn, offset, size)) ++ return 0; ++ return 1; ++} ++ ++static inline void xen_dma_unmap_page(struct page *page) ++{ ++ /* Xen TODO: 2.6.18 xen calls __gnttab_dma_unmap_page here ++ * to deal with foreign pages. We'll need similar logic here at ++ * some point. 
++ */ ++} ++ ++/* Gets dma address of a page */ ++static inline dma_addr_t xen_dma_map_page(struct page *page) ++{ ++ /* Xen TODO: 2.6.18 xen calls __gnttab_dma_map_page here to deal ++ * with foreign pages. We'll need similar logic here at some ++ * point. ++ */ ++ return ((dma_addr_t)pfn_to_mfn(page_to_pfn(page))) << PAGE_SHIFT; ++} ++ ++static int xen_map_sg(struct device *hwdev, struct scatterlist *sg, ++ int nents, ++ enum dma_data_direction direction, ++ struct dma_attrs *attrs) ++{ ++ struct scatterlist *s; ++ struct page *page; ++ int i, rc; ++ ++ BUG_ON(direction == DMA_NONE); ++ WARN_ON(nents == 0 || sg[0].length == 0); ++ ++ for_each_sg(sg, s, nents, i) { ++ BUG_ON(!sg_page(s)); ++ page = sg_page(s); ++ s->dma_address = xen_dma_map_page(page) + s->offset; ++ s->dma_length = s->length; ++ IOMMU_BUG_ON(range_straddles_page_boundary( ++ page_to_phys(page), s->length)); ++ } ++ ++ rc = nents; ++ ++ flush_write_buffers(); ++ return rc; ++} ++ ++static void xen_unmap_sg(struct device *hwdev, struct scatterlist *sg, ++ int nents, ++ enum dma_data_direction direction, ++ struct dma_attrs *attrs) ++{ ++ struct scatterlist *s; ++ struct page *page; ++ int i; ++ ++ for_each_sg(sg, s, nents, i) { ++ page = pfn_to_page(mfn_to_pfn(PFN_DOWN(s->dma_address))); ++ xen_dma_unmap_page(page); ++ } ++} ++ ++static void *xen_alloc_coherent(struct device *dev, size_t size, ++ dma_addr_t *dma_handle, gfp_t gfp) ++{ ++ void *ret; ++ unsigned int order = get_order(size); ++ unsigned long vstart; ++ u64 mask; ++ ++ /* ignore region specifiers */ ++ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM); ++ ++ if (dma_alloc_from_coherent(dev, size, dma_handle, &ret)) ++ return ret; ++ ++ if (dev == NULL || (dev->coherent_dma_mask < DMA_BIT_MASK(32))) ++ gfp |= GFP_DMA; ++ ++ vstart = __get_free_pages(gfp, order); ++ ret = (void *)vstart; ++ ++ if (dev != NULL && dev->coherent_dma_mask) ++ mask = dev->coherent_dma_mask; ++ else ++ mask = DMA_BIT_MASK(32); ++ ++ if (ret != NULL) { ++ if (xen_create_contiguous_region(vstart, order, ++ fls64(mask)) != 0) { ++ free_pages(vstart, order); ++ return NULL; ++ } ++ memset(ret, 0, size); ++ *dma_handle = virt_to_machine(ret).maddr; ++ } ++ return ret; ++} ++ ++static void xen_free_coherent(struct device *dev, size_t size, ++ void *vaddr, dma_addr_t dma_addr) ++{ ++ int order = get_order(size); ++ ++ if (dma_release_from_coherent(dev, order, vaddr)) ++ return; ++ ++ xen_destroy_contiguous_region((unsigned long)vaddr, order); ++ free_pages((unsigned long)vaddr, order); ++} ++ ++static dma_addr_t xen_map_page(struct device *dev, struct page *page, ++ unsigned long offset, size_t size, ++ enum dma_data_direction direction, ++ struct dma_attrs *attrs) ++{ ++ dma_addr_t dma; ++ ++ BUG_ON(direction == DMA_NONE); ++ ++ WARN_ON(size == 0); ++ ++ dma = xen_dma_map_page(page) + offset; ++ ++ IOMMU_BUG_ON(address_needs_mapping(dev, dma)); ++ flush_write_buffers(); ++ return dma; ++} ++ ++static void xen_unmap_page(struct device *dev, dma_addr_t dma_addr, ++ size_t size, ++ enum dma_data_direction direction, ++ struct dma_attrs *attrs) ++{ ++ BUG_ON(direction == DMA_NONE); ++ xen_dma_unmap_page(pfn_to_page(mfn_to_pfn(PFN_DOWN(dma_addr)))); ++} ++ ++static struct dma_map_ops xen_dma_ops = { ++ .dma_supported = NULL, ++ ++ .alloc_coherent = xen_alloc_coherent, ++ .free_coherent = xen_free_coherent, ++ ++ .map_page = xen_map_page, ++ .unmap_page = xen_unmap_page, ++ ++ .map_sg = xen_map_sg, ++ .unmap_sg = xen_unmap_sg, ++ ++ .mapping_error = NULL, ++ ++ .is_phys = 0, ++}; ++ ++static struct 
dma_map_ops xen_swiotlb_dma_ops = { ++ .dma_supported = swiotlb_dma_supported, ++ ++ .alloc_coherent = xen_alloc_coherent, ++ .free_coherent = xen_free_coherent, ++ ++ .map_page = swiotlb_map_page, ++ .unmap_page = swiotlb_unmap_page, ++ ++ .map_sg = swiotlb_map_sg_attrs, ++ .unmap_sg = swiotlb_unmap_sg_attrs, ++ ++ .mapping_error = swiotlb_dma_mapping_error, ++ ++ .is_phys = 0, ++}; ++ ++void __init xen_iommu_init(void) ++{ ++ if (!xen_pv_domain()) ++ return; ++ ++ printk(KERN_INFO "Xen: Initializing Xen DMA ops\n"); ++ ++ force_iommu = 0; ++ dma_ops = &xen_dma_ops; ++ ++ if (swiotlb) { ++ printk(KERN_INFO "Xen: Enabling DMA fallback to swiotlb\n"); ++ dma_ops = &xen_swiotlb_dma_ops; ++ } ++} ++ +diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c +new file mode 100644 +index 0000000..76d0bdd +--- /dev/null ++++ b/drivers/pci/xen-pcifront.c +@@ -0,0 +1,1157 @@ ++/* ++ * PCI Frontend Xenbus Setup - handles setup with backend (imports page/evtchn) ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/mm.h> ++#include <xen/xenbus.h> ++#include <xen/events.h> ++#include <xen/grant_table.h> ++#include <xen/page.h> ++#include <linux/spinlock.h> ++#include <linux/pci.h> ++#include <linux/msi.h> ++#include <xen/xenbus.h> ++#include <xen/interface/io/pciif.h> ++#include <asm/xen/pci.h> ++#include <linux/interrupt.h> ++#include <asm/atomic.h> ++#include <linux/workqueue.h> ++#include <linux/bitops.h> ++#include <linux/time.h> ++ ++ ++#ifndef __init_refok ++#define __init_refok ++#endif ++ ++#define INVALID_GRANT_REF (0) ++#define INVALID_EVTCHN (-1) ++ ++ ++struct pci_bus_entry { ++ struct list_head list; ++ struct pci_bus *bus; ++}; ++ ++#define _PDEVB_op_active (0) ++#define PDEVB_op_active (1 << (_PDEVB_op_active)) ++ ++struct pcifront_device { ++ struct xenbus_device *xdev; ++ struct list_head root_buses; ++ ++ int evtchn; ++ int gnt_ref; ++ ++ int irq; ++ ++ /* Lock this when doing any operations in sh_info */ ++ spinlock_t sh_info_lock; ++ struct xen_pci_sharedinfo *sh_info; ++ struct work_struct op_work; ++ unsigned long flags; ++ ++}; ++ ++struct pcifront_sd { ++ int domain; ++ struct pcifront_device *pdev; ++}; ++ ++static inline struct pcifront_device * ++pcifront_get_pdev(struct pcifront_sd *sd) ++{ ++ return sd->pdev; ++} ++ ++static inline void pcifront_init_sd(struct pcifront_sd *sd, ++ unsigned int domain, unsigned int bus, ++ struct pcifront_device *pdev) ++{ ++ sd->domain = domain; ++ sd->pdev = pdev; ++} ++ ++static inline void pcifront_setup_root_resources(struct pci_bus *bus, ++ struct pcifront_sd *sd) ++{ ++} ++ ++ ++DEFINE_SPINLOCK(pcifront_dev_lock); ++static struct pcifront_device *pcifront_dev; ++ ++static int verbose_request; ++module_param(verbose_request, int, 0644); ++ ++static int errno_to_pcibios_err(int errno) ++{ ++ switch (errno) { ++ case XEN_PCI_ERR_success: ++ return PCIBIOS_SUCCESSFUL; ++ ++ case XEN_PCI_ERR_dev_not_found: ++ return PCIBIOS_DEVICE_NOT_FOUND; ++ ++ case XEN_PCI_ERR_invalid_offset: ++ case XEN_PCI_ERR_op_failed: ++ return PCIBIOS_BAD_REGISTER_NUMBER; ++ ++ case XEN_PCI_ERR_not_implemented: ++ return PCIBIOS_FUNC_NOT_SUPPORTED; ++ ++ case XEN_PCI_ERR_access_denied: ++ return PCIBIOS_SET_FAILED; ++ } ++ return errno; ++} ++ ++static inline void schedule_pcifront_aer_op(struct pcifront_device *pdev) ++{ ++ if (test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags) ++ && !test_and_set_bit(_PDEVB_op_active, &pdev->flags)) { ++ 
dev_dbg(&pdev->xdev->dev, "schedule aer frontend job\n"); ++ schedule_work(&pdev->op_work); ++ } ++} ++ ++static int do_pci_op(struct pcifront_device *pdev, struct xen_pci_op *op) ++{ ++ int err = 0; ++ struct xen_pci_op *active_op = &pdev->sh_info->op; ++ unsigned long irq_flags; ++ evtchn_port_t port = pdev->evtchn; ++ unsigned irq = pdev->irq; ++ s64 ns, ns_timeout; ++ struct timeval tv; ++ ++ spin_lock_irqsave(&pdev->sh_info_lock, irq_flags); ++ ++ memcpy(active_op, op, sizeof(struct xen_pci_op)); ++ ++ /* Go */ ++ wmb(); ++ set_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags); ++ notify_remote_via_evtchn(port); ++ ++ /* ++ * We set a poll timeout of 3 seconds but give up on return after ++ * 2 seconds. It is better to time out too late rather than too early ++ * (in the latter case we end up continually re-executing poll() with a ++ * timeout in the past). 1s difference gives plenty of slack for error. ++ */ ++ do_gettimeofday(&tv); ++ ns_timeout = timeval_to_ns(&tv) + 2 * (s64)NSEC_PER_SEC; ++ ++ xen_clear_irq_pending(irq); ++ ++ while (test_bit(_XEN_PCIF_active, ++ (unsigned long *)&pdev->sh_info->flags)) { ++ xen_poll_irq_timeout(irq, jiffies + 3*HZ); ++ xen_clear_irq_pending(irq); ++ do_gettimeofday(&tv); ++ ns = timeval_to_ns(&tv); ++ if (ns > ns_timeout) { ++ dev_err(&pdev->xdev->dev, ++ "pciback not responding!!!\n"); ++ clear_bit(_XEN_PCIF_active, ++ (unsigned long *)&pdev->sh_info->flags); ++ err = XEN_PCI_ERR_dev_not_found; ++ goto out; ++ } ++ } ++ ++ /* ++ * We might lose backend service request since we ++ * reuse same evtchn with pci_conf backend response. So re-schedule ++ * aer pcifront service. ++ */ ++ if (test_bit(_XEN_PCIB_active, ++ (unsigned long *)&pdev->sh_info->flags)) { ++ dev_err(&pdev->xdev->dev, ++ "schedule aer pcifront service\n"); ++ schedule_pcifront_aer_op(pdev); ++ } ++ ++ memcpy(op, active_op, sizeof(struct xen_pci_op)); ++ ++ err = op->err; ++out: ++ spin_unlock_irqrestore(&pdev->sh_info_lock, irq_flags); ++ return err; ++} ++ ++/* Access to this function is spinlocked in drivers/pci/access.c */ ++static int pcifront_bus_read(struct pci_bus *bus, unsigned int devfn, ++ int where, int size, u32 *val) ++{ ++ int err = 0; ++ struct xen_pci_op op = { ++ .cmd = XEN_PCI_OP_conf_read, ++ .domain = pci_domain_nr(bus), ++ .bus = bus->number, ++ .devfn = devfn, ++ .offset = where, ++ .size = size, ++ }; ++ struct pcifront_sd *sd = bus->sysdata; ++ struct pcifront_device *pdev = pcifront_get_pdev(sd); ++ ++ if (verbose_request) ++ dev_info(&pdev->xdev->dev, ++ "read dev=%04x:%02x:%02x.%01x - offset %x size %d\n", ++ pci_domain_nr(bus), bus->number, PCI_SLOT(devfn), ++ PCI_FUNC(devfn), where, size); ++ ++ err = do_pci_op(pdev, &op); ++ ++ if (likely(!err)) { ++ if (verbose_request) ++ dev_info(&pdev->xdev->dev, "read got back value %x\n", ++ op.value); ++ ++ *val = op.value; ++ } else if (err == -ENODEV) { ++ /* No device here, pretend that it just returned 0 */ ++ err = 0; ++ *val = 0; ++ } ++ ++ return errno_to_pcibios_err(err); ++} ++ ++/* Access to this function is spinlocked in drivers/pci/access.c */ ++static int pcifront_bus_write(struct pci_bus *bus, unsigned int devfn, ++ int where, int size, u32 val) ++{ ++ struct xen_pci_op op = { ++ .cmd = XEN_PCI_OP_conf_write, ++ .domain = pci_domain_nr(bus), ++ .bus = bus->number, ++ .devfn = devfn, ++ .offset = where, ++ .size = size, ++ .value = val, ++ }; ++ struct pcifront_sd *sd = bus->sysdata; ++ struct pcifront_device *pdev = pcifront_get_pdev(sd); ++ ++ if (verbose_request) ++ 
dev_info(&pdev->xdev->dev, ++ "write dev=%04x:%02x:%02x.%01x - " ++ "offset %x size %d val %x\n", ++ pci_domain_nr(bus), bus->number, ++ PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val); ++ ++ return errno_to_pcibios_err(do_pci_op(pdev, &op)); ++} ++ ++struct pci_ops pcifront_bus_ops = { ++ .read = pcifront_bus_read, ++ .write = pcifront_bus_write, ++}; ++ ++#ifdef CONFIG_PCI_MSI ++static int pci_frontend_enable_msix(struct pci_dev *dev, ++ int **vector, int nvec) ++{ ++ int err; ++ int i; ++ struct xen_pci_op op = { ++ .cmd = XEN_PCI_OP_enable_msix, ++ .domain = pci_domain_nr(dev->bus), ++ .bus = dev->bus->number, ++ .devfn = dev->devfn, ++ .value = nvec, ++ }; ++ struct pcifront_sd *sd = dev->bus->sysdata; ++ struct pcifront_device *pdev = pcifront_get_pdev(sd); ++ struct msi_desc *entry; ++ ++ if (nvec > SH_INFO_MAX_VEC) { ++ dev_err(&dev->dev, "too much vector for pci frontend: %x." ++ " Increase SH_INFO_MAX_VEC.\n", nvec); ++ return -EINVAL; ++ } ++ ++ i = 0; ++ list_for_each_entry(entry, &dev->msi_list, list) { ++ op.msix_entries[i].entry = entry->msi_attrib.entry_nr; ++ /* Vector is useless at this point. */ ++ op.msix_entries[i].vector = -1; ++ i++; ++ } ++ ++ err = do_pci_op(pdev, &op); ++ ++ if (likely(!err)) { ++ if (likely(!op.value)) { ++ /* we get the result */ ++ for (i = 0; i < nvec; i++) ++ *(*vector+i) = op.msix_entries[i].vector; ++ return 0; ++ } else { ++ printk(KERN_DEBUG "enable msix get value %x\n", ++ op.value); ++ return op.value; ++ } ++ } else { ++ dev_err(&dev->dev, "enable msix get err %x\n", err); ++ return err; ++ } ++} ++ ++static void pci_frontend_disable_msix(struct pci_dev *dev) ++{ ++ int err; ++ struct xen_pci_op op = { ++ .cmd = XEN_PCI_OP_disable_msix, ++ .domain = pci_domain_nr(dev->bus), ++ .bus = dev->bus->number, ++ .devfn = dev->devfn, ++ }; ++ struct pcifront_sd *sd = dev->bus->sysdata; ++ struct pcifront_device *pdev = pcifront_get_pdev(sd); ++ ++ err = do_pci_op(pdev, &op); ++ ++ /* What should do for error ? */ ++ if (err) ++ dev_err(&dev->dev, "pci_disable_msix get err %x\n", err); ++} ++ ++static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector) ++{ ++ int err; ++ struct xen_pci_op op = { ++ .cmd = XEN_PCI_OP_enable_msi, ++ .domain = pci_domain_nr(dev->bus), ++ .bus = dev->bus->number, ++ .devfn = dev->devfn, ++ }; ++ struct pcifront_sd *sd = dev->bus->sysdata; ++ struct pcifront_device *pdev = pcifront_get_pdev(sd); ++ ++ err = do_pci_op(pdev, &op); ++ if (likely(!err)) { ++ *(*vector) = op.value; ++ } else { ++ dev_err(&dev->dev, "pci frontend enable msi failed for dev " ++ "%x:%x \n", op.bus, op.devfn); ++ err = -EINVAL; ++ } ++ return err; ++} ++ ++static void pci_frontend_disable_msi(struct pci_dev *dev) ++{ ++ int err; ++ struct xen_pci_op op = { ++ .cmd = XEN_PCI_OP_disable_msi, ++ .domain = pci_domain_nr(dev->bus), ++ .bus = dev->bus->number, ++ .devfn = dev->devfn, ++ }; ++ struct pcifront_sd *sd = dev->bus->sysdata; ++ struct pcifront_device *pdev = pcifront_get_pdev(sd); ++ ++ err = do_pci_op(pdev, &op); ++ if (err == XEN_PCI_ERR_dev_not_found) { ++ /* XXX No response from backend, what shall we do? */ ++ printk(KERN_DEBUG "get no response from backend for disable MSI\n"); ++ return; ++ } ++ if (err) ++ /* how can pciback notify us fail? 
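++ (no dedicated error path exists; log the stale response and continue)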
*/ ++ printk(KERN_DEBUG "get fake response frombackend \n"); ++} ++ ++static struct xen_pci_frontend_ops pci_frontend_ops = { ++ .enable_msi = pci_frontend_enable_msi, ++ .disable_msi = pci_frontend_disable_msi, ++ .enable_msix = pci_frontend_enable_msix, ++ .disable_msix = pci_frontend_disable_msix, ++}; ++ ++static void pci_frontend_registrar(int enable) ++{ ++ if (enable) ++ xen_pci_frontend = &pci_frontend_ops; ++ else ++ xen_pci_frontend = NULL; ++}; ++#else ++static inline void pci_frontend_registrar(int enable) { }; ++#endif /* CONFIG_PCI_MSI */ ++ ++/* Claim resources for the PCI frontend as-is, backend won't allow changes */ ++static int pcifront_claim_resource(struct pci_dev *dev, void *data) ++{ ++ struct pcifront_device *pdev = data; ++ int i; ++ struct resource *r; ++ ++ for (i = 0; i < PCI_NUM_RESOURCES; i++) { ++ r = &dev->resource[i]; ++ ++ if (!r->parent && r->start && r->flags) { ++ dev_info(&pdev->xdev->dev, "claiming resource %s/%d\n", ++ pci_name(dev), i); ++ if (pci_claim_resource(dev, i)) { ++ dev_err(&pdev->xdev->dev, "Could not claim " ++ "resource %s/%d! Device offline. Try " ++ "giving less than 4GB to domain.\n", ++ pci_name(dev), i); ++ } ++ } ++ } ++ ++ return 0; ++} ++ ++int __devinit pcifront_scan_bus(struct pcifront_device *pdev, ++ unsigned int domain, unsigned int bus, ++ struct pci_bus *b) ++{ ++ struct pci_dev *d; ++ unsigned int devfn; ++ int err; ++ ++ /* Scan the bus for functions and add. ++ * We omit handling of PCI bridge attachment because pciback prevents ++ * bridges from being exported. ++ */ ++ for (devfn = 0; devfn < 0x100; devfn++) { ++ d = pci_get_slot(b, devfn); ++ if (d) { ++ /* Device is already known. */ ++ pci_dev_put(d); ++ continue; ++ } ++ ++ d = pci_scan_single_device(b, devfn); ++ if (d) ++ dev_info(&pdev->xdev->dev, "New device on " ++ "%04x:%02x:%02x.%02x found.\n", domain, bus, ++ PCI_SLOT(devfn), PCI_FUNC(devfn)); ++ } ++ ++ return 0; ++} ++ ++int __devinit pcifront_scan_root(struct pcifront_device *pdev, ++ unsigned int domain, unsigned int bus) ++{ ++ struct pci_bus *b; ++ struct pcifront_sd *sd = NULL; ++ struct pci_bus_entry *bus_entry = NULL; ++ int err = 0; ++ ++#ifndef CONFIG_PCI_DOMAINS ++ if (domain != 0) { ++ dev_err(&pdev->xdev->dev, ++ "PCI Root in non-zero PCI Domain! domain=%d\n", domain); ++ dev_err(&pdev->xdev->dev, ++ "Please compile with CONFIG_PCI_DOMAINS\n"); ++ err = -EINVAL; ++ goto err_out; ++ } ++#endif ++ ++ dev_info(&pdev->xdev->dev, "Creating PCI Frontend Bus %04x:%02x\n", ++ domain, bus); ++ ++ bus_entry = kmalloc(sizeof(*bus_entry), GFP_KERNEL); ++ sd = kmalloc(sizeof(*sd), GFP_KERNEL); ++ if (!bus_entry || !sd) { ++ err = -ENOMEM; ++ goto err_out; ++ } ++ pcifront_init_sd(sd, domain, bus, pdev); ++ ++ b = pci_scan_bus_parented(&pdev->xdev->dev, bus, ++ &pcifront_bus_ops, sd); ++ if (!b) { ++ dev_err(&pdev->xdev->dev, ++ "Error creating PCI Frontend Bus!\n"); ++ err = -ENOMEM; ++ goto err_out; ++ } ++ ++ pcifront_setup_root_resources(b, sd); ++ bus_entry->bus = b; ++ ++ list_add(&bus_entry->list, &pdev->root_buses); ++ ++ /* pci_scan_bus_parented skips devices which do not have a have ++ * devfn==0. The pcifront_scan_bus enumerates all devfn. */ ++ err = pcifront_scan_bus(pdev, domain, bus, b); ++ ++ /* Claim resources before going "live" with our devices */ ++ pci_walk_bus(b, pcifront_claim_resource, pdev); ++ ++ /* Create SysFS and notify udev of the devices. 
Aka: "going live" */ ++ pci_bus_add_devices(b); ++ ++ return err; ++ ++err_out: ++ kfree(bus_entry); ++ kfree(sd); ++ ++ return err; ++} ++ ++int __devinit pcifront_rescan_root(struct pcifront_device *pdev, ++ unsigned int domain, unsigned int bus) ++{ ++ int err; ++ struct pci_bus *b; ++ ++#ifndef CONFIG_PCI_DOMAINS ++ if (domain != 0) { ++ dev_err(&pdev->xdev->dev, ++ "PCI Root in non-zero PCI Domain! domain=%d\n", domain); ++ dev_err(&pdev->xdev->dev, ++ "Please compile with CONFIG_PCI_DOMAINS\n"); ++ return -EINVAL; ++ } ++#endif ++ ++ dev_info(&pdev->xdev->dev, "Rescanning PCI Frontend Bus %04x:%02x\n", ++ domain, bus); ++ ++ b = pci_find_bus(domain, bus); ++ if (!b) ++ /* If the bus is unknown, create it. */ ++ return pcifront_scan_root(pdev, domain, bus); ++ ++ err = pcifront_scan_bus(pdev, domain, bus, b); ++ ++ /* Claim resources before going "live" with our devices */ ++ pci_walk_bus(b, pcifront_claim_resource, pdev); ++ ++ /* Create SysFS and notify udev of the devices. Aka: "going live" */ ++ pci_bus_add_devices(b); ++ ++ return err; ++} ++ ++static void free_root_bus_devs(struct pci_bus *bus) ++{ ++ struct pci_dev *dev; ++ ++ while (!list_empty(&bus->devices)) { ++ dev = container_of(bus->devices.next, struct pci_dev, ++ bus_list); ++ dev_dbg(&dev->dev, "removing device\n"); ++ pci_remove_bus_device(dev); ++ } ++} ++ ++void pcifront_free_roots(struct pcifront_device *pdev) ++{ ++ struct pci_bus_entry *bus_entry, *t; ++ ++ dev_dbg(&pdev->xdev->dev, "cleaning up root buses\n"); ++ ++ list_for_each_entry_safe(bus_entry, t, &pdev->root_buses, list) { ++ list_del(&bus_entry->list); ++ ++ free_root_bus_devs(bus_entry->bus); ++ ++ kfree(bus_entry->bus->sysdata); ++ ++ device_unregister(bus_entry->bus->bridge); ++ pci_remove_bus(bus_entry->bus); ++ ++ kfree(bus_entry); ++ } ++} ++ ++static pci_ers_result_t pcifront_common_process(int cmd, ++ struct pcifront_device *pdev, ++ pci_channel_state_t state) ++{ ++ pci_ers_result_t result; ++ struct pci_driver *pdrv; ++ int bus = pdev->sh_info->aer_op.bus; ++ int devfn = pdev->sh_info->aer_op.devfn; ++ struct pci_dev *pcidev; ++ int flag = 0; ++ ++ dev_dbg(&pdev->xdev->dev, ++ "pcifront AER process: cmd %x (bus:%x, devfn%x)", ++ cmd, bus, devfn); ++ result = PCI_ERS_RESULT_NONE; ++ ++ pcidev = pci_get_bus_and_slot(bus, devfn); ++ if (!pcidev || !pcidev->driver) { ++ dev_err(&pcidev->dev, ++ "device or driver is NULL\n"); ++ return result; ++ } ++ pdrv = pcidev->driver; ++ ++ if (get_driver(&pdrv->driver)) { ++ if (pdrv->err_handler && pdrv->err_handler->error_detected) { ++ dev_dbg(&pcidev->dev, ++ "trying to call AER service\n"); ++ if (pcidev) { ++ flag = 1; ++ switch (cmd) { ++ case XEN_PCI_OP_aer_detected: ++ result = pdrv->err_handler-> ++ error_detected(pcidev, state); ++ break; ++ case XEN_PCI_OP_aer_mmio: ++ result = pdrv->err_handler-> ++ mmio_enabled(pcidev); ++ break; ++ case XEN_PCI_OP_aer_slotreset: ++ result = pdrv->err_handler-> ++ slot_reset(pcidev); ++ break; ++ case XEN_PCI_OP_aer_resume: ++ pdrv->err_handler->resume(pcidev); ++ break; ++ default: ++ dev_err(&pdev->xdev->dev, ++ "bad request in aer recovery " ++ "operation!\n"); ++ ++ } ++ } ++ } ++ put_driver(&pdrv->driver); ++ } ++ if (!flag) ++ result = PCI_ERS_RESULT_NONE; ++ ++ return result; ++} ++ ++ ++void pcifront_do_aer(struct work_struct *data) ++{ ++ struct pcifront_device *pdev = ++ container_of(data, struct pcifront_device, op_work); ++ int cmd = pdev->sh_info->aer_op.cmd; ++ pci_channel_state_t state = ++ (pci_channel_state_t)pdev->sh_info->aer_op.err; ++ ++ 
/*If a pci_conf op is in progress, ++ we have to wait until it is done before service aer op*/ ++ dev_dbg(&pdev->xdev->dev, ++ "pcifront service aer bus %x devfn %x\n", ++ pdev->sh_info->aer_op.bus, pdev->sh_info->aer_op.devfn); ++ ++ pdev->sh_info->aer_op.err = pcifront_common_process(cmd, pdev, state); ++ ++ wmb(); ++ clear_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags); ++ notify_remote_via_evtchn(pdev->evtchn); ++ ++ /*in case of we lost an aer request in four lines time_window*/ ++ smp_mb__before_clear_bit(); ++ clear_bit(_PDEVB_op_active, &pdev->flags); ++ smp_mb__after_clear_bit(); ++ ++ schedule_pcifront_aer_op(pdev); ++ ++} ++ ++irqreturn_t pcifront_handler_aer(int irq, void *dev) ++{ ++ struct pcifront_device *pdev = dev; ++ schedule_pcifront_aer_op(pdev); ++ return IRQ_HANDLED; ++} ++int pcifront_connect(struct pcifront_device *pdev) ++{ ++ int err = 0; ++ ++ spin_lock(&pcifront_dev_lock); ++ ++ if (!pcifront_dev) { ++ dev_info(&pdev->xdev->dev, "Installing PCI frontend\n"); ++ pcifront_dev = pdev; ++ } else { ++ dev_err(&pdev->xdev->dev, "PCI frontend already installed!\n"); ++ err = -EEXIST; ++ } ++ ++ spin_unlock(&pcifront_dev_lock); ++ ++ return err; ++} ++ ++void pcifront_disconnect(struct pcifront_device *pdev) ++{ ++ spin_lock(&pcifront_dev_lock); ++ ++ if (pdev == pcifront_dev) { ++ dev_info(&pdev->xdev->dev, ++ "Disconnecting PCI Frontend Buses\n"); ++ pcifront_dev = NULL; ++ } ++ ++ spin_unlock(&pcifront_dev_lock); ++} ++static struct pcifront_device *alloc_pdev(struct xenbus_device *xdev) ++{ ++ struct pcifront_device *pdev; ++ ++ pdev = kzalloc(sizeof(struct pcifront_device), GFP_KERNEL); ++ if (pdev == NULL) ++ goto out; ++ ++ pdev->sh_info = ++ (struct xen_pci_sharedinfo *)__get_free_page(GFP_KERNEL); ++ if (pdev->sh_info == NULL) { ++ kfree(pdev); ++ pdev = NULL; ++ goto out; ++ } ++ pdev->sh_info->flags = 0; ++ ++ /*Flag for registering PV AER handler*/ ++ set_bit(_XEN_PCIB_AERHANDLER, (void *)&pdev->sh_info->flags); ++ ++ dev_set_drvdata(&xdev->dev, pdev); ++ pdev->xdev = xdev; ++ ++ INIT_LIST_HEAD(&pdev->root_buses); ++ ++ spin_lock_init(&pdev->sh_info_lock); ++ ++ pdev->evtchn = INVALID_EVTCHN; ++ pdev->gnt_ref = INVALID_GRANT_REF; ++ pdev->irq = -1; ++ ++ INIT_WORK(&pdev->op_work, pcifront_do_aer); ++ ++ dev_dbg(&xdev->dev, "Allocated pdev @ 0x%p pdev->sh_info @ 0x%p\n", ++ pdev, pdev->sh_info); ++out: ++ return pdev; ++} ++ ++static void free_pdev(struct pcifront_device *pdev) ++{ ++ dev_dbg(&pdev->xdev->dev, "freeing pdev @ 0x%p\n", pdev); ++ ++ pcifront_free_roots(pdev); ++ ++ /*For PCIE_AER error handling job*/ ++ flush_scheduled_work(); ++ unbind_from_irqhandler(pdev->irq, pdev); ++ ++ if (pdev->evtchn != INVALID_EVTCHN) ++ xenbus_free_evtchn(pdev->xdev, pdev->evtchn); ++ ++ if (pdev->gnt_ref != INVALID_GRANT_REF) ++ gnttab_end_foreign_access(pdev->gnt_ref, 0 /* r/w page */, ++ (unsigned long)pdev->sh_info); ++ ++ dev_set_drvdata(&pdev->xdev->dev, NULL); ++ kfree(pdev); ++} ++ ++static int pcifront_publish_info(struct pcifront_device *pdev) ++{ ++ int err = 0; ++ struct xenbus_transaction trans; ++ ++ err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info)); ++ if (err < 0) ++ goto out; ++ ++ pdev->gnt_ref = err; ++ ++ err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn); ++ if (err) ++ goto out; ++ ++ err = bind_evtchn_to_irqhandler(pdev->evtchn, pcifront_handler_aer, ++ 0, "pcifront", pdev); ++ if (err < 0) { ++ xenbus_free_evtchn(pdev->xdev, pdev->evtchn); ++ xenbus_dev_fatal(pdev->xdev, err, "Failed to bind evtchn to " ++ 
"irqhandler.\n"); ++ return err; ++ } ++ pdev->irq = err; ++ ++do_publish: ++ err = xenbus_transaction_start(&trans); ++ if (err) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error writing configuration for backend " ++ "(start transaction)"); ++ goto out; ++ } ++ ++ err = xenbus_printf(trans, pdev->xdev->nodename, ++ "pci-op-ref", "%u", pdev->gnt_ref); ++ if (!err) ++ err = xenbus_printf(trans, pdev->xdev->nodename, ++ "event-channel", "%u", pdev->evtchn); ++ if (!err) ++ err = xenbus_printf(trans, pdev->xdev->nodename, ++ "magic", XEN_PCI_MAGIC); ++ ++ if (err) { ++ xenbus_transaction_end(trans, 1); ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error writing configuration for backend"); ++ goto out; ++ } else { ++ err = xenbus_transaction_end(trans, 0); ++ if (err == -EAGAIN) ++ goto do_publish; ++ else if (err) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error completing transaction " ++ "for backend"); ++ goto out; ++ } ++ } ++ ++ xenbus_switch_state(pdev->xdev, XenbusStateInitialised); ++ ++ dev_dbg(&pdev->xdev->dev, "publishing successful!\n"); ++ ++out: ++ return err; ++} ++ ++static int __devinit pcifront_try_connect(struct pcifront_device *pdev) ++{ ++ int err = -EFAULT; ++ int i, num_roots, len; ++ char str[64]; ++ unsigned int domain, bus; ++ ++ ++ /* Only connect once */ ++ if (xenbus_read_driver_state(pdev->xdev->nodename) != ++ XenbusStateInitialised) ++ goto out; ++ ++ err = pcifront_connect(pdev); ++ if (err) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error connecting PCI Frontend"); ++ goto out; ++ } ++ ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, ++ "root_num", "%d", &num_roots); ++ if (err == -ENOENT) { ++ xenbus_dev_error(pdev->xdev, err, ++ "No PCI Roots found, trying 0000:00"); ++ err = pcifront_scan_root(pdev, 0, 0); ++ num_roots = 0; ++ } else if (err != 1) { ++ if (err == 0) ++ err = -EINVAL; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error reading number of PCI roots"); ++ goto out; ++ } ++ ++ for (i = 0; i < num_roots; i++) { ++ len = snprintf(str, sizeof(str), "root-%d", i); ++ if (unlikely(len >= (sizeof(str) - 1))) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str, ++ "%x:%x", &domain, &bus); ++ if (err != 2) { ++ if (err >= 0) ++ err = -EINVAL; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error reading PCI root %d", i); ++ goto out; ++ } ++ ++ err = pcifront_scan_root(pdev, domain, bus); ++ if (err) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error scanning PCI root %04x:%02x", ++ domain, bus); ++ goto out; ++ } ++ } ++ ++ err = xenbus_switch_state(pdev->xdev, XenbusStateConnected); ++ ++out: ++ return err; ++} ++ ++static int pcifront_try_disconnect(struct pcifront_device *pdev) ++{ ++ int err = 0; ++ enum xenbus_state prev_state; ++ ++ ++ prev_state = xenbus_read_driver_state(pdev->xdev->nodename); ++ ++ if (prev_state >= XenbusStateClosing) ++ goto out; ++ ++ if (prev_state == XenbusStateConnected) { ++ pcifront_free_roots(pdev); ++ pcifront_disconnect(pdev); ++ } ++ ++ err = xenbus_switch_state(pdev->xdev, XenbusStateClosed); ++ ++out: ++ ++ return err; ++} ++ ++static int __devinit pcifront_attach_devices(struct pcifront_device *pdev) ++{ ++ int err = -EFAULT; ++ int i, num_roots, len; ++ unsigned int domain, bus; ++ char str[64]; ++ ++ if (xenbus_read_driver_state(pdev->xdev->nodename) != ++ XenbusStateReconfiguring) ++ goto out; ++ ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, ++ "root_num", "%d", &num_roots); ++ if (err == -ENOENT) { ++ xenbus_dev_error(pdev->xdev, err, ++ "No PCI Roots found, trying 
0000:00"); ++ err = pcifront_rescan_root(pdev, 0, 0); ++ num_roots = 0; ++ } else if (err != 1) { ++ if (err == 0) ++ err = -EINVAL; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error reading number of PCI roots"); ++ goto out; ++ } ++ ++ for (i = 0; i < num_roots; i++) { ++ len = snprintf(str, sizeof(str), "root-%d", i); ++ if (unlikely(len >= (sizeof(str) - 1))) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str, ++ "%x:%x", &domain, &bus); ++ if (err != 2) { ++ if (err >= 0) ++ err = -EINVAL; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error reading PCI root %d", i); ++ goto out; ++ } ++ ++ err = pcifront_rescan_root(pdev, domain, bus); ++ if (err) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error scanning PCI root %04x:%02x", ++ domain, bus); ++ goto out; ++ } ++ } ++ ++ xenbus_switch_state(pdev->xdev, XenbusStateConnected); ++ ++out: ++ return err; ++} ++ ++static int pcifront_detach_devices(struct pcifront_device *pdev) ++{ ++ int err = 0; ++ int i, num_devs; ++ unsigned int domain, bus, slot, func; ++ struct pci_bus *pci_bus; ++ struct pci_dev *pci_dev; ++ char str[64]; ++ ++ if (xenbus_read_driver_state(pdev->xdev->nodename) != ++ XenbusStateConnected) ++ goto out; ++ ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, "num_devs", "%d", ++ &num_devs); ++ if (err != 1) { ++ if (err >= 0) ++ err = -EINVAL; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error reading number of PCI devices"); ++ goto out; ++ } ++ ++ /* Find devices being detached and remove them. */ ++ for (i = 0; i < num_devs; i++) { ++ int l, state; ++ l = snprintf(str, sizeof(str), "state-%d", i); ++ if (unlikely(l >= (sizeof(str) - 1))) { ++ err = -ENOMEM; ++ goto out; ++ } ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str, "%d", ++ &state); ++ if (err != 1) ++ state = XenbusStateUnknown; ++ ++ if (state != XenbusStateClosing) ++ continue; ++ ++ /* Remove device. 
*/ ++ l = snprintf(str, sizeof(str), "vdev-%d", i); ++ if (unlikely(l >= (sizeof(str) - 1))) { ++ err = -ENOMEM; ++ goto out; ++ } ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str, ++ "%x:%x:%x.%x", &domain, &bus, &slot, &func); ++ if (err != 4) { ++ if (err >= 0) ++ err = -EINVAL; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error reading PCI device %d", i); ++ goto out; ++ } ++ ++ pci_bus = pci_find_bus(domain, bus); ++ if (!pci_bus) { ++ dev_dbg(&pdev->xdev->dev, "Cannot get bus %04x:%02x\n", ++ domain, bus); ++ continue; ++ } ++ pci_dev = pci_get_slot(pci_bus, PCI_DEVFN(slot, func)); ++ if (!pci_dev) { ++ dev_dbg(&pdev->xdev->dev, ++ "Cannot get PCI device %04x:%02x:%02x.%02x\n", ++ domain, bus, slot, func); ++ continue; ++ } ++ pci_remove_bus_device(pci_dev); ++ pci_dev_put(pci_dev); ++ ++ dev_dbg(&pdev->xdev->dev, ++ "PCI device %04x:%02x:%02x.%02x removed.\n", ++ domain, bus, slot, func); ++ } ++ ++ err = xenbus_switch_state(pdev->xdev, XenbusStateReconfiguring); ++ ++out: ++ return err; ++} ++ ++static void __init_refok pcifront_backend_changed(struct xenbus_device *xdev, ++ enum xenbus_state be_state) ++{ ++ struct pcifront_device *pdev = dev_get_drvdata(&xdev->dev); ++ ++ switch (be_state) { ++ case XenbusStateUnknown: ++ case XenbusStateInitialising: ++ case XenbusStateInitWait: ++ case XenbusStateInitialised: ++ case XenbusStateClosed: ++ break; ++ ++ case XenbusStateConnected: ++ pcifront_try_connect(pdev); ++ break; ++ ++ case XenbusStateClosing: ++ dev_warn(&xdev->dev, "backend going away!\n"); ++ pcifront_try_disconnect(pdev); ++ break; ++ ++ case XenbusStateReconfiguring: ++ pcifront_detach_devices(pdev); ++ break; ++ ++ case XenbusStateReconfigured: ++ pcifront_attach_devices(pdev); ++ break; ++ } ++} ++ ++static int pcifront_xenbus_probe(struct xenbus_device *xdev, ++ const struct xenbus_device_id *id) ++{ ++ int err = 0; ++ struct pcifront_device *pdev = alloc_pdev(xdev); ++ ++ if (pdev == NULL) { ++ err = -ENOMEM; ++ xenbus_dev_fatal(xdev, err, ++ "Error allocating pcifront_device struct"); ++ goto out; ++ } ++ ++ err = pcifront_publish_info(pdev); ++ ++out: ++ return err; ++} ++ ++static int pcifront_xenbus_remove(struct xenbus_device *xdev) ++{ ++ struct pcifront_device *pdev = dev_get_drvdata(&xdev->dev); ++ ++ if (pdev) ++ free_pdev(pdev); ++ ++ return 0; ++} ++ ++static const struct xenbus_device_id xenpci_ids[] = { ++ {"pci"}, ++ {""}, ++}; ++ ++static struct xenbus_driver xenbus_pcifront_driver = { ++ .name = "pcifront", ++ .owner = THIS_MODULE, ++ .ids = xenpci_ids, ++ .probe = pcifront_xenbus_probe, ++ .remove = pcifront_xenbus_remove, ++ .otherend_changed = pcifront_backend_changed, ++}; ++ ++static int __init pcifront_init(void) ++{ ++ if (!xen_domain()) ++ return -ENODEV; ++ ++ pci_frontend_registrar(1 /* enable */); ++ ++ return xenbus_register_frontend(&xenbus_pcifront_driver); ++} ++ ++static void __exit pcifront_cleanup(void) ++{ ++ xenbus_unregister_driver(&xenbus_pcifront_driver); ++ pci_frontend_registrar(0 /* disable */); ++} ++module_init(pcifront_init); ++module_exit(pcifront_cleanup); ++ ++MODULE_DESCRIPTION("Xen PCI passthrough frontend."); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS("xen:pci"); +diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig +index 188e1ba..efac9e3 100644 +--- a/drivers/video/Kconfig ++++ b/drivers/video/Kconfig +@@ -2063,6 +2063,7 @@ config XEN_FBDEV_FRONTEND + select FB_SYS_IMAGEBLIT + select FB_SYS_FOPS + select FB_DEFERRED_IO ++ select XEN_XENBUS_FRONTEND + default y + help + This driver implements the front-end 
of the Xen virtual +diff --git a/drivers/video/broadsheetfb.c b/drivers/video/broadsheetfb.c +index 509cb92..df9ccb9 100644 +--- a/drivers/video/broadsheetfb.c ++++ b/drivers/video/broadsheetfb.c +@@ -470,7 +470,7 @@ static int __devinit broadsheetfb_probe(struct platform_device *dev) + par->read_reg = broadsheet_read_reg; + init_waitqueue_head(&par->waitq); + +- info->flags = FBINFO_FLAG_DEFAULT; ++ info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB; + + info->fbdefio = &broadsheetfb_defio; + fb_deferred_io_init(info); +diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c +index c27ab1e..94414fc 100644 +--- a/drivers/video/fb_defio.c ++++ b/drivers/video/fb_defio.c +@@ -144,7 +144,9 @@ static const struct address_space_operations fb_deferred_io_aops = { + static int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma) + { + vma->vm_ops = &fb_deferred_io_vm_ops; +- vma->vm_flags |= ( VM_IO | VM_RESERVED | VM_DONTEXPAND ); ++ vma->vm_flags |= ( VM_RESERVED | VM_DONTEXPAND ); ++ if (!(info->flags & FBINFO_VIRTFB)) ++ vma->vm_flags |= VM_IO; + vma->vm_private_data = info; + return 0; + } +diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c +index 99bbd28..057433a 100644 +--- a/drivers/video/fbmem.c ++++ b/drivers/video/fbmem.c +@@ -1362,6 +1362,7 @@ fb_mmap(struct file *file, struct vm_area_struct * vma) + vma->vm_pgoff = off >> PAGE_SHIFT; + /* This is an IO map - tell maydump to skip this VMA */ + vma->vm_flags |= VM_IO | VM_RESERVED; ++ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); + fb_pgprotect(file, vma, off); + if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, vma->vm_page_prot)) +diff --git a/drivers/video/hecubafb.c b/drivers/video/hecubafb.c +index 0b4bffb..f9d77ad 100644 +--- a/drivers/video/hecubafb.c ++++ b/drivers/video/hecubafb.c +@@ -253,7 +253,7 @@ static int __devinit hecubafb_probe(struct platform_device *dev) + par->send_command = apollo_send_command; + par->send_data = apollo_send_data; + +- info->flags = FBINFO_FLAG_DEFAULT; ++ info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB; + + info->fbdefio = &hecubafb_defio; + fb_deferred_io_init(info); +diff --git a/drivers/video/metronomefb.c b/drivers/video/metronomefb.c +index df1f757..661bfd2 100644 +--- a/drivers/video/metronomefb.c ++++ b/drivers/video/metronomefb.c +@@ -700,7 +700,7 @@ static int __devinit metronomefb_probe(struct platform_device *dev) + if (retval < 0) + goto err_free_irq; + +- info->flags = FBINFO_FLAG_DEFAULT; ++ info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB; + + info->fbdefio = &metronomefb_defio; + fb_deferred_io_init(info); +diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c +index 54cd916..dc72563 100644 +--- a/drivers/video/xen-fbfront.c ++++ b/drivers/video/xen-fbfront.c +@@ -25,7 +25,10 @@ + #include <linux/module.h> + #include <linux/vmalloc.h> + #include <linux/mm.h> ++ + #include <asm/xen/hypervisor.h> ++ ++#include <xen/xen.h> + #include <xen/events.h> + #include <xen/page.h> + #include <xen/interface/io/fbif.h> +@@ -440,7 +443,7 @@ static int __devinit xenfb_probe(struct xenbus_device *dev, + fb_info->fix.type = FB_TYPE_PACKED_PIXELS; + fb_info->fix.accel = FB_ACCEL_NONE; + +- fb_info->flags = FBINFO_FLAG_DEFAULT; ++ fb_info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB; + + ret = fb_alloc_cmap(&fb_info->cmap, 256, 0); + if (ret < 0) { +@@ -627,6 +630,8 @@ static void xenfb_backend_changed(struct xenbus_device *dev, + switch (backend_state) { + case XenbusStateInitialising: + case 
XenbusStateInitialised: ++ case XenbusStateReconfiguring: ++ case XenbusStateReconfigured: + case XenbusStateUnknown: + case XenbusStateClosed: + break; +@@ -680,7 +685,7 @@ static struct xenbus_driver xenfb_driver = { + + static int __init xenfb_init(void) + { +- if (!xen_domain()) ++ if (!xen_domain() || xen_hvm_domain()) + return -ENODEV; + + /* Nothing to do if running in dom0. */ +diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig +index 3711b88..4fcb4c5 100644 +--- a/drivers/watchdog/Kconfig ++++ b/drivers/watchdog/Kconfig +@@ -975,6 +975,16 @@ config WATCHDOG_RIO + + # XTENSA Architecture + ++# Xen Architecture ++ ++config XEN_WDT ++ tristate "Xen Watchdog support" ++ depends on XEN ++ help ++ Say Y here to support the hypervisor watchdog capability provided ++ by Xen 4.0 and newer. The watchdog timeout period is normally one ++ minute but can be changed with a boot-time parameter. ++ + # + # ISA-based Watchdog Cards + # +diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile +index 699199b..2f6739a 100644 +--- a/drivers/watchdog/Makefile ++++ b/drivers/watchdog/Makefile +@@ -141,6 +141,9 @@ obj-$(CONFIG_WATCHDOG_CP1XXX) += cpwd.o + + # XTENSA Architecture + ++# Xen ++obj-$(CONFIG_XEN_WDT) += xen_wdt.o ++ + # Architecture Independant + obj-$(CONFIG_WM831X_WATCHDOG) += wm831x_wdt.o + obj-$(CONFIG_WM8350_WATCHDOG) += wm8350_wdt.o +diff --git a/drivers/watchdog/xen_wdt.c b/drivers/watchdog/xen_wdt.c +new file mode 100644 +index 0000000..bcfaafb +--- /dev/null ++++ b/drivers/watchdog/xen_wdt.c +@@ -0,0 +1,359 @@ ++/* ++ * Xen Watchdog Driver ++ * ++ * (c) Copyright 2010 Novell, Inc. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. 
++ */ ++ ++#define DRV_NAME "wdt" ++#define DRV_VERSION "0.01" ++#define PFX DRV_NAME ": " ++ ++#include <linux/bug.h> ++#include <linux/errno.h> ++#include <linux/fs.h> ++#include <linux/hrtimer.h> ++#include <linux/kernel.h> ++#include <linux/ktime.h> ++#include <linux/init.h> ++#include <linux/miscdevice.h> ++#include <linux/module.h> ++#include <linux/moduleparam.h> ++#include <linux/platform_device.h> ++#include <linux/spinlock.h> ++#include <linux/uaccess.h> ++#include <linux/watchdog.h> ++#include <xen/xen.h> ++#include <asm/xen/hypercall.h> ++#include <xen/interface/sched.h> ++ ++static struct platform_device *platform_device; ++static DEFINE_SPINLOCK(wdt_lock); ++static struct sched_watchdog wdt; ++static __kernel_time_t wdt_expires; ++static bool is_active, expect_release; ++ ++#define WATCHDOG_TIMEOUT 60 /* in seconds */ ++static unsigned int timeout = WATCHDOG_TIMEOUT; ++module_param(timeout, uint, S_IRUGO); ++MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds " ++ "(default=" __MODULE_STRING(WATCHDOG_TIMEOUT) ")"); ++ ++static bool nowayout = WATCHDOG_NOWAYOUT; ++module_param(nowayout, bool, S_IRUGO); ++MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started " ++ "(default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); ++ ++static inline __kernel_time_t set_timeout(void) ++{ ++ wdt.timeout = timeout; ++ return ktime_to_timespec(ktime_get()).tv_sec + timeout; ++} ++ ++static int xen_wdt_start(void) ++{ ++ __kernel_time_t expires; ++ int err; ++ ++ spin_lock(&wdt_lock); ++ ++ expires = set_timeout(); ++ if (!wdt.id) ++ err = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wdt); ++ else ++ err = -EBUSY; ++ if (err > 0) { ++ wdt.id = err; ++ wdt_expires = expires; ++ err = 0; ++ } else ++ BUG_ON(!err); ++ ++ spin_unlock(&wdt_lock); ++ ++ return err; ++} ++ ++static int xen_wdt_stop(void) ++{ ++ int err = 0; ++ ++ spin_lock(&wdt_lock); ++ ++ wdt.timeout = 0; ++ if (wdt.id) ++ err = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wdt); ++ if (!err) ++ wdt.id = 0; ++ ++ spin_unlock(&wdt_lock); ++ ++ return err; ++} ++ ++static int xen_wdt_kick(void) ++{ ++ __kernel_time_t expires; ++ int err; ++ ++ spin_lock(&wdt_lock); ++ ++ expires = set_timeout(); ++ if (wdt.id) ++ err = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wdt); ++ else ++ err = -ENXIO; ++ if (!err) ++ wdt_expires = expires; ++ ++ spin_unlock(&wdt_lock); ++ ++ return err; ++} ++ ++static int xen_wdt_open(struct inode *inode, struct file *file) ++{ ++ int err; ++ ++ /* /dev/watchdog can only be opened once */ ++ if (xchg(&is_active, true)) ++ return -EBUSY; ++ ++ err = xen_wdt_start(); ++ if (err == -EBUSY) ++ err = xen_wdt_kick(); ++ return err ?: nonseekable_open(inode, file); ++} ++ ++static int xen_wdt_release(struct inode *inode, struct file *file) ++{ ++ if (expect_release) ++ xen_wdt_stop(); ++ else { ++ printk(KERN_CRIT PFX ++ "unexpected close, not stopping watchdog!\n"); ++ xen_wdt_kick(); ++ } ++ is_active = false; ++ expect_release = false; ++ return 0; ++} ++ ++static ssize_t xen_wdt_write(struct file *file, const char __user *data, ++ size_t len, loff_t *ppos) ++{ ++ /* See if we got the magic character 'V' and reload the timer */ ++ if (len) { ++ if (!nowayout) { ++ size_t i; ++ ++ /* in case it was set long ago */ ++ expect_release = false; ++ ++ /* scan to see whether or not we got the magic ++ character */ ++ for (i = 0; i != len; i++) { ++ char c; ++ if (get_user(c, data + i)) ++ return -EFAULT; ++ if (c == 'V') ++ expect_release = true; ++ } ++ } ++ ++ /* someone wrote to us, we should reload the timer */ ++ 
xen_wdt_kick(); ++ } ++ return len; ++} ++ ++static long xen_wdt_ioctl(struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ int new_options, retval = -EINVAL; ++ int new_timeout; ++ int __user *argp = (void __user *)arg; ++ static const struct watchdog_info ident = { ++ .options = WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE, ++ .firmware_version = 0, ++ .identity = DRV_NAME, ++ }; ++ ++ switch (cmd) { ++ case WDIOC_GETSUPPORT: ++ return copy_to_user(argp, &ident, sizeof(ident)) ? -EFAULT : 0; ++ ++ case WDIOC_GETSTATUS: ++ case WDIOC_GETBOOTSTATUS: ++ return put_user(0, argp); ++ ++ case WDIOC_SETOPTIONS: ++ if (get_user(new_options, argp)) ++ return -EFAULT; ++ ++ if (new_options & WDIOS_DISABLECARD) ++ retval = xen_wdt_stop(); ++ if (new_options & WDIOS_ENABLECARD) { ++ retval = xen_wdt_start(); ++ if (retval == -EBUSY) ++ retval = xen_wdt_kick(); ++ } ++ return retval; ++ ++ case WDIOC_KEEPALIVE: ++ xen_wdt_kick(); ++ return 0; ++ ++ case WDIOC_SETTIMEOUT: ++ if (get_user(new_timeout, argp)) ++ return -EFAULT; ++ if (!new_timeout) ++ return -EINVAL; ++ timeout = new_timeout; ++ xen_wdt_kick(); ++ /* fall through */ ++ case WDIOC_GETTIMEOUT: ++ return put_user(timeout, argp); ++ ++ case WDIOC_GETTIMELEFT: ++ retval = wdt_expires - ktime_to_timespec(ktime_get()).tv_sec; ++ return put_user(retval, argp); ++ } ++ ++ return -ENOTTY; ++} ++ ++static const struct file_operations xen_wdt_fops = { ++ .owner = THIS_MODULE, ++ .llseek = no_llseek, ++ .write = xen_wdt_write, ++ .unlocked_ioctl = xen_wdt_ioctl, ++ .open = xen_wdt_open, ++ .release = xen_wdt_release, ++}; ++ ++static struct miscdevice xen_wdt_miscdev = { ++ .minor = WATCHDOG_MINOR, ++ .name = "watchdog", ++ .fops = &xen_wdt_fops, ++}; ++ ++static int __devinit xen_wdt_probe(struct platform_device *dev) ++{ ++ struct sched_watchdog wd = { .id = ~0 }; ++ int ret = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wd); ++ ++ switch (ret) { ++ case -EINVAL: ++ if (!timeout) { ++ timeout = WATCHDOG_TIMEOUT; ++ printk(KERN_INFO PFX ++ "timeout value invalid, using %d\n", timeout); ++ } ++ ++ ret = misc_register(&xen_wdt_miscdev); ++ if (ret) { ++ printk(KERN_ERR PFX ++ "cannot register miscdev on minor=%d (%d)\n", ++ WATCHDOG_MINOR, ret); ++ break; ++ } ++ ++ printk(KERN_INFO PFX ++ "initialized (timeout=%ds, nowayout=%d)\n", ++ timeout, nowayout); ++ break; ++ ++ case -ENOSYS: ++ printk(KERN_INFO PFX "not supported\n"); ++ ret = -ENODEV; ++ break; ++ ++ default: ++ printk(KERN_INFO PFX "bogus return value %d\n", ret); ++ break; ++ } ++ ++ return ret; ++} ++ ++static int __devexit xen_wdt_remove(struct platform_device *dev) ++{ ++ /* Stop the timer before we leave */ ++ if (!nowayout) ++ xen_wdt_stop(); ++ ++ misc_deregister(&xen_wdt_miscdev); ++ ++ return 0; ++} ++ ++static void xen_wdt_shutdown(struct platform_device *dev) ++{ ++ xen_wdt_stop(); ++} ++ ++static int xen_wdt_suspend(struct platform_device *dev, pm_message_t state) ++{ ++ return xen_wdt_stop(); ++} ++ ++static int xen_wdt_resume(struct platform_device *dev) ++{ ++ return xen_wdt_start(); ++} ++ ++static struct platform_driver xen_wdt_driver = { ++ .probe = xen_wdt_probe, ++ .remove = __devexit_p(xen_wdt_remove), ++ .shutdown = xen_wdt_shutdown, ++ .suspend = xen_wdt_suspend, ++ .resume = xen_wdt_resume, ++ .driver = { ++ .owner = THIS_MODULE, ++ .name = DRV_NAME, ++ }, ++}; ++ ++static int __init xen_wdt_init_module(void) ++{ ++ int err; ++ ++ if (!xen_domain()) ++ return -ENODEV; ++ ++ printk(KERN_INFO PFX "Xen WatchDog Timer Driver v%s\n", DRV_VERSION); ++ ++ err = 
platform_driver_register(&xen_wdt_driver); ++ if (err) ++ return err; ++ ++ platform_device = platform_device_register_simple(DRV_NAME, ++ -1, NULL, 0); ++ if (IS_ERR(platform_device)) { ++ err = PTR_ERR(platform_device); ++ platform_driver_unregister(&xen_wdt_driver); ++ } ++ ++ return err; ++} ++ ++static void __exit xen_wdt_cleanup_module(void) ++{ ++ platform_device_unregister(platform_device); ++ platform_driver_unregister(&xen_wdt_driver); ++ printk(KERN_INFO PFX "module unloaded\n"); ++} ++ ++module_init(xen_wdt_init_module); ++module_exit(xen_wdt_cleanup_module); ++ ++MODULE_AUTHOR("Jen Beulich <jbeulich@novell.com>"); ++MODULE_DESCRIPTION("Xen WatchDog Timer Driver"); ++MODULE_VERSION(DRV_VERSION); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR); +diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig +index cab100a..fa9982e 100644 +--- a/drivers/xen/Kconfig ++++ b/drivers/xen/Kconfig +@@ -28,6 +28,110 @@ config XEN_DEV_EVTCHN + firing. + If in doubt, say yes. + ++config XEN_BACKEND ++ bool "Backend driver support" ++ depends on XEN_DOM0 ++ default y ++ help ++ Support for backend device drivers that provide I/O services ++ to other virtual machines. ++ ++config XEN_NETDEV_BACKEND ++ tristate "Xen backend network device" ++ depends on XEN_BACKEND && NET ++ help ++ Implement the network backend driver, which passes packets ++ from the guest domain's frontend drivers to the network. ++ ++config XEN_BLKDEV_BACKEND ++ tristate "Block-device backend driver" ++ depends on XEN_BACKEND && BLOCK ++ help ++ The block-device backend driver allows the kernel to export its ++ block devices to other guests via a high-performance shared-memory ++ interface. ++ ++ ++config XEN_BLKDEV_TAP ++ tristate "Block-device tap backend driver" ++ depends on XEN_BACKEND && BLOCK ++ help ++ The block tap driver is an alternative to the block back driver ++ and allows VM block requests to be redirected to userspace through ++ a device interface. The tap allows user-space development of ++ high-performance block backends, where disk images may be implemented ++ as files, in memory, or on other hosts across the network. This ++ driver can safely coexist with the existing blockback driver. ++ ++config XEN_BLKBACK_PAGEMAP ++ tristate ++ depends on XEN_BLKDEV_BACKEND != n && XEN_BLKDEV_TAP != n ++ default XEN_BLKDEV_BACKEND || XEN_BLKDEV_TAP ++ ++config XEN_PCIDEV_BACKEND ++ tristate "PCI-device backend driver" ++ depends on PCI && XEN_BACKEND ++ default XEN_BACKEND ++ help ++ The PCI device backend driver allows the kernel to export arbitrary ++ PCI devices to other guests. If you select this to be a module, you ++ will need to make sure no other driver has bound to the device(s) ++ you want to make visible to other guests. ++ ++choice ++ prompt "PCI Backend Mode" ++ depends on XEN_PCIDEV_BACKEND ++ default XEN_PCIDEV_BACKEND_VPCI if !IA64 ++ default XEN_PCIDEV_BACKEND_CONTROLLER if IA64 ++ ++config XEN_PCIDEV_BACKEND_VPCI ++ bool "Virtual PCI" ++ ---help--- ++ This PCI Backend hides the true PCI topology and makes the frontend ++ think there is a single PCI bus with only the exported devices on it. ++ For example, a device at 03:05.0 will be re-assigned to 00:00.0. A ++ second device at 02:1a.1 will be re-assigned to 00:01.1. ++ ++config XEN_PCIDEV_BACKEND_PASS ++ bool "Passthrough" ++ ---help--- ++ This PCI Backend provides a real view of the PCI topology to the ++ frontend (for example, a device at 06:01.b will still appear at ++ 06:01.b to the frontend). 
This is similar to how Xen 2.0.x exposed ++ PCI devices to its driver domains. This may be required for drivers ++ which depend on finding their hardward in certain bus/slot ++ locations. ++ ++config XEN_PCIDEV_BACKEND_SLOT ++ bool "Slot" ++ ---help--- ++ This PCI Backend hides the true PCI topology and makes the frontend ++ think there is a single PCI bus with only the exported devices on it. ++ Contrary to the virtual PCI backend, a function becomes a new slot. ++ For example, a device at 03:05.2 will be re-assigned to 00:00.0. A ++ second device at 02:1a.1 will be re-assigned to 00:01.0. ++ ++config XEN_PCIDEV_BACKEND_CONTROLLER ++ bool "Controller" ++ depends on IA64 ++ ---help--- ++ This PCI backend virtualizes the PCI bus topology by providing a ++ virtual bus per PCI root device. Devices which are physically under ++ the same root bus will appear on the same virtual bus. For systems ++ with complex I/O addressing, this is the only backend which supports ++ extended I/O port spaces and MMIO translation offsets. This backend ++ also supports slot virtualization. For example, a device at ++ 0000:01:02.1 will be re-assigned to 0000:00:00.0. A second device ++ at 0000:02:05.0 (behind a P2P bridge on bus 0000:01) will be ++ re-assigned to 0000:00:01.0. A third device at 0000:16:05.0 (under ++ a different PCI root bus) will be re-assigned to 0000:01:00.0. ++ ++endchoice ++ ++config XEN_PCIDEV_BE_DEBUG ++ bool "PCI Backend Debugging" ++ depends on XEN_PCIDEV_BACKEND ++ + config XENFS + tristate "Xen filesystem" + depends on XEN +@@ -60,4 +164,37 @@ config XEN_SYS_HYPERVISOR + Create entries under /sys/hypervisor describing the Xen + hypervisor environment. When running native or in another + virtual environment, /sys/hypervisor will still be present, +- but will have no xen contents. +\ No newline at end of file ++ but will have no xen contents. ++ ++config XEN_MCE ++ def_bool y ++ depends on XEN_DOM0 && X86_64 && X86_MCE_INTEL ++ ++config XEN_XENBUS_FRONTEND ++ tristate ++ ++config XEN_GNTDEV ++ tristate "userspace grant access device driver" ++ depends on XEN ++ select MMU_NOTIFIER ++ help ++ Allows userspace processes use grants. ++ ++config XEN_S3 ++ def_bool y ++ depends on XEN_DOM0 && ACPI ++ ++config ACPI_PROCESSOR_XEN ++ tristate ++ depends on XEN_DOM0 && ACPI_PROCESSOR && CPU_FREQ ++ default y ++ ++config XEN_PLATFORM_PCI ++ tristate "xen platform pci device driver" ++ depends on XEN_PVHVM ++ default m ++ help ++ Driver for the Xen PCI Platform device: it is responsible for ++ initializing xenbus and grant_table when running in a Xen HVM ++ domain. As a consequence this driver is required to run any Xen PV ++ frontend on Xen HVM. 
+diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile +index 7c28434..ef1ea63 100644 +--- a/drivers/xen/Makefile ++++ b/drivers/xen/Makefile +@@ -1,12 +1,27 @@ +-obj-y += grant-table.o features.o events.o manage.o ++obj-y += grant-table.o features.o events.o manage.o biomerge.o pcpu.o + obj-y += xenbus/ + + nostackp := $(call cc-option, -fno-stack-protector) + CFLAGS_features.o := $(nostackp) + +-obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o +-obj-$(CONFIG_XEN_XENCOMM) += xencomm.o +-obj-$(CONFIG_XEN_BALLOON) += balloon.o +-obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o +-obj-$(CONFIG_XENFS) += xenfs/ +-obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o +\ No newline at end of file ++obj-$(CONFIG_PCI) += pci.o ++obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o ++obj-$(CONFIG_XEN_XENCOMM) += xencomm.o ++obj-$(CONFIG_XEN_BALLOON) += balloon.o ++obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o ++obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o ++obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback/ ++obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/ ++obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/ ++obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/ ++obj-$(CONFIG_XENFS) += xenfs/ ++obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o ++obj-$(CONFIG_XEN_MCE) += mce.o ++ ++obj-$(CONFIG_XEN_S3) += acpi.o ++obj-$(CONFIG_ACPI_PROCESSOR_XEN) += acpi_processor.o ++obj-$(CONFIG_ACPI_HOTPLUG_MEMORY) += xen_acpi_memhotplug.o ++obj-$(CONFIG_XEN_PLATFORM_PCI) += platform-pci.o ++ ++xen-evtchn-y := evtchn.o ++xen-gntdev-y := gntdev.o +diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c +new file mode 100644 +index 0000000..e6d3d0e +--- /dev/null ++++ b/drivers/xen/acpi.c +@@ -0,0 +1,23 @@ ++#include <xen/acpi.h> ++ ++#include <xen/interface/platform.h> ++#include <asm/xen/hypercall.h> ++#include <asm/xen/hypervisor.h> ++ ++int acpi_notify_hypervisor_state(u8 sleep_state, ++ u32 pm1a_cnt, u32 pm1b_cnt) ++{ ++ struct xen_platform_op op = { ++ .cmd = XENPF_enter_acpi_sleep, ++ .interface_version = XENPF_INTERFACE_VERSION, ++ .u = { ++ .enter_acpi_sleep = { ++ .pm1a_cnt_val = (u16)pm1a_cnt, ++ .pm1b_cnt_val = (u16)pm1b_cnt, ++ .sleep_state = sleep_state, ++ }, ++ }, ++ }; ++ ++ return HYPERVISOR_dom0_op(&op); ++} +diff --git a/drivers/xen/acpi_processor.c b/drivers/xen/acpi_processor.c +new file mode 100644 +index 0000000..e83b615 +--- /dev/null ++++ b/drivers/xen/acpi_processor.c +@@ -0,0 +1,417 @@ ++/* ++ * acpi_processor.c - interface to notify Xen on acpi processor object ++ * info parsing ++ * ++ * Copyright (C) 2008, Intel corporation ++ * ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or (at ++ * your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with this program; if not, write to the Free Software Foundation, Inc., ++ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
++ * ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/init.h> ++#include <linux/types.h> ++#include <linux/acpi.h> ++#include <linux/pm.h> ++#include <linux/cpu.h> ++ ++#include <linux/cpufreq.h> ++#include <acpi/processor.h> ++#include <xen/acpi.h> ++#include <xen/pcpu.h> ++ ++#include <asm/xen/hypercall.h> ++#include <asm/xen/hypervisor.h> ++ ++static int xen_hotplug_notifier(struct acpi_processor *pr, int event); ++ ++static struct processor_cntl_xen_ops xen_ops = { ++ .hotplug = xen_hotplug_notifier, ++}; ++ ++static struct acpi_power_register *power_registers[XEN_MAX_ACPI_ID + 1]; ++ ++int processor_cntl_xen_power_cache(int cpu, int cx, ++ struct acpi_power_register *reg) ++{ ++ struct acpi_power_register *buf; ++ ++ if (cpu < 0 || cpu > XEN_MAX_ACPI_ID || ++ cx < 1 || cx > ACPI_PROCESSOR_MAX_POWER) { ++ return -EINVAL; ++ } ++ ++ if (power_registers[cpu] == NULL) { ++ buf = kzalloc(ACPI_PROCESSOR_MAX_POWER * ++ sizeof(struct xen_processor_cx), GFP_KERNEL); ++ if (buf == NULL) ++ return -ENOMEM; ++ ++ power_registers[cpu] = buf; ++ } ++ ++ memcpy(power_registers[cpu]+cx-1, reg, sizeof(*reg)); ++ ++ return 0; ++} ++EXPORT_SYMBOL(processor_cntl_xen_power_cache); ++ ++#ifdef CONFIG_ACPI_HOTPLUG_CPU ++static int xen_get_apic_id(acpi_handle handle) ++{ ++ struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; ++ union acpi_object *obj; ++ struct acpi_madt_local_apic *lapic; ++ u8 physid; ++ ++ if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) ++ return -EINVAL; ++ ++ if (!buffer.length || !buffer.pointer) ++ return -EINVAL; ++ ++ obj = buffer.pointer; ++ if (obj->type != ACPI_TYPE_BUFFER || ++ obj->buffer.length < sizeof(*lapic)) { ++ kfree(buffer.pointer); ++ return -EINVAL; ++ } ++ ++ lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer; ++ ++ if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC || ++ !(lapic->lapic_flags & ACPI_MADT_ENABLED)) { ++ kfree(buffer.pointer); ++ return -EINVAL; ++ } ++ ++ physid = lapic->id; ++ kfree(buffer.pointer); ++ buffer.length = ACPI_ALLOCATE_BUFFER; ++ buffer.pointer = NULL; ++ ++ return physid; ++} ++#else ++static int xen_get_apic_id(acpi_handle handle) ++{ ++ return -1; ++} ++#endif ++ ++int processor_cntl_xen_notify(struct acpi_processor *pr, int event, int type) ++{ ++ int ret = -EINVAL; ++ ++ switch (event) { ++ case PROCESSOR_PM_INIT: ++ case PROCESSOR_PM_CHANGE: ++ if ((type >= PM_TYPE_MAX) || ++ !xen_ops.pm_ops[type]) ++ break; ++ ++ ret = xen_ops.pm_ops[type](pr, event); ++ break; ++ case PROCESSOR_HOTPLUG: ++ { ++ int apic_id; ++ ++ apic_id = xen_get_apic_id(pr->handle); ++ if (apic_id < 0) ++ break; ++ if (xen_ops.hotplug) ++ ret = xen_ops.hotplug(pr, type); ++ xen_pcpu_hotplug(type, apic_id); ++ break; ++ } ++ default: ++ printk(KERN_ERR "Unsupport processor events %d.\n", event); ++ break; ++ } ++ ++ return ret; ++} ++EXPORT_SYMBOL(processor_cntl_xen_notify); ++ ++static inline void xen_convert_pct_reg(struct xen_pct_register *xpct, ++ struct acpi_pct_register *apct) ++{ ++ xpct->descriptor = apct->descriptor; ++ xpct->length = apct->length; ++ xpct->space_id = apct->space_id; ++ xpct->bit_width = apct->bit_width; ++ xpct->bit_offset = apct->bit_offset; ++ xpct->reserved = apct->reserved; ++ xpct->address = apct->address; ++} ++ ++static inline void xen_convert_pss_states(struct xen_processor_px *xpss, ++ struct acpi_processor_px *apss, int state_count) ++{ ++ int i; ++ for (i = 0; i < state_count; i++) { ++ xpss->core_frequency = apss->core_frequency; ++ xpss->power = apss->power; ++ 
xpss->transition_latency = apss->transition_latency; ++ xpss->bus_master_latency = apss->bus_master_latency; ++ xpss->control = apss->control; ++ xpss->status = apss->status; ++ xpss++; ++ apss++; ++ } ++} ++ ++static inline void xen_convert_psd_pack(struct xen_psd_package *xpsd, ++ struct acpi_psd_package *apsd) ++{ ++ xpsd->num_entries = apsd->num_entries; ++ xpsd->revision = apsd->revision; ++ xpsd->domain = apsd->domain; ++ xpsd->coord_type = apsd->coord_type; ++ xpsd->num_processors = apsd->num_processors; ++} ++ ++static int xen_cx_notifier(struct acpi_processor *pr, int action) ++{ ++ int ret, count = 0, i; ++ xen_platform_op_t op = { ++ .cmd = XENPF_set_processor_pminfo, ++ .interface_version = XENPF_INTERFACE_VERSION, ++ .u.set_pminfo.id = pr->acpi_id, ++ .u.set_pminfo.type = XEN_PM_CX, ++ }; ++ struct xen_processor_cx *data, *buf; ++ struct acpi_processor_cx *cx; ++ struct acpi_power_register *reg; ++ ++ if (action == PROCESSOR_PM_CHANGE) ++ return -EINVAL; ++ ++ if (power_registers[pr->acpi_id] == NULL) { ++ printk(KERN_WARNING "No C state info for acpi processor %d\n", ++ pr->acpi_id); ++ return -EINVAL; ++ } ++ ++ /* Convert to Xen defined structure and hypercall */ ++ buf = kzalloc(pr->power.count * sizeof(struct xen_processor_cx), ++ GFP_KERNEL); ++ if (!buf) ++ return -ENOMEM; ++ ++ data = buf; ++ for (i = 1; i <= pr->power.count; i++) { ++ cx = &pr->power.states[i]; ++ reg = power_registers[pr->acpi_id]+i-1; ++ /* Skip invalid cstate entry */ ++ if (!cx->valid) ++ continue; ++ ++ data->type = cx->type; ++ data->latency = cx->latency; ++ data->power = cx->power; ++ data->reg.space_id = reg->space_id; ++ data->reg.bit_width = reg->bit_width; ++ data->reg.bit_offset = reg->bit_offset; ++ data->reg.access_size = reg->access_size; ++ data->reg.address = reg->address; ++ ++ /* Get dependency relationships, _CSD is not supported yet */ ++ data->dpcnt = 0; ++ set_xen_guest_handle(data->dp, NULL); ++ ++ data++; ++ count++; ++ } ++ ++ if (!count) { ++ printk(KERN_ERR "No available Cx info for cpu %d\n", ++ pr->acpi_id); ++ kfree(buf); ++ return -EINVAL; ++ } ++ ++ op.u.set_pminfo.power.count = count; ++ op.u.set_pminfo.power.flags.bm_control = pr->flags.bm_control; ++ op.u.set_pminfo.power.flags.bm_check = pr->flags.bm_check; ++ op.u.set_pminfo.power.flags.has_cst = pr->flags.has_cst; ++ op.u.set_pminfo.power.flags.power_setup_done = ++ pr->flags.power_setup_done; ++ ++ set_xen_guest_handle(op.u.set_pminfo.power.states, buf); ++ ret = HYPERVISOR_dom0_op(&op); ++ kfree(buf); ++ return ret; ++} ++ ++static int xen_px_notifier(struct acpi_processor *pr, int action) ++{ ++ int ret = -EINVAL; ++ xen_platform_op_t op = { ++ .cmd = XENPF_set_processor_pminfo, ++ .interface_version = XENPF_INTERFACE_VERSION, ++ .u.set_pminfo.id = pr->acpi_id, ++ .u.set_pminfo.type = XEN_PM_PX, ++ }; ++ struct xen_processor_performance *perf; ++ struct xen_processor_px *states = NULL; ++ struct acpi_processor_performance *px; ++ struct acpi_psd_package *pdomain; ++ ++ if (!pr) ++ return -EINVAL; ++ ++ perf = &op.u.set_pminfo.perf; ++ px = pr->performance; ++ ++ switch (action) { ++ case PROCESSOR_PM_CHANGE: ++ /* ppc dynamic handle */ ++ perf->flags = XEN_PX_PPC; ++ perf->platform_limit = pr->performance_platform_limit; ++ ++ ret = HYPERVISOR_dom0_op(&op); ++ break; ++ ++ case PROCESSOR_PM_INIT: ++ /* px normal init */ ++ perf->flags = XEN_PX_PPC | ++ XEN_PX_PCT | ++ XEN_PX_PSS | ++ XEN_PX_PSD; ++ ++ /* ppc */ ++ perf->platform_limit = pr->performance_platform_limit; ++ ++ /* pct */ ++ 
xen_convert_pct_reg(&perf->control_register, ++ &px->control_register); ++ xen_convert_pct_reg(&perf->status_register, ++ &px->status_register); ++ ++ /* pss */ ++ perf->state_count = px->state_count; ++ states = kzalloc(px->state_count*sizeof(xen_processor_px_t), ++ GFP_KERNEL); ++ if (!states) ++ return -ENOMEM; ++ xen_convert_pss_states(states, px->states, px->state_count); ++ set_xen_guest_handle(perf->states, states); ++ ++ /* psd */ ++ pdomain = &px->domain_info; ++ xen_convert_psd_pack(&perf->domain_info, pdomain); ++ if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL) ++ perf->shared_type = CPUFREQ_SHARED_TYPE_ALL; ++ else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY) ++ perf->shared_type = CPUFREQ_SHARED_TYPE_ANY; ++ else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL) ++ perf->shared_type = CPUFREQ_SHARED_TYPE_HW; ++ else { ++ ret = -ENODEV; ++ kfree(states); ++ break; ++ } ++ ++ ret = HYPERVISOR_dom0_op(&op); ++ kfree(states); ++ break; ++ ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++static int xen_tx_notifier(struct acpi_processor *pr, int action) ++{ ++ return -EINVAL; ++} ++ ++#ifdef CONFIG_ACPI_HOTPLUG_CPU ++static int xen_hotplug_notifier(struct acpi_processor *pr, int event) ++{ ++ int ret = -EINVAL; ++ uint32_t apic_id; ++ unsigned long long pxm; ++ acpi_status status = 0; ++ ++ xen_platform_op_t op = { ++ .interface_version = XENPF_INTERFACE_VERSION, ++ }; ++ ++ apic_id = xen_get_apic_id(pr->handle); ++ if (apic_id < 0) { ++ printk(KERN_WARNING "Can't get apic_id for acpi_id %x\n", ++ pr->acpi_id); ++ return -1; ++ } ++ ++ status = acpi_evaluate_integer(pr->handle, "_PXM", ++ NULL, &pxm); ++ if (ACPI_FAILURE(status)) { ++ printk(KERN_WARNING "can't get pxm for acpi_id %x\n", ++ pr->acpi_id); ++ return -1; ++ } ++ ++ switch (event) { ++ case HOTPLUG_TYPE_ADD: ++ op.cmd = XENPF_cpu_hotadd; ++ op.u.cpu_add.apic_id = apic_id; ++ op.u.cpu_add.acpi_id = pr->acpi_id; ++ op.u.cpu_add.pxm = pxm; ++ ret = HYPERVISOR_dom0_op(&op); ++ break; ++ case HOTPLUG_TYPE_REMOVE: ++ printk(KERN_WARNING "Xen not support CPU hotremove\n"); ++ ret = -ENOSYS; ++ break; ++ } ++ ++ return ret; ++} ++#else ++static int xen_hotplug_notifier(struct acpi_processor *pr, int event) ++{ ++ return -ENOSYS; ++} ++#endif ++ ++static int __init xen_acpi_processor_extcntl_init(void) ++{ ++ unsigned int pmbits; ++ ++ /* Only xen dom0 is allowed to handle ACPI processor info */ ++ if (!xen_initial_domain()) ++ return 0; ++ ++ pmbits = (xen_start_info->flags & SIF_PM_MASK) >> 8; ++ ++ if (pmbits & XEN_PROCESSOR_PM_CX) ++ xen_ops.pm_ops[PM_TYPE_IDLE] = xen_cx_notifier; ++ if (pmbits & XEN_PROCESSOR_PM_PX) ++ xen_ops.pm_ops[PM_TYPE_PERF] = xen_px_notifier; ++ if (pmbits & XEN_PROCESSOR_PM_TX) ++ xen_ops.pm_ops[PM_TYPE_THR] = xen_tx_notifier; ++ ++ return 0; ++} ++ ++subsys_initcall(xen_acpi_processor_extcntl_init); ++MODULE_LICENSE("GPL"); +diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c +index 4204336..158cdd1 100644 +--- a/drivers/xen/balloon.c ++++ b/drivers/xen/balloon.c +@@ -43,22 +43,26 @@ + #include <linux/mutex.h> + #include <linux/list.h> + #include <linux/sysdev.h> ++#include <linux/swap.h> + + #include <asm/page.h> + #include <asm/pgalloc.h> + #include <asm/pgtable.h> + #include <asm/uaccess.h> + #include <asm/tlb.h> ++#include <asm/e820.h> + + #include <asm/xen/hypervisor.h> + #include <asm/xen/hypercall.h> ++ ++#include <xen/xen.h> + #include <xen/interface/xen.h> + #include <xen/interface/memory.h> + #include <xen/xenbus.h> + #include <xen/features.h> + #include 
<xen/page.h> + +-#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) ++#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT+balloon_order-10)) + + #define BALLOON_CLASS_NAME "xen_memory" + +@@ -82,14 +86,15 @@ static struct sys_device balloon_sysdev; + + static int register_balloon(struct sys_device *sysdev); + ++static struct balloon_stats balloon_stats; ++ + /* +- * Protects atomic reservation decrease/increase against concurrent increases. +- * Also protects non-atomic updates of current_pages and driver_pages, and +- * balloon lists. ++ * Work in pages of this order. Can be either 0 for normal pages ++ * or 9 for hugepages. + */ +-static DEFINE_SPINLOCK(balloon_lock); +- +-static struct balloon_stats balloon_stats; ++static int balloon_order; ++static unsigned long balloon_npages; ++static unsigned long discontig_frame_list[PAGE_SIZE / sizeof(unsigned long)]; + + /* We increase/decrease in batches which fit in a page */ + static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; +@@ -118,12 +123,43 @@ static struct timer_list balloon_timer; + static void scrub_page(struct page *page) + { + #ifdef CONFIG_XEN_SCRUB_PAGES +- clear_highpage(page); ++ int i; ++ ++ for (i = 0; i < balloon_npages; i++) ++ clear_highpage(page++); + #endif + } + ++static void free_discontig_frame(void) ++{ ++ int rc; ++ struct xen_memory_reservation reservation = { ++ .address_bits = 0, ++ .domid = DOMID_SELF, ++ .nr_extents = balloon_npages, ++ .extent_order = 0 ++ }; ++ ++ set_xen_guest_handle(reservation.extent_start, discontig_frame_list); ++ rc = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); ++ BUG_ON(rc != balloon_npages); ++} ++ ++static unsigned long shrink_frame(unsigned long nr_pages) ++{ ++ unsigned long i, j; ++ ++ for (i = 0, j = 0; i < nr_pages; i++, j++) { ++ if (frame_list[i] == 0) ++ j++; ++ if (i != j) ++ frame_list[i] = frame_list[j]; ++ } ++ return i; ++} ++ + /* balloon_append: add the given page to the balloon. */ +-static void balloon_append(struct page *page) ++static void __balloon_append(struct page *page) + { + /* Lowmem is re-populated first, so highmem pages go at list tail. 
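Editor's note: the hunks above convert the balloon driver from single 4 KiB pages to extents of 2^balloon_order pages (order 9 being one 2 MiB x86 hugepage), so every pages-to-KiB conversion now folds the order in. A minimal userspace sketch of that arithmetic; the PAGES2KB macro and frame_list sizing mirror the patch, while the PAGE_SHIFT value and main() are illustrative scaffolding.

/* Userspace sketch of the balloon_order arithmetic used above. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)

static int balloon_order = 9;   /* 0 = 4 KiB pages, 9 = 2 MiB extents */

#define PAGES2KB(_p) ((_p) << (PAGE_SHIFT + balloon_order - 10))

int main(void)
{
	/* One extent covers 2^balloon_order base pages. */
	unsigned long balloon_npages = 1UL << balloon_order;

	/* The batch array still fits in one page, so one hypercall
	 * handles this many extents. */
	unsigned long batch = PAGE_SIZE / sizeof(unsigned long);

	printf("extent = %lu pages, 1 extent = %lu KiB, batch = %lu extents\n",
	       balloon_npages, (unsigned long)PAGES2KB(1), batch);
	return 0;
}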
*/ + if (PageHighMem(page)) { +@@ -134,7 +170,11 @@ static void balloon_append(struct page *page) + list_add(&page->lru, &ballooned_pages); + balloon_stats.balloon_low++; + } ++} + ++static void balloon_append(struct page *page) ++{ ++ __balloon_append(page); + totalram_pages--; + } + +@@ -195,20 +235,17 @@ static unsigned long current_target(void) + + static int increase_reservation(unsigned long nr_pages) + { +- unsigned long pfn, i, flags; ++ unsigned long pfn, mfn, i, j; + struct page *page; + long rc; + struct xen_memory_reservation reservation = { + .address_bits = 0, +- .extent_order = 0, + .domid = DOMID_SELF + }; + + if (nr_pages > ARRAY_SIZE(frame_list)) + nr_pages = ARRAY_SIZE(frame_list); + +- spin_lock_irqsave(&balloon_lock, flags); +- + page = balloon_first_page(); + for (i = 0; i < nr_pages; i++) { + BUG_ON(page == NULL); +@@ -218,6 +255,8 @@ static int increase_reservation(unsigned long nr_pages) + + set_xen_guest_handle(reservation.extent_start, frame_list); + reservation.nr_extents = nr_pages; ++ reservation.extent_order = balloon_order; ++ + rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); + if (rc < 0) + goto out; +@@ -227,19 +266,22 @@ static int increase_reservation(unsigned long nr_pages) + BUG_ON(page == NULL); + + pfn = page_to_pfn(page); ++ mfn = frame_list[i]; + BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) && + phys_to_machine_mapping_valid(pfn)); + +- set_phys_to_machine(pfn, frame_list[i]); +- +- /* Link back into the page tables if not highmem. */ +- if (pfn < max_low_pfn) { +- int ret; +- ret = HYPERVISOR_update_va_mapping( +- (unsigned long)__va(pfn << PAGE_SHIFT), +- mfn_pte(frame_list[i], PAGE_KERNEL), +- 0); +- BUG_ON(ret); ++ for (j = 0; j < balloon_npages; j++, pfn++, mfn++) { ++ set_phys_to_machine(pfn, mfn); ++ ++ /* Link back into the page tables if not highmem. */ ++ if (pfn < max_low_pfn) { ++ int ret; ++ ret = HYPERVISOR_update_va_mapping( ++ (unsigned long)__va(pfn << PAGE_SHIFT), ++ mfn_pte(mfn, PAGE_KERNEL), ++ 0); ++ BUG_ON(ret); ++ } + } + + /* Relinquish the page back to the allocator. */ +@@ -251,20 +293,18 @@ static int increase_reservation(unsigned long nr_pages) + balloon_stats.current_pages += rc; + + out: +- spin_unlock_irqrestore(&balloon_lock, flags); +- + return rc < 0 ? rc : rc != nr_pages; + } + + static int decrease_reservation(unsigned long nr_pages) + { +- unsigned long pfn, i, flags; +- struct page *page; ++ unsigned long pfn, lpfn, mfn, i, j; ++ struct page *page = NULL; + int need_sleep = 0; +- int ret; ++ int discontig, discontig_free; ++ int ret; + struct xen_memory_reservation reservation = { + .address_bits = 0, +- .extent_order = 0, + .domid = DOMID_SELF + }; + +@@ -272,7 +312,7 @@ static int decrease_reservation(unsigned long nr_pages) + nr_pages = ARRAY_SIZE(frame_list); + + for (i = 0; i < nr_pages; i++) { +- if ((page = alloc_page(GFP_BALLOON)) == NULL) { ++ if ((page = alloc_pages(GFP_BALLOON, balloon_order)) == NULL) { + nr_pages = i; + need_sleep = 1; + break; +@@ -282,38 +322,49 @@ static int decrease_reservation(unsigned long nr_pages) + frame_list[i] = pfn_to_mfn(pfn); + + scrub_page(page); +- +- if (!PageHighMem(page)) { +- ret = HYPERVISOR_update_va_mapping( +- (unsigned long)__va(pfn << PAGE_SHIFT), +- __pte_ma(0), 0); +- BUG_ON(ret); +- } +- + } + + /* Ensure that ballooned highmem pages don't have kmaps. */ + kmap_flush_unused(); + flush_tlb_all(); + +- spin_lock_irqsave(&balloon_lock, flags); +- + /* No more mappings: invalidate P2M and add to balloon. 
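Editor's note: in increase_reservation above, the hypervisor returns one base MFN per extent, and the new inner j-loop then wires up all 2^balloon_order consecutive pfn-to-mfn translations behind it. A toy userspace model of that loop; the p2m array and frame numbers are invented for illustration.

/* Toy model of the per-extent p2m fixup loop in increase_reservation. */
#include <stdio.h>

#define ORDER 2                  /* patch uses 0 or 9; 2 keeps output short */
#define NPAGES (1UL << ORDER)

static unsigned long p2m[64];    /* pretend phys-to-machine table */

static void map_extent(unsigned long pfn, unsigned long mfn)
{
	unsigned long j;

	/* Mirrors: for (j = 0; j < balloon_npages; j++, pfn++, mfn++)
	 *              set_phys_to_machine(pfn, mfn); */
	for (j = 0; j < NPAGES; j++, pfn++, mfn++)
		p2m[pfn] = mfn;
}

int main(void)
{
	map_extent(8, 0x1000);       /* extent returned by the hypervisor */
	for (unsigned long pfn = 8; pfn < 8 + NPAGES; pfn++)
		printf("pfn %lu -> mfn %#lx\n", pfn, p2m[pfn]);
	return 0;
}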
*/ + for (i = 0; i < nr_pages; i++) { +- pfn = mfn_to_pfn(frame_list[i]); +- set_phys_to_machine(pfn, INVALID_P2M_ENTRY); ++ mfn = frame_list[i]; ++ lpfn = pfn = mfn_to_pfn(mfn); + balloon_append(pfn_to_page(pfn)); ++ discontig_free = 0; ++ for (j = 0; j < balloon_npages; j++, lpfn++, mfn++) { ++ if ((discontig_frame_list[j] = pfn_to_mfn(lpfn)) != mfn) ++ discontig_free = 1; ++ ++ set_phys_to_machine(lpfn, INVALID_P2M_ENTRY); ++ page = pfn_to_page(lpfn); ++ ++ if (!PageHighMem(page)) { ++ ret = HYPERVISOR_update_va_mapping( ++ (unsigned long)__va(lpfn << PAGE_SHIFT), ++ __pte_ma(0), 0); ++ BUG_ON(ret); ++ } ++ } ++ if (discontig_free) { ++ free_discontig_frame(); ++ frame_list[i] = 0; ++ discontig = 1; ++ } + } ++ balloon_stats.current_pages -= nr_pages; ++ ++ if (discontig) ++ nr_pages = shrink_frame(nr_pages); + + set_xen_guest_handle(reservation.extent_start, frame_list); + reservation.nr_extents = nr_pages; ++ reservation.extent_order = balloon_order; + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); + BUG_ON(ret != nr_pages); + +- balloon_stats.current_pages -= nr_pages; +- +- spin_unlock_irqrestore(&balloon_lock, flags); +- + return need_sleep; + } + +@@ -379,7 +430,7 @@ static void watch_target(struct xenbus_watch *watch, + /* The given memory/target value is in KiB, so it needs converting to + * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. + */ +- balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); ++ balloon_set_new_target(new_target >> ((PAGE_SHIFT - 10) + balloon_order)); + } + + static int balloon_init_watcher(struct notifier_block *notifier, +@@ -399,15 +450,18 @@ static struct notifier_block xenstore_notifier; + + static int __init balloon_init(void) + { +- unsigned long pfn; ++ unsigned long pfn, extra_pfn_end; + struct page *page; + + if (!xen_pv_domain()) + return -ENODEV; + +- pr_info("xen_balloon: Initialising balloon driver.\n"); ++ pr_info("xen_balloon: Initialising balloon driver with page order %d.\n", ++ balloon_order); ++ ++ balloon_npages = 1 << balloon_order; + +- balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn); ++ balloon_stats.current_pages = (min(xen_start_info->nr_pages, max_pfn)) >> balloon_order; + balloon_stats.target_pages = balloon_stats.current_pages; + balloon_stats.balloon_low = 0; + balloon_stats.balloon_high = 0; +@@ -419,11 +473,24 @@ static int __init balloon_init(void) + + register_balloon(&balloon_sysdev); + +- /* Initialise the balloon with excess memory space. */ +- for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { ++ /* ++ * Initialise the balloon with excess memory space. We need ++ * to make sure we don't add memory which doesn't exist or ++ * logically exist. The E820 map can be trimmed to be smaller ++ * than the amount of physical memory due to the mem= command ++ * line parameter. And if this is a 32-bit non-HIGHMEM kernel ++ * on a system with memory which requires highmem to access, ++ * don't try to use it. ++ */ ++ extra_pfn_end = min(min(max_pfn, e820_end_of_ram_pfn()), ++ (unsigned long)PFN_DOWN(xen_extra_mem_start + xen_extra_mem_size)); ++ for (pfn = PFN_UP(xen_extra_mem_start); ++ pfn < extra_pfn_end; ++ pfn += balloon_npages) { + page = pfn_to_page(pfn); +- if (!PageReserved(page)) +- balloon_append(page); ++ /* totalram_pages doesn't include the boot-time ++ balloon extension, so don't subtract from it. 
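Editor's note: balloon_init above seeds the balloon from the xen_extra_mem region, but only after clamping against both max_pfn and the E820 end of RAM, so pfns that do not logically exist are never appended. The clamp as a runnable sketch with invented sample frame numbers.

/* Sketch of the extra_pfn_end clamp in balloon_init above. */
#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long max_pfn = 0x100000;       /* kernel's view of RAM */
	unsigned long e820_end_pfn = 0xc0000;   /* e.g. trimmed by mem= */
	unsigned long extra_start_pfn = 0x80000;
	unsigned long extra_end_pfn = 0xe0000;  /* start + size, in pfns */

	/* extra_pfn_end = min(min(max_pfn, e820 end), extra region end) */
	unsigned long extra_pfn_end =
		min_ul(min_ul(max_pfn, e820_end_pfn), extra_end_pfn);

	printf("ballooning pfns [%#lx, %#lx)\n", extra_start_pfn, extra_pfn_end);
	return 0;
}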
*/ ++ __balloon_append(page); + } + + target_watch.callback = watch_target; +@@ -444,6 +511,121 @@ static void balloon_exit(void) + + module_exit(balloon_exit); + ++static int __init balloon_parse_huge(char *s) ++{ ++ balloon_order = 9; ++ return 1; ++} ++ ++__setup("balloon_hugepages", balloon_parse_huge); ++ ++static int dealloc_pte_fn(pte_t *pte, struct page *pmd_page, ++ unsigned long addr, void *data) ++{ ++ unsigned long mfn = pte_mfn(*pte); ++ int ret; ++ struct xen_memory_reservation reservation = { ++ .nr_extents = 1, ++ .extent_order = 0, ++ .domid = DOMID_SELF ++ }; ++ ++ set_xen_guest_handle(reservation.extent_start, &mfn); ++ set_pte_at(&init_mm, addr, pte, __pte_ma(0)); ++ set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY); ++ ++ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); ++ BUG_ON(ret != 1); ++ ++ return 0; ++} ++ ++struct page **alloc_empty_pages_and_pagevec(int nr_pages) ++{ ++ struct page *page, **pagevec; ++ int npages; ++ int i, j, ret; ++ ++ /* Round up to next number of balloon_order pages */ ++ npages = (nr_pages + (balloon_npages-1)) >> balloon_order; ++ ++ pagevec = kmalloc(sizeof(page) * nr_pages << balloon_order, GFP_KERNEL); ++ if (pagevec == NULL) ++ return NULL; ++ ++ for (i = 0; i < nr_pages; i++) { ++ void *v; ++ ++ page = alloc_pages(GFP_KERNEL|__GFP_COLD, balloon_order); ++ if (page == NULL) ++ goto err; ++ ++ scrub_page(page); ++ ++ mutex_lock(&balloon_mutex); ++ ++ v = page_address(page); ++ ++ ret = apply_to_page_range(&init_mm, (unsigned long)v, ++ PAGE_SIZE << balloon_order, ++ dealloc_pte_fn, NULL); ++ ++ if (ret != 0) { ++ mutex_unlock(&balloon_mutex); ++ //balloon_free_page(page); /* tries to use free_cold_page */ ++ __free_page(page); ++ goto err; ++ } ++ for (j = 0; j < balloon_npages; j++) ++ pagevec[(i<<balloon_order)+j] = page++; ++ ++ totalram_pages = balloon_stats.current_pages -= balloon_npages; ++ ++ mutex_unlock(&balloon_mutex); ++ } ++ ++ out: ++ schedule_work(&balloon_worker); ++ flush_tlb_all(); ++ return pagevec; ++ ++ err: ++ mutex_lock(&balloon_mutex); ++ while (--i >= 0) ++ balloon_append(pagevec[i << balloon_order]); ++ mutex_unlock(&balloon_mutex); ++ kfree(pagevec); ++ pagevec = NULL; ++ goto out; ++} ++EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec); ++ ++void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages) ++{ ++ struct page *page; ++ int i; ++ int npages; ++ ++ if (pagevec == NULL) ++ return; ++ ++ /* Round up to next number of balloon_order pages */ ++ npages = (nr_pages + (balloon_npages-1)) >> balloon_order; ++ ++ mutex_lock(&balloon_mutex); ++ for (i = 0; i < nr_pages; i++) { ++ page = pagevec[i << balloon_order]; ++ BUG_ON(page_count(page) != 1); ++ balloon_append(page); ++ } ++ mutex_unlock(&balloon_mutex); ++ ++ kfree(pagevec); ++ ++ schedule_work(&balloon_worker); ++} ++EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec); ++ + #define BALLOON_SHOW(name, format, args...) 
\ + static ssize_t show_##name(struct sys_device *dev, \ + struct sysdev_attribute *attr, \ +@@ -477,7 +659,7 @@ static ssize_t store_target_kb(struct sys_device *dev, + + target_bytes = simple_strtoull(buf, &endchar, 0) * 1024; + +- balloon_set_new_target(target_bytes >> PAGE_SHIFT); ++ balloon_set_new_target(target_bytes >> (PAGE_SHIFT + balloon_order)); + + return count; + } +@@ -491,7 +673,7 @@ static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr + { + return sprintf(buf, "%llu\n", + (unsigned long long)balloon_stats.target_pages +- << PAGE_SHIFT); ++ << (PAGE_SHIFT + balloon_order)); + } + + static ssize_t store_target(struct sys_device *dev, +@@ -507,7 +689,7 @@ static ssize_t store_target(struct sys_device *dev, + + target_bytes = memparse(buf, &endchar); + +- balloon_set_new_target(target_bytes >> PAGE_SHIFT); ++ balloon_set_new_target(target_bytes >> (PAGE_SHIFT + balloon_order)); + + return count; + } +diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c +new file mode 100644 +index 0000000..d40f534 +--- /dev/null ++++ b/drivers/xen/biomerge.c +@@ -0,0 +1,14 @@ ++#include <linux/bio.h> ++#include <asm/io.h> ++#include <xen/page.h> ++ ++bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, ++ const struct bio_vec *vec2) ++{ ++ unsigned long mfn1 = pfn_to_mfn(page_to_pfn(vec1->bv_page)); ++ unsigned long mfn2 = pfn_to_mfn(page_to_pfn(vec2->bv_page)); ++ ++ return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) && ++ ((mfn1 == mfn2) || ((mfn1+1) == mfn2)); ++} ++ +diff --git a/drivers/xen/blkback/Makefile b/drivers/xen/blkback/Makefile +new file mode 100644 +index 0000000..dee55ba +--- /dev/null ++++ b/drivers/xen/blkback/Makefile +@@ -0,0 +1,4 @@ ++obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o ++obj-$(CONFIG_XEN_BLKBACK_PAGEMAP) += blkback-pagemap.o ++ ++xen-blkback-y := blkback.o xenbus.o interface.o vbd.o +diff --git a/drivers/xen/blkback/blkback-pagemap.c b/drivers/xen/blkback/blkback-pagemap.c +new file mode 100644 +index 0000000..45f6eb2 +--- /dev/null ++++ b/drivers/xen/blkback/blkback-pagemap.c +@@ -0,0 +1,109 @@ ++#include <linux/module.h> ++#include "blkback-pagemap.h" ++ ++static int blkback_pagemap_size; ++static struct blkback_pagemap *blkback_pagemap; ++ ++static inline int ++blkback_pagemap_entry_clear(struct blkback_pagemap *map) ++{ ++ static struct blkback_pagemap zero; ++ return !memcmp(map, &zero, sizeof(zero)); ++} ++ ++int ++blkback_pagemap_init(int pages) ++{ ++ blkback_pagemap = kzalloc(pages * sizeof(struct blkback_pagemap), ++ GFP_KERNEL); ++ if (!blkback_pagemap) ++ return -ENOMEM; ++ ++ blkback_pagemap_size = pages; ++ return 0; ++} ++EXPORT_SYMBOL_GPL(blkback_pagemap_init); ++ ++void ++blkback_pagemap_set(int idx, struct page *page, ++ domid_t domid, busid_t busid, grant_ref_t gref) ++{ ++ struct blkback_pagemap *entry; ++ ++ BUG_ON(!blkback_pagemap); ++ BUG_ON(idx >= blkback_pagemap_size); ++ ++ set_page_private(page, idx); ++ ++ entry = blkback_pagemap + idx; ++ if (!blkback_pagemap_entry_clear(entry)) { ++ printk("overwriting pagemap %d: d %u b %u g %u\n", ++ idx, entry->domid, entry->busid, entry->gref); ++ BUG(); ++ } ++ ++ entry->page = page; ++ entry->domid = domid; ++ entry->busid = busid; ++ entry->gref = gref; ++} ++EXPORT_SYMBOL_GPL(blkback_pagemap_set); ++ ++void ++blkback_pagemap_clear(struct page *page) ++{ ++ int idx; ++ struct blkback_pagemap *entry; ++ ++ idx = (int)page_private(page); ++ ++ BUG_ON(!blkback_pagemap); ++ BUG_ON(idx >= blkback_pagemap_size); ++ ++ entry = blkback_pagemap + idx; ++ if 
(blkback_pagemap_entry_clear(entry)) { ++ printk("clearing empty pagemap %d\n", idx); ++ BUG(); ++ } ++ ++ memset(entry, 0, sizeof(*entry)); ++} ++EXPORT_SYMBOL_GPL(blkback_pagemap_clear); ++ ++struct blkback_pagemap ++blkback_pagemap_read(struct page *page) ++{ ++ int idx; ++ struct blkback_pagemap *entry; ++ ++ idx = (int)page_private(page); ++ ++ BUG_ON(!blkback_pagemap); ++ BUG_ON(idx >= blkback_pagemap_size); ++ ++ entry = blkback_pagemap + idx; ++ if (blkback_pagemap_entry_clear(entry)) { ++ printk("reading empty pagemap %d\n", idx); ++ BUG(); ++ } ++ ++ return *entry; ++} ++EXPORT_SYMBOL(blkback_pagemap_read); ++ ++MODULE_LICENSE("Dual BSD/GPL"); ++ ++int ++blkback_pagemap_contains_page(struct page *page) ++{ ++ struct blkback_pagemap *entry; ++ int idx = (int)page_private(page); ++ ++ if (idx < 0 || idx >= blkback_pagemap_size) ++ return 0; ++ ++ entry = blkback_pagemap + idx; ++ ++ return (entry->page == page); ++} ++EXPORT_SYMBOL(blkback_pagemap_contains_page); +diff --git a/drivers/xen/blkback/blkback-pagemap.h b/drivers/xen/blkback/blkback-pagemap.h +new file mode 100644 +index 0000000..7f97d15 +--- /dev/null ++++ b/drivers/xen/blkback/blkback-pagemap.h +@@ -0,0 +1,36 @@ ++#ifndef _BLKBACK_PAGEMAP_H_ ++#define _BLKBACK_PAGEMAP_H_ ++ ++#include <linux/mm.h> ++#include <xen/interface/xen.h> ++#include <xen/interface/grant_table.h> ++ ++typedef unsigned int busid_t; ++ ++struct blkback_pagemap { ++ struct page *page; ++ domid_t domid; ++ busid_t busid; ++ grant_ref_t gref; ++}; ++ ++#if defined(CONFIG_XEN_BLKBACK_PAGEMAP) || defined(CONFIG_XEN_BLKBACK_PAGEMAP_MODULE) ++ ++int blkback_pagemap_init(int); ++void blkback_pagemap_set(int, struct page *, domid_t, busid_t, grant_ref_t); ++void blkback_pagemap_clear(struct page *); ++struct blkback_pagemap blkback_pagemap_read(struct page *); ++int blkback_pagemap_contains_page(struct page *page); ++ ++#else /* CONFIG_XEN_BLKBACK_PAGEMAP */ ++ ++static inline int blkback_pagemap_init(int pages) { return 0; } ++static inline void blkback_pagemap_set(int idx, struct page *page, domid_t dom, ++ busid_t bus, grant_ref_t gnt) {} ++static inline void blkback_pagemap_clear(struct page *page) {} ++#define blkback_pagemap_read(_page) ({ BUG(); (struct blkback_pagemap){0}; }) ++static inline int blkback_pagemap_contains_page(struct page *page) { return 0; } ++ ++#endif /* CONFIG_XEN_BLKBACK_PAGEMAP */ ++ ++#endif +diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c +new file mode 100644 +index 0000000..0bef445 +--- /dev/null ++++ b/drivers/xen/blkback/blkback.c +@@ -0,0 +1,675 @@ ++/****************************************************************************** ++ * arch/xen/drivers/blkif/backend/main.c ++ * ++ * Back-end of the driver for virtual block devices. This portion of the ++ * driver exports a 'unified' block-device interface that can be accessed ++ * by any operating system that implements a compatible front end. 
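Editor's note: the pagemap code above hinges on page_private(): blkback stows the array index in the struct page itself, so a bare page pointer is enough to recover the owning domain, bus id, and grant reference. A compact userspace analogue; the fake_page type and all values are stand-ins for illustration.

/* Userspace analogue of the blkback pagemap: an index stored in the
 * page stands in for page_private(). */
#include <stdio.h>

struct fake_page { int private_idx; };

struct pagemap_entry {
	struct fake_page *page;
	unsigned short domid, busid;
	unsigned int gref;
};

static struct pagemap_entry pagemap[16];

static void pagemap_set(int idx, struct fake_page *pg,
			unsigned short domid, unsigned short busid,
			unsigned int gref)
{
	pg->private_idx = idx;            /* set_page_private() */
	pagemap[idx] = (struct pagemap_entry){ pg, domid, busid, gref };
}

static struct pagemap_entry pagemap_read(struct fake_page *pg)
{
	return pagemap[pg->private_idx];  /* page_private() reverse lookup */
}

int main(void)
{
	struct fake_page pg;
	struct pagemap_entry e;

	pagemap_set(3, &pg, 7, 51714, 42);
	e = pagemap_read(&pg);
	printf("idx 3: dom %u bus %u gref %u\n", e.domid, e.busid, e.gref);
	return 0;
}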
A ++ * reference front-end implementation can be found in: ++ * arch/xen/drivers/blkif/frontend ++ * ++ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand ++ * Copyright (c) 2005, Christopher Clark ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include <linux/spinlock.h> ++#include <linux/kthread.h> ++#include <linux/list.h> ++#include <linux/delay.h> ++#include <linux/freezer.h> ++ ++#include <xen/balloon.h> ++#include <xen/events.h> ++#include <xen/page.h> ++#include <asm/xen/hypervisor.h> ++#include <asm/xen/hypercall.h> ++#include "common.h" ++ ++/* ++ * These are rather arbitrary. They are fairly large because adjacent requests ++ * pulled from a communication ring are quite likely to end up being part of ++ * the same scatter/gather request at the disc. ++ * ++ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** ++ * ++ * This will increase the chances of being able to write whole tracks. ++ * 64 should be enough to keep us competitive with Linux. ++ */ ++static int blkif_reqs = 64; ++module_param_named(reqs, blkif_reqs, int, 0); ++MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate"); ++ ++/* Run-time switchable: /sys/module/blkback/parameters/ */ ++static unsigned int log_stats = 0; ++static unsigned int debug_lvl = 0; ++module_param(log_stats, int, 0644); ++module_param(debug_lvl, int, 0644); ++ ++/* ++ * Each outstanding request that we've passed to the lower device layers has a ++ * 'pending_req' allocated to it. Each buffer_head that completes decrements ++ * the pendcnt towards zero. When it hits zero, the specified domain has a ++ * response queued for it, with the saved 'id' passed back. 
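Editor's note: as the comment above describes, each in-flight request carries a pendcnt that every completing bio decrements, and whichever completion reaches zero sends the response. The same pattern in portable C11 atomics; all names here are invented.

/* C11 sketch of the pendcnt pattern described above: N completions
 * race to decrement; exactly one (the last) finalizes the request. */
#include <stdatomic.h>
#include <stdio.h>

struct fake_req {
	atomic_int pendcnt;
	unsigned long long id;
};

static void complete_one(struct fake_req *req)
{
	/* atomic_dec_and_test() equivalent: old value 1 means now zero */
	if (atomic_fetch_sub(&req->pendcnt, 1) == 1)
		printf("req %llu: all bios done, sending response\n", req->id);
}

int main(void)
{
	struct fake_req req = { .id = 99 };

	atomic_init(&req.pendcnt, 3);   /* three bios submitted */
	complete_one(&req);
	complete_one(&req);
	complete_one(&req);             /* this one sends the response */
	return 0;
}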
++ */ ++typedef struct { ++ blkif_t *blkif; ++ u64 id; ++ int nr_pages; ++ atomic_t pendcnt; ++ unsigned short operation; ++ int status; ++ struct list_head free_list; ++} pending_req_t; ++ ++static pending_req_t *pending_reqs; ++static struct list_head pending_free; ++static DEFINE_SPINLOCK(pending_free_lock); ++static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq); ++ ++#define BLKBACK_INVALID_HANDLE (~0) ++ ++static struct page **pending_pages; ++static grant_handle_t *pending_grant_handles; ++ ++static inline int vaddr_pagenr(pending_req_t *req, int seg) ++{ ++ return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; ++} ++ ++#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)] ++ ++static inline unsigned long vaddr(pending_req_t *req, int seg) ++{ ++ unsigned long pfn = page_to_pfn(pending_page(req, seg)); ++ return (unsigned long)pfn_to_kaddr(pfn); ++} ++ ++#define pending_handle(_req, _seg) \ ++ (pending_grant_handles[vaddr_pagenr(_req, _seg)]) ++ ++ ++static int do_block_io_op(blkif_t *blkif); ++static void dispatch_rw_block_io(blkif_t *blkif, ++ struct blkif_request *req, ++ pending_req_t *pending_req); ++static void make_response(blkif_t *blkif, u64 id, ++ unsigned short op, int st); ++ ++/****************************************************************** ++ * misc small helpers ++ */ ++static pending_req_t* alloc_req(void) ++{ ++ pending_req_t *req = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&pending_free_lock, flags); ++ if (!list_empty(&pending_free)) { ++ req = list_entry(pending_free.next, pending_req_t, free_list); ++ list_del(&req->free_list); ++ } ++ spin_unlock_irqrestore(&pending_free_lock, flags); ++ return req; ++} ++ ++static void free_req(pending_req_t *req) ++{ ++ unsigned long flags; ++ int was_empty; ++ ++ spin_lock_irqsave(&pending_free_lock, flags); ++ was_empty = list_empty(&pending_free); ++ list_add(&req->free_list, &pending_free); ++ spin_unlock_irqrestore(&pending_free_lock, flags); ++ if (was_empty) ++ wake_up(&pending_free_wq); ++} ++ ++static void unplug_queue(blkif_t *blkif) ++{ ++ if (blkif->plug == NULL) ++ return; ++ if (blkif->plug->unplug_fn) ++ blkif->plug->unplug_fn(blkif->plug); ++ blk_put_queue(blkif->plug); ++ blkif->plug = NULL; ++} ++ ++static void plug_queue(blkif_t *blkif, struct block_device *bdev) ++{ ++ struct request_queue *q = bdev_get_queue(bdev); ++ ++ if (q == blkif->plug) ++ return; ++ unplug_queue(blkif); ++ blk_get_queue(q); ++ blkif->plug = q; ++} ++ ++static void fast_flush_area(pending_req_t *req) ++{ ++ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; ++ unsigned int i, invcount = 0; ++ grant_handle_t handle; ++ int ret; ++ ++ for (i = 0; i < req->nr_pages; i++) { ++ handle = pending_handle(req, i); ++ if (handle == BLKBACK_INVALID_HANDLE) ++ continue; ++ blkback_pagemap_clear(pending_page(req, i)); ++ gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), ++ GNTMAP_host_map, handle); ++ pending_handle(req, i) = BLKBACK_INVALID_HANDLE; ++ invcount++; ++ } ++ ++ ret = HYPERVISOR_grant_table_op( ++ GNTTABOP_unmap_grant_ref, unmap, invcount); ++ BUG_ON(ret); ++} ++ ++/****************************************************************** ++ * SCHEDULER FUNCTIONS ++ */ ++ ++static void print_stats(blkif_t *blkif) ++{ ++ printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n", ++ current->comm, blkif->st_oo_req, ++ blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req); ++ blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); ++ blkif->st_rd_req = 0; ++ blkif->st_wr_req = 0; 
++ blkif->st_oo_req = 0; ++} ++ ++int blkif_schedule(void *arg) ++{ ++ blkif_t *blkif = arg; ++ struct vbd *vbd = &blkif->vbd; ++ ++ blkif_get(blkif); ++ ++ if (debug_lvl) ++ printk(KERN_DEBUG "%s: started\n", current->comm); ++ ++ while (!kthread_should_stop()) { ++ if (try_to_freeze()) ++ continue; ++ if (unlikely(vbd->size != vbd_size(vbd))) ++ vbd_resize(blkif); ++ ++ wait_event_interruptible( ++ blkif->wq, ++ blkif->waiting_reqs || kthread_should_stop()); ++ wait_event_interruptible( ++ pending_free_wq, ++ !list_empty(&pending_free) || kthread_should_stop()); ++ ++ blkif->waiting_reqs = 0; ++ smp_mb(); /* clear flag *before* checking for work */ ++ ++ if (do_block_io_op(blkif)) ++ blkif->waiting_reqs = 1; ++ unplug_queue(blkif); ++ ++ if (log_stats && time_after(jiffies, blkif->st_print)) ++ print_stats(blkif); ++ } ++ ++ if (log_stats) ++ print_stats(blkif); ++ if (debug_lvl) ++ printk(KERN_DEBUG "%s: exiting\n", current->comm); ++ ++ blkif->xenblkd = NULL; ++ blkif_put(blkif); ++ ++ return 0; ++} ++ ++/****************************************************************** ++ * COMPLETION CALLBACK -- Called as bh->b_end_io() ++ */ ++ ++static void __end_block_io_op(pending_req_t *pending_req, int error) ++{ ++ /* An error fails the entire request. */ ++ if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && ++ (error == -EOPNOTSUPP)) { ++ DPRINTK("blkback: write barrier op failed, not supported\n"); ++ blkback_barrier(XBT_NIL, pending_req->blkif->be, 0); ++ pending_req->status = BLKIF_RSP_EOPNOTSUPP; ++ } else if (error) { ++ DPRINTK("Buffer not up-to-date at end of operation, " ++ "error=%d\n", error); ++ pending_req->status = BLKIF_RSP_ERROR; ++ } ++ ++ if (atomic_dec_and_test(&pending_req->pendcnt)) { ++ fast_flush_area(pending_req); ++ make_response(pending_req->blkif, pending_req->id, ++ pending_req->operation, pending_req->status); ++ blkif_put(pending_req->blkif); ++ free_req(pending_req); ++ } ++} ++ ++static void end_block_io_op(struct bio *bio, int error) ++{ ++ __end_block_io_op(bio->bi_private, error); ++ bio_put(bio); ++} ++ ++ ++/****************************************************************************** ++ * NOTIFICATION FROM GUEST OS. ++ */ ++ ++static void blkif_notify_work(blkif_t *blkif) ++{ ++ blkif->waiting_reqs = 1; ++ wake_up(&blkif->wq); ++} ++ ++irqreturn_t blkif_be_int(int irq, void *dev_id) ++{ ++ blkif_notify_work(dev_id); ++ return IRQ_HANDLED; ++} ++ ++ ++ ++/****************************************************************** ++ * DOWNWARD CALLS -- These interface with the block-device layer proper. ++ */ ++ ++static int do_block_io_op(blkif_t *blkif) ++{ ++ union blkif_back_rings *blk_rings = &blkif->blk_rings; ++ struct blkif_request req; ++ pending_req_t *pending_req; ++ RING_IDX rc, rp; ++ int more_to_do = 0; ++ ++ rc = blk_rings->common.req_cons; ++ rp = blk_rings->common.sring->req_prod; ++ rmb(); /* Ensure we see queued requests up to 'rp'. 
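Editor's note: do_block_io_op above snapshots req_prod once, issues rmb() so the request payloads it reads are at least as new as that index, and then consumes up to rp. A miniature single-producer/single-consumer ring using C11 acquire/release in place of the kernel barriers; ring size and payloads are invented.

/* Minimal SPSC ring mirroring the req_cons/req_prod discipline above. */
#include <stdatomic.h>
#include <stdio.h>

#define RING_SIZE 8                 /* power of two, invented */

static int ring[RING_SIZE];
static atomic_uint req_prod;        /* advanced by the producer */
static unsigned int req_cons;       /* owned by the consumer */

static void produce(int v)
{
	unsigned int p = atomic_load_explicit(&req_prod, memory_order_relaxed);

	ring[p % RING_SIZE] = v;
	/* release ~ wmb(): payload visible before the index bump */
	atomic_store_explicit(&req_prod, p + 1, memory_order_release);
}

static void consume_all(void)
{
	/* acquire ~ rmb(): see payloads for everything up to rp */
	unsigned int rp = atomic_load_explicit(&req_prod, memory_order_acquire);

	while (req_cons != rp)
		printf("request %d\n", ring[req_cons++ % RING_SIZE]);
}

int main(void)
{
	produce(1);
	produce(2);
	consume_all();
	return 0;
}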
*/ ++ ++ while (rc != rp) { ++ ++ if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) ++ break; ++ ++ if (kthread_should_stop()) { ++ more_to_do = 1; ++ break; ++ } ++ ++ pending_req = alloc_req(); ++ if (NULL == pending_req) { ++ blkif->st_oo_req++; ++ more_to_do = 1; ++ break; ++ } ++ ++ switch (blkif->blk_protocol) { ++ case BLKIF_PROTOCOL_NATIVE: ++ memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req)); ++ break; ++ case BLKIF_PROTOCOL_X86_32: ++ blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc)); ++ break; ++ case BLKIF_PROTOCOL_X86_64: ++ blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc)); ++ break; ++ default: ++ BUG(); ++ } ++ blk_rings->common.req_cons = ++rc; /* before make_response() */ ++ ++ /* Apply all sanity checks to /private copy/ of request. */ ++ barrier(); ++ ++ switch (req.operation) { ++ case BLKIF_OP_READ: ++ blkif->st_rd_req++; ++ dispatch_rw_block_io(blkif, &req, pending_req); ++ break; ++ case BLKIF_OP_WRITE_BARRIER: ++ blkif->st_br_req++; ++ /* fall through */ ++ case BLKIF_OP_WRITE: ++ blkif->st_wr_req++; ++ dispatch_rw_block_io(blkif, &req, pending_req); ++ break; ++ default: ++ /* A good sign something is wrong: sleep for a while to ++ * avoid excessive CPU consumption by a bad guest. */ ++ msleep(1); ++ DPRINTK("error: unknown block io operation [%d]\n", ++ req.operation); ++ make_response(blkif, req.id, req.operation, ++ BLKIF_RSP_ERROR); ++ free_req(pending_req); ++ break; ++ } ++ ++ /* Yield point for this unbounded loop. */ ++ cond_resched(); ++ } ++ ++ return more_to_do; ++} ++ ++static void dispatch_rw_block_io(blkif_t *blkif, ++ struct blkif_request *req, ++ pending_req_t *pending_req) ++{ ++ struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; ++ struct phys_req preq; ++ struct { ++ unsigned long buf; unsigned int nsec; ++ } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; ++ unsigned int nseg; ++ struct bio *bio = NULL; ++ int ret, i; ++ int operation; ++ ++ switch (req->operation) { ++ case BLKIF_OP_READ: ++ operation = READ; ++ break; ++ case BLKIF_OP_WRITE: ++ operation = WRITE; ++ break; ++ case BLKIF_OP_WRITE_BARRIER: ++ operation = WRITE_BARRIER; ++ break; ++ default: ++ operation = 0; /* make gcc happy */ ++ BUG(); ++ } ++ ++ /* Check that number of segments is sane. 
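Editor's note: the check that follows rejects zero-segment requests (write barriers excepted), caps nseg at BLKIF_MAX_SEGMENTS_PER_REQUEST, and validates each segment's sector range against the page. The per-segment arithmetic as a runnable sketch: sectors are 512 bytes, so a 4 KiB page holds sectors 0 through 7.

/* Runnable sketch of blkback's per-segment validation above/below. */
#include <stdio.h>

#define PAGE_SIZE 4096
#define SECTORS_PER_PAGE (PAGE_SIZE >> 9)   /* 8 */

static int seg_ok(unsigned first_sect, unsigned last_sect)
{
	if (last_sect >= SECTORS_PER_PAGE)  /* runs off the page */
		return 0;
	if (last_sect < first_sect)         /* negative length */
		return 0;
	return 1;
}

int main(void)
{
	/* nsec = last - first + 1, as computed in the dispatch loop */
	printf("seg [0,7]: ok=%d nsec=%d\n", seg_ok(0, 7), 7 - 0 + 1);
	printf("seg [2,1]: ok=%d\n", seg_ok(2, 1));
	printf("seg [0,8]: ok=%d\n", seg_ok(0, 8));
	return 0;
}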
*/ ++ nseg = req->nr_segments; ++ if (unlikely(nseg == 0 && operation != WRITE_BARRIER) || ++ unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { ++ DPRINTK("Bad number of segments in request (%d)\n", nseg); ++ goto fail_response; ++ } ++ ++ preq.dev = req->handle; ++ preq.sector_number = req->sector_number; ++ preq.nr_sects = 0; ++ ++ pending_req->blkif = blkif; ++ pending_req->id = req->id; ++ pending_req->operation = req->operation; ++ pending_req->status = BLKIF_RSP_OKAY; ++ pending_req->nr_pages = nseg; ++ ++ for (i = 0; i < nseg; i++) { ++ uint32_t flags; ++ ++ seg[i].nsec = req->seg[i].last_sect - ++ req->seg[i].first_sect + 1; ++ ++ if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) || ++ (req->seg[i].last_sect < req->seg[i].first_sect)) ++ goto fail_response; ++ preq.nr_sects += seg[i].nsec; ++ ++ flags = GNTMAP_host_map; ++ if (operation != READ) ++ flags |= GNTMAP_readonly; ++ gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags, ++ req->seg[i].gref, blkif->domid); ++ } ++ ++ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg); ++ BUG_ON(ret); ++ ++ for (i = 0; i < nseg; i++) { ++ if (unlikely(map[i].status != 0)) { ++ DPRINTK("invalid buffer -- could not remap it\n"); ++ map[i].handle = BLKBACK_INVALID_HANDLE; ++ ret |= 1; ++ continue; ++ } ++ ++ set_phys_to_machine( ++ page_to_pfn(pending_page(pending_req, i)), ++ FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT)); ++ seg[i].buf = map[i].dev_bus_addr | ++ (req->seg[i].first_sect << 9); ++ blkback_pagemap_set(vaddr_pagenr(pending_req, i), ++ pending_page(pending_req, i), ++ blkif->domid, req->handle, ++ req->seg[i].gref); ++ pending_handle(pending_req, i) = map[i].handle; ++ } ++ ++ if (ret) ++ goto fail_flush; ++ ++ if (vbd_translate(&preq, blkif, operation) != 0) { ++ DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", ++ operation == READ ? 
"read" : "write", ++ preq.sector_number, ++ preq.sector_number + preq.nr_sects, preq.dev); ++ goto fail_flush; ++ } ++ ++ plug_queue(blkif, preq.bdev); ++ atomic_set(&pending_req->pendcnt, 1); ++ blkif_get(blkif); ++ ++ for (i = 0; i < nseg; i++) { ++ if (((int)preq.sector_number|(int)seg[i].nsec) & ++ ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { ++ DPRINTK("Misaligned I/O request from domain %d", ++ blkif->domid); ++ goto fail_put_bio; ++ } ++ ++ while ((bio == NULL) || ++ (bio_add_page(bio, ++ pending_page(pending_req, i), ++ seg[i].nsec << 9, ++ seg[i].buf & ~PAGE_MASK) == 0)) { ++ if (bio) { ++ atomic_inc(&pending_req->pendcnt); ++ submit_bio(operation, bio); ++ } ++ ++ bio = bio_alloc(GFP_KERNEL, nseg-i); ++ if (unlikely(bio == NULL)) ++ goto fail_put_bio; ++ ++ bio->bi_bdev = preq.bdev; ++ bio->bi_private = pending_req; ++ bio->bi_end_io = end_block_io_op; ++ bio->bi_sector = preq.sector_number; ++ } ++ ++ preq.sector_number += seg[i].nsec; ++ } ++ ++ if (!bio) { ++ BUG_ON(operation != WRITE_BARRIER); ++ bio = bio_alloc(GFP_KERNEL, 0); ++ if (unlikely(bio == NULL)) ++ goto fail_put_bio; ++ ++ bio->bi_bdev = preq.bdev; ++ bio->bi_private = pending_req; ++ bio->bi_end_io = end_block_io_op; ++ bio->bi_sector = -1; ++ } ++ ++ submit_bio(operation, bio); ++ ++ if (operation == READ) ++ blkif->st_rd_sect += preq.nr_sects; ++ else if (operation == WRITE || operation == WRITE_BARRIER) ++ blkif->st_wr_sect += preq.nr_sects; ++ ++ return; ++ ++ fail_flush: ++ fast_flush_area(pending_req); ++ fail_response: ++ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); ++ free_req(pending_req); ++ msleep(1); /* back off a bit */ ++ return; ++ ++ fail_put_bio: ++ __end_block_io_op(pending_req, -EINVAL); ++ if (bio) ++ bio_put(bio); ++ unplug_queue(blkif); ++ msleep(1); /* back off a bit */ ++ return; ++} ++ ++ ++ ++/****************************************************************** ++ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING ++ */ ++ ++ ++static void make_response(blkif_t *blkif, u64 id, ++ unsigned short op, int st) ++{ ++ struct blkif_response resp; ++ unsigned long flags; ++ union blkif_back_rings *blk_rings = &blkif->blk_rings; ++ int more_to_do = 0; ++ int notify; ++ ++ resp.id = id; ++ resp.operation = op; ++ resp.status = st; ++ ++ spin_lock_irqsave(&blkif->blk_ring_lock, flags); ++ /* Place on the response ring for the relevant domain. */ ++ switch (blkif->blk_protocol) { ++ case BLKIF_PROTOCOL_NATIVE: ++ memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), ++ &resp, sizeof(resp)); ++ break; ++ case BLKIF_PROTOCOL_X86_32: ++ memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt), ++ &resp, sizeof(resp)); ++ break; ++ case BLKIF_PROTOCOL_X86_64: ++ memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt), ++ &resp, sizeof(resp)); ++ break; ++ default: ++ BUG(); ++ } ++ blk_rings->common.rsp_prod_pvt++; ++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); ++ if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) { ++ /* ++ * Tail check for pending requests. Allows frontend to avoid ++ * notifications if requests are already in flight (lower ++ * overheads and promotes batching). 
++ */ ++ RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do); ++ ++ } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) { ++ more_to_do = 1; ++ } ++ ++ spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); ++ ++ if (more_to_do) ++ blkif_notify_work(blkif); ++ if (notify) ++ notify_remote_via_irq(blkif->irq); ++} ++ ++static int __init blkif_init(void) ++{ ++ int i, mmap_pages; ++ int rc = 0; ++ ++ if (!xen_pv_domain()) ++ return -ENODEV; ++ ++ mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; ++ ++ pending_reqs = kmalloc(sizeof(pending_reqs[0]) * ++ blkif_reqs, GFP_KERNEL); ++ pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) * ++ mmap_pages, GFP_KERNEL); ++ pending_pages = alloc_empty_pages_and_pagevec(mmap_pages); ++ ++ if (blkback_pagemap_init(mmap_pages)) ++ goto out_of_memory; ++ ++ if (!pending_reqs || !pending_grant_handles || !pending_pages) { ++ rc = -ENOMEM; ++ goto out_of_memory; ++ } ++ ++ for (i = 0; i < mmap_pages; i++) ++ pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; ++ ++ rc = blkif_interface_init(); ++ if (rc) ++ goto failed_init; ++ ++ memset(pending_reqs, 0, sizeof(pending_reqs)); ++ INIT_LIST_HEAD(&pending_free); ++ ++ for (i = 0; i < blkif_reqs; i++) ++ list_add_tail(&pending_reqs[i].free_list, &pending_free); ++ ++ rc = blkif_xenbus_init(); ++ if (rc) ++ goto failed_init; ++ ++ return 0; ++ ++ out_of_memory: ++ printk(KERN_ERR "%s: out of memory\n", __func__); ++ failed_init: ++ kfree(pending_reqs); ++ kfree(pending_grant_handles); ++ free_empty_pages_and_pagevec(pending_pages, mmap_pages); ++ return rc; ++} ++ ++module_init(blkif_init); ++ ++MODULE_LICENSE("Dual BSD/GPL"); +diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h +new file mode 100644 +index 0000000..531ba81 +--- /dev/null ++++ b/drivers/xen/blkback/common.h +@@ -0,0 +1,143 @@ ++/* ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ */ ++ ++#ifndef __BLKIF__BACKEND__COMMON_H__ ++#define __BLKIF__BACKEND__COMMON_H__ ++ ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/interrupt.h> ++#include <linux/slab.h> ++#include <linux/blkdev.h> ++#include <linux/vmalloc.h> ++#include <linux/wait.h> ++#include <asm/io.h> ++#include <asm/setup.h> ++#include <asm/pgalloc.h> ++#include <asm/hypervisor.h> ++#include <xen/blkif.h> ++#include <xen/grant_table.h> ++#include <xen/xenbus.h> ++#include "blkback-pagemap.h" ++ ++ ++#define DPRINTK(_f, _a...) \ ++ pr_debug("(file=%s, line=%d) " _f, \ ++ __FILE__ , __LINE__ , ## _a ) ++ ++struct vbd { ++ blkif_vdev_t handle; /* what the domain refers to this vbd as */ ++ unsigned char readonly; /* Non-zero -> read-only */ ++ unsigned char type; /* VDISK_xxx */ ++ u32 pdevice; /* phys device that this vbd maps to */ ++ struct block_device *bdev; ++ sector_t size; /* Cached size parameter */ ++}; ++ ++struct backend_info; ++ ++typedef struct blkif_st { ++ /* Unique identifier for this interface. */ ++ domid_t domid; ++ unsigned int handle; ++ /* Physical parameters of the comms window. */ ++ unsigned int irq; ++ /* Comms information. */ ++ enum blkif_protocol blk_protocol; ++ union blkif_back_rings blk_rings; ++ struct vm_struct *blk_ring_area; ++ /* The VBD attached to this interface. */ ++ struct vbd vbd; ++ /* Back pointer to the backend_info. */ ++ struct backend_info *be; ++ /* Private fields. */ ++ spinlock_t blk_ring_lock; ++ atomic_t refcnt; ++ ++ wait_queue_head_t wq; ++ struct task_struct *xenblkd; ++ unsigned int waiting_reqs; ++ struct request_queue *plug; ++ ++ /* statistics */ ++ unsigned long st_print; ++ int st_rd_req; ++ int st_wr_req; ++ int st_oo_req; ++ int st_br_req; ++ int st_rd_sect; ++ int st_wr_sect; ++ ++ wait_queue_head_t waiting_to_free; ++ ++ grant_handle_t shmem_handle; ++ grant_ref_t shmem_ref; ++} blkif_t; ++ ++blkif_t *blkif_alloc(domid_t domid); ++void blkif_disconnect(blkif_t *blkif); ++void blkif_free(blkif_t *blkif); ++int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn); ++void vbd_resize(blkif_t *blkif); ++ ++#define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) ++#define blkif_put(_b) \ ++ do { \ ++ if (atomic_dec_and_test(&(_b)->refcnt)) \ ++ wake_up(&(_b)->waiting_to_free);\ ++ } while (0) ++ ++/* Create a vbd. 
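Editor's note: the blkif_get/blkif_put macros above implement the teardown handshake that blkif_disconnect relies on later: the put that drops the count to zero wakes whoever is waiting to free the interface. A C11 approximation with the wait queue reduced to a flag; all names are invented.

/* C11 approximation of blkif_get/blkif_put: the last put "wakes"
 * the waiter. wake_up() is reduced to a flag for illustration. */
#include <stdatomic.h>
#include <stdio.h>

struct fake_blkif {
	atomic_int refcnt;
	atomic_bool ready_to_free;
};

static void blkif_get_(struct fake_blkif *b)
{
	atomic_fetch_add(&b->refcnt, 1);
}

static void blkif_put_(struct fake_blkif *b)
{
	if (atomic_fetch_sub(&b->refcnt, 1) == 1)   /* just hit zero */
		atomic_store(&b->ready_to_free, 1); /* wake_up() stand-in */
}

int main(void)
{
	struct fake_blkif b;

	atomic_init(&b.refcnt, 1);      /* creation reference */
	atomic_init(&b.ready_to_free, 0);
	blkif_get_(&b);                 /* the kthread holds one too */
	blkif_put_(&b);                 /* kthread exits */
	blkif_put_(&b);                 /* disconnect drops the last ref */
	printf("ready_to_free=%d\n", (int)atomic_load(&b.ready_to_free));
	return 0;
}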
*/ ++int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major, ++ unsigned minor, int readonly, int cdrom); ++void vbd_free(struct vbd *vbd); ++ ++unsigned long long vbd_size(struct vbd *vbd); ++unsigned int vbd_info(struct vbd *vbd); ++unsigned long vbd_secsize(struct vbd *vbd); ++ ++struct phys_req { ++ unsigned short dev; ++ unsigned short nr_sects; ++ struct block_device *bdev; ++ blkif_sector_t sector_number; ++}; ++ ++int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); ++ ++int blkif_interface_init(void); ++ ++int blkif_xenbus_init(void); ++ ++irqreturn_t blkif_be_int(int irq, void *dev_id); ++int blkif_schedule(void *arg); ++ ++int blkback_barrier(struct xenbus_transaction xbt, ++ struct backend_info *be, int state); ++ ++struct xenbus_device *blkback_xenbus(struct backend_info *be); ++ ++#endif /* __BLKIF__BACKEND__COMMON_H__ */ +diff --git a/drivers/xen/blkback/interface.c b/drivers/xen/blkback/interface.c +new file mode 100644 +index 0000000..e397a41 +--- /dev/null ++++ b/drivers/xen/blkback/interface.c +@@ -0,0 +1,186 @@ ++/****************************************************************************** ++ * arch/xen/drivers/blkif/backend/interface.c ++ * ++ * Block-device interface management. ++ * ++ * Copyright (c) 2004, Keir Fraser ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ */ ++ ++#include "common.h" ++#include <xen/events.h> ++#include <xen/grant_table.h> ++#include <linux/kthread.h> ++ ++static struct kmem_cache *blkif_cachep; ++ ++blkif_t *blkif_alloc(domid_t domid) ++{ ++ blkif_t *blkif; ++ ++ blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); ++ if (!blkif) ++ return ERR_PTR(-ENOMEM); ++ ++ memset(blkif, 0, sizeof(*blkif)); ++ blkif->domid = domid; ++ spin_lock_init(&blkif->blk_ring_lock); ++ atomic_set(&blkif->refcnt, 1); ++ init_waitqueue_head(&blkif->wq); ++ blkif->st_print = jiffies; ++ init_waitqueue_head(&blkif->waiting_to_free); ++ ++ return blkif; ++} ++ ++static int map_frontend_page(blkif_t *blkif, unsigned long shared_page) ++{ ++ struct gnttab_map_grant_ref op; ++ ++ gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr, ++ GNTMAP_host_map, shared_page, blkif->domid); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) ++ BUG(); ++ ++ if (op.status) { ++ DPRINTK(" Grant table operation failure !\n"); ++ return op.status; ++ } ++ ++ blkif->shmem_ref = shared_page; ++ blkif->shmem_handle = op.handle; ++ ++ return 0; ++} ++ ++static void unmap_frontend_page(blkif_t *blkif) ++{ ++ struct gnttab_unmap_grant_ref op; ++ ++ gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr, ++ GNTMAP_host_map, blkif->shmem_handle); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) ++ BUG(); ++} ++ ++int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn) ++{ ++ int err; ++ ++ /* Already connected through? */ ++ if (blkif->irq) ++ return 0; ++ ++ if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL ) ++ return -ENOMEM; ++ ++ err = map_frontend_page(blkif, shared_page); ++ if (err) { ++ free_vm_area(blkif->blk_ring_area); ++ return err; ++ } ++ ++ switch (blkif->blk_protocol) { ++ case BLKIF_PROTOCOL_NATIVE: ++ { ++ struct blkif_sring *sring; ++ sring = (struct blkif_sring *)blkif->blk_ring_area->addr; ++ BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE); ++ break; ++ } ++ case BLKIF_PROTOCOL_X86_32: ++ { ++ struct blkif_x86_32_sring *sring_x86_32; ++ sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring_area->addr; ++ BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE); ++ break; ++ } ++ case BLKIF_PROTOCOL_X86_64: ++ { ++ struct blkif_x86_64_sring *sring_x86_64; ++ sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr; ++ BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE); ++ break; ++ } ++ default: ++ BUG(); ++ } ++ ++ err = bind_interdomain_evtchn_to_irqhandler( ++ blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif); ++ if (err < 0) ++ { ++ unmap_frontend_page(blkif); ++ free_vm_area(blkif->blk_ring_area); ++ blkif->blk_rings.common.sring = NULL; ++ return err; ++ } ++ blkif->irq = err; ++ ++ return 0; ++} ++ ++void blkif_disconnect(blkif_t *blkif) ++{ ++ if (blkif->xenblkd) { ++ kthread_stop(blkif->xenblkd); ++ blkif->xenblkd = NULL; ++ } ++ ++ atomic_dec(&blkif->refcnt); ++ wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0); ++ atomic_inc(&blkif->refcnt); ++ ++ if (blkif->irq) { ++ unbind_from_irqhandler(blkif->irq, blkif); ++ blkif->irq = 0; ++ } ++ ++ if (blkif->blk_rings.common.sring) { ++ unmap_frontend_page(blkif); ++ free_vm_area(blkif->blk_ring_area); ++ blkif->blk_rings.common.sring = NULL; ++ } ++} ++ ++void blkif_free(blkif_t *blkif) ++{ ++ if (!atomic_dec_and_test(&blkif->refcnt)) ++ BUG(); ++ kmem_cache_free(blkif_cachep, blkif); ++} ++ ++int __init 
blkif_interface_init(void) ++{ ++ blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), ++ 0, 0, NULL); ++ if (!blkif_cachep) ++ return -ENOMEM; ++ ++ return 0; ++} +diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c +new file mode 100644 +index 0000000..943ec23 +--- /dev/null ++++ b/drivers/xen/blkback/vbd.c +@@ -0,0 +1,161 @@ ++/****************************************************************************** ++ * blkback/vbd.c ++ * ++ * Routines for managing virtual block devices (VBDs). ++ * ++ * Copyright (c) 2003-2005, Keir Fraser & Steve Hand ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include "common.h" ++ ++#define vbd_sz(_v) ((_v)->bdev->bd_part ? \ ++ (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk)) ++ ++unsigned long long vbd_size(struct vbd *vbd) ++{ ++ return vbd_sz(vbd); ++} ++ ++unsigned int vbd_info(struct vbd *vbd) ++{ ++ return vbd->type | (vbd->readonly?VDISK_READONLY:0); ++} ++ ++unsigned long vbd_secsize(struct vbd *vbd) ++{ ++ return bdev_logical_block_size(vbd->bdev); ++} ++ ++int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major, ++ unsigned minor, int readonly, int cdrom) ++{ ++ struct vbd *vbd; ++ struct block_device *bdev; ++ ++ vbd = &blkif->vbd; ++ vbd->handle = handle; ++ vbd->readonly = readonly; ++ vbd->type = 0; ++ ++ vbd->pdevice = MKDEV(major, minor); ++ ++ bdev = open_by_devnum(vbd->pdevice, ++ vbd->readonly ? 
FMODE_READ : FMODE_WRITE);
++
++ if (IS_ERR(bdev)) {
++ DPRINTK("vbd_create: device %08x could not be opened.\n",
++ vbd->pdevice);
++ return -ENOENT;
++ }
++
++ vbd->bdev = bdev;
++ vbd->size = vbd_size(vbd);
++
++ if (vbd->bdev->bd_disk == NULL) {
++ DPRINTK("vbd_create: device %08x doesn't exist.\n",
++ vbd->pdevice);
++ vbd_free(vbd);
++ return -ENOENT;
++ }
++
++ if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom)
++ vbd->type |= VDISK_CDROM;
++ if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
++ vbd->type |= VDISK_REMOVABLE;
++
++ DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
++ handle, blkif->domid);
++ return 0;
++}
++
++void vbd_free(struct vbd *vbd)
++{
++ if (vbd->bdev)
++ blkdev_put(vbd->bdev, vbd->readonly ? FMODE_READ : FMODE_WRITE);
++ vbd->bdev = NULL;
++}
++
++int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
++{
++ struct vbd *vbd = &blkif->vbd;
++ int rc = -EACCES;
++
++ if ((operation != READ) && vbd->readonly)
++ goto out;
++
++ if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd)))
++ goto out;
++
++ req->dev = vbd->pdevice;
++ req->bdev = vbd->bdev;
++ rc = 0;
++
++ out:
++ return rc;
++}
++
++void vbd_resize(blkif_t *blkif)
++{
++ struct vbd *vbd = &blkif->vbd;
++ struct xenbus_transaction xbt;
++ int err;
++ struct xenbus_device *dev = blkback_xenbus(blkif->be);
++ unsigned long long new_size = vbd_size(vbd);
++
++ printk(KERN_INFO "VBD Resize: new size %Lu\n", new_size);
++ vbd->size = new_size;
++again:
++ err = xenbus_transaction_start(&xbt);
++ if (err) {
++ printk(KERN_WARNING "Error starting transaction");
++ return;
++ }
++ err = xenbus_printf(xbt, dev->nodename, "sectors", "%Lu",
++ vbd_size(vbd));
++ if (err) {
++ printk(KERN_WARNING "Error writing new size");
++ goto abort;
++ }
++ /*
++ * Write the current state; we will use this to synchronize
++ * the front-end. If the current state is "connected" the
++ * front-end will get the new size information online.
++ */
++ err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
++ if (err) {
++ printk(KERN_WARNING "Error writing the state");
++ goto abort;
++ }
++
++ err = xenbus_transaction_end(xbt, 0);
++ if (err == -EAGAIN)
++ goto again;
++ if (err)
++ printk(KERN_WARNING "Error ending transaction");
++ return;
++abort:
++ xenbus_transaction_end(xbt, 1);
++}
+diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c
+new file mode 100644
+index 0000000..a0534fc
+--- /dev/null
++++ b/drivers/xen/blkback/xenbus.c
+@@ -0,0 +1,553 @@
++/* Xenbus code for blkif backend
++ Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
++ Copyright (C) 2005 XenSource Ltd
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2 of the License, or
++ (at your option) any later version.
++
++ This program is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ GNU General Public License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with this program; if not, write to the Free Software
++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++*/
++
++#include <stdarg.h>
++#include <linux/module.h>
++#include <linux/kthread.h>
++#include "common.h"
++
++#undef DPRINTK
++#define DPRINTK(fmt, args...) 
\ ++ pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \ ++ __FUNCTION__, __LINE__, ##args) ++ ++struct backend_info ++{ ++ struct xenbus_device *dev; ++ blkif_t *blkif; ++ struct xenbus_watch backend_watch; ++ unsigned major; ++ unsigned minor; ++ char *mode; ++}; ++ ++static void connect(struct backend_info *); ++static int connect_ring(struct backend_info *); ++static void backend_changed(struct xenbus_watch *, const char **, ++ unsigned int); ++ ++struct xenbus_device *blkback_xenbus(struct backend_info *be) ++{ ++ return be->dev; ++} ++ ++static int blkback_name(blkif_t *blkif, char *buf) ++{ ++ char *devpath, *devname; ++ struct xenbus_device *dev = blkif->be->dev; ++ ++ devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL); ++ if (IS_ERR(devpath)) ++ return PTR_ERR(devpath); ++ ++ if ((devname = strstr(devpath, "/dev/")) != NULL) ++ devname += strlen("/dev/"); ++ else ++ devname = devpath; ++ ++ snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, devname); ++ kfree(devpath); ++ ++ return 0; ++} ++ ++static void update_blkif_status(blkif_t *blkif) ++{ ++ int err; ++ char name[TASK_COMM_LEN]; ++ ++ /* Not ready to connect? */ ++ if (!blkif->irq || !blkif->vbd.bdev) ++ return; ++ ++ /* Already connected? */ ++ if (blkif->be->dev->state == XenbusStateConnected) ++ return; ++ ++ /* Attempt to connect: exit if we fail to. */ ++ connect(blkif->be); ++ if (blkif->be->dev->state != XenbusStateConnected) ++ return; ++ ++ err = blkback_name(blkif, name); ++ if (err) { ++ xenbus_dev_error(blkif->be->dev, err, "get blkback dev name"); ++ return; ++ } ++ ++ err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping); ++ if (err) { ++ xenbus_dev_error(blkif->be->dev, err, "block flush"); ++ return; ++ } ++ invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping); ++ ++ blkif->xenblkd = kthread_run(blkif_schedule, blkif, name); ++ if (IS_ERR(blkif->xenblkd)) { ++ err = PTR_ERR(blkif->xenblkd); ++ blkif->xenblkd = NULL; ++ xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); ++ } ++} ++ ++ ++/**************************************************************** ++ * sysfs interface for VBD I/O requests ++ */ ++ ++#define VBD_SHOW(name, format, args...) 
\ ++ static ssize_t show_##name(struct device *_dev, \ ++ struct device_attribute *attr, \ ++ char *buf) \ ++ { \ ++ struct xenbus_device *dev = to_xenbus_device(_dev); \ ++ struct backend_info *be = dev_get_drvdata(&dev->dev); \ ++ \ ++ return sprintf(buf, format, ##args); \ ++ } \ ++ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) ++ ++VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req); ++VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req); ++VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req); ++VBD_SHOW(br_req, "%d\n", be->blkif->st_br_req); ++VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect); ++VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect); ++ ++static struct attribute *vbdstat_attrs[] = { ++ &dev_attr_oo_req.attr, ++ &dev_attr_rd_req.attr, ++ &dev_attr_wr_req.attr, ++ &dev_attr_br_req.attr, ++ &dev_attr_rd_sect.attr, ++ &dev_attr_wr_sect.attr, ++ NULL ++}; ++ ++static struct attribute_group vbdstat_group = { ++ .name = "statistics", ++ .attrs = vbdstat_attrs, ++}; ++ ++VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); ++VBD_SHOW(mode, "%s\n", be->mode); ++ ++int xenvbd_sysfs_addif(struct xenbus_device *dev) ++{ ++ int error; ++ ++ error = device_create_file(&dev->dev, &dev_attr_physical_device); ++ if (error) ++ goto fail1; ++ ++ error = device_create_file(&dev->dev, &dev_attr_mode); ++ if (error) ++ goto fail2; ++ ++ error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group); ++ if (error) ++ goto fail3; ++ ++ return 0; ++ ++fail3: sysfs_remove_group(&dev->dev.kobj, &vbdstat_group); ++fail2: device_remove_file(&dev->dev, &dev_attr_mode); ++fail1: device_remove_file(&dev->dev, &dev_attr_physical_device); ++ return error; ++} ++ ++void xenvbd_sysfs_delif(struct xenbus_device *dev) ++{ ++ sysfs_remove_group(&dev->dev.kobj, &vbdstat_group); ++ device_remove_file(&dev->dev, &dev_attr_mode); ++ device_remove_file(&dev->dev, &dev_attr_physical_device); ++} ++ ++static int blkback_remove(struct xenbus_device *dev) ++{ ++ struct backend_info *be = dev_get_drvdata(&dev->dev); ++ ++ DPRINTK(""); ++ ++ if (be->major || be->minor) ++ xenvbd_sysfs_delif(dev); ++ ++ if (be->backend_watch.node) { ++ unregister_xenbus_watch(&be->backend_watch); ++ kfree(be->backend_watch.node); ++ be->backend_watch.node = NULL; ++ } ++ ++ if (be->blkif) { ++ blkif_disconnect(be->blkif); ++ vbd_free(&be->blkif->vbd); ++ blkif_free(be->blkif); ++ be->blkif = NULL; ++ } ++ ++ kfree(be); ++ dev_set_drvdata(&dev->dev, NULL); ++ return 0; ++} ++ ++int blkback_barrier(struct xenbus_transaction xbt, ++ struct backend_info *be, int state) ++{ ++ struct xenbus_device *dev = be->dev; ++ int err; ++ ++ err = xenbus_printf(xbt, dev->nodename, "feature-barrier", ++ "%d", state); ++ if (err) ++ xenbus_dev_fatal(dev, err, "writing feature-barrier"); ++ ++ return err; ++} ++ ++/** ++ * Entry point to this code when a new device is created. Allocate the basic ++ * structures, and watch the store waiting for the hotplug scripts to tell us ++ * the device's physical major and minor numbers. Switch to InitWait. 
++ */ ++static int blkback_probe(struct xenbus_device *dev, ++ const struct xenbus_device_id *id) ++{ ++ int err; ++ struct backend_info *be = kzalloc(sizeof(struct backend_info), ++ GFP_KERNEL); ++ if (!be) { ++ xenbus_dev_fatal(dev, -ENOMEM, ++ "allocating backend structure"); ++ return -ENOMEM; ++ } ++ be->dev = dev; ++ dev_set_drvdata(&dev->dev, be); ++ ++ be->blkif = blkif_alloc(dev->otherend_id); ++ if (IS_ERR(be->blkif)) { ++ err = PTR_ERR(be->blkif); ++ be->blkif = NULL; ++ xenbus_dev_fatal(dev, err, "creating block interface"); ++ goto fail; ++ } ++ ++ /* setup back pointer */ ++ be->blkif->be = be; ++ ++ err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed, ++ "%s/%s", dev->nodename, "physical-device"); ++ if (err) ++ goto fail; ++ ++ err = xenbus_switch_state(dev, XenbusStateInitWait); ++ if (err) ++ goto fail; ++ ++ return 0; ++ ++fail: ++ DPRINTK("failed"); ++ blkback_remove(dev); ++ return err; ++} ++ ++ ++/** ++ * Callback received when the hotplug scripts have placed the physical-device ++ * node. Read it and the mode node, and create a vbd. If the frontend is ++ * ready, connect. ++ */ ++static void backend_changed(struct xenbus_watch *watch, ++ const char **vec, unsigned int len) ++{ ++ int err; ++ unsigned major; ++ unsigned minor; ++ struct backend_info *be ++ = container_of(watch, struct backend_info, backend_watch); ++ struct xenbus_device *dev = be->dev; ++ int cdrom = 0; ++ char *device_type; ++ ++ DPRINTK(""); ++ ++ err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x", ++ &major, &minor); ++ if (XENBUS_EXIST_ERR(err)) { ++ /* Since this watch will fire once immediately after it is ++ registered, we expect this. Ignore it, and wait for the ++ hotplug scripts. */ ++ return; ++ } ++ if (err != 2) { ++ xenbus_dev_fatal(dev, err, "reading physical-device"); ++ return; ++ } ++ ++ if ((be->major || be->minor) && ++ ((be->major != major) || (be->minor != minor))) { ++ printk(KERN_WARNING ++ "blkback: changing physical device (from %x:%x to " ++ "%x:%x) not supported.\n", be->major, be->minor, ++ major, minor); ++ return; ++ } ++ ++ be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL); ++ if (IS_ERR(be->mode)) { ++ err = PTR_ERR(be->mode); ++ be->mode = NULL; ++ xenbus_dev_fatal(dev, err, "reading mode"); ++ return; ++ } ++ ++ device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL); ++ if (!IS_ERR(device_type)) { ++ cdrom = strcmp(device_type, "cdrom") == 0; ++ kfree(device_type); ++ } ++ ++ if (be->major == 0 && be->minor == 0) { ++ /* Front end dir is a number, which is used as the handle. */ ++ ++ char *p = strrchr(dev->otherend, '/') + 1; ++ long handle = simple_strtoul(p, NULL, 0); ++ ++ be->major = major; ++ be->minor = minor; ++ ++ err = vbd_create(be->blkif, handle, major, minor, ++ (NULL == strchr(be->mode, 'w')), cdrom); ++ if (err) { ++ be->major = be->minor = 0; ++ xenbus_dev_fatal(dev, err, "creating vbd structure"); ++ return; ++ } ++ ++ err = xenvbd_sysfs_addif(dev); ++ if (err) { ++ vbd_free(&be->blkif->vbd); ++ be->major = be->minor = 0; ++ xenbus_dev_fatal(dev, err, "creating sysfs entries"); ++ return; ++ } ++ ++ /* We're potentially connected now */ ++ update_blkif_status(be->blkif); ++ } ++} ++ ++ ++/** ++ * Callback received when the frontend's state changes. 
++ */
++static void frontend_changed(struct xenbus_device *dev,
++ enum xenbus_state frontend_state)
++{
++ struct backend_info *be = dev_get_drvdata(&dev->dev);
++ int err;
++
++ DPRINTK("%s", xenbus_strstate(frontend_state));
++
++ switch (frontend_state) {
++ case XenbusStateInitialising:
++ if (dev->state == XenbusStateClosed) {
++ printk(KERN_INFO "%s: %s: prepare for reconnect\n",
++ __FUNCTION__, dev->nodename);
++ xenbus_switch_state(dev, XenbusStateInitWait);
++ }
++ break;
++
++ case XenbusStateInitialised:
++ case XenbusStateConnected:
++ /* Ensure we connect even when two watches fire in
++ close succession and we miss the intermediate value
++ of frontend_state. */
++ if (dev->state == XenbusStateConnected)
++ break;
++
++ err = connect_ring(be);
++ if (err)
++ break;
++ update_blkif_status(be->blkif);
++ break;
++
++ case XenbusStateClosing:
++ blkif_disconnect(be->blkif);
++ xenbus_switch_state(dev, XenbusStateClosing);
++ break;
++
++ case XenbusStateClosed:
++ xenbus_switch_state(dev, XenbusStateClosed);
++ if (xenbus_dev_is_online(dev))
++ break;
++ /* fall through if not online */
++ case XenbusStateUnknown:
++ device_unregister(&dev->dev);
++ break;
++
++ default:
++ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
++ frontend_state);
++ break;
++ }
++}
++
++
++/* ** Connection ** */
++
++
++/**
++ * Write the physical details regarding the block device to the store, and
++ * switch to Connected state.
++ */
++static void connect(struct backend_info *be)
++{
++ struct xenbus_transaction xbt;
++ int err;
++ struct xenbus_device *dev = be->dev;
++
++ DPRINTK("%s", dev->otherend);
++
++ /* Supply the information about the device the frontend needs */
++again:
++ err = xenbus_transaction_start(&xbt);
++ if (err) {
++ xenbus_dev_fatal(dev, err, "starting transaction");
++ return;
++ }
++
++ err = blkback_barrier(xbt, be, 1);
++ if (err)
++ goto abort;
++
++ err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
++ vbd_size(&be->blkif->vbd));
++ if (err) {
++ xenbus_dev_fatal(dev, err, "writing %s/sectors",
++ dev->nodename);
++ goto abort;
++ }
++
++ /* FIXME: use a typename instead */
++ err = xenbus_printf(xbt, dev->nodename, "info", "%u",
++ vbd_info(&be->blkif->vbd));
++ if (err) {
++ xenbus_dev_fatal(dev, err, "writing %s/info",
++ dev->nodename);
++ goto abort;
++ }
++ err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
++ vbd_secsize(&be->blkif->vbd));
++ if (err) {
++ xenbus_dev_fatal(dev, err, "writing %s/sector-size",
++ dev->nodename);
++ goto abort;
++ }
++
++ err = xenbus_transaction_end(xbt, 0);
++ if (err == -EAGAIN)
++ goto again;
++ if (err)
++ xenbus_dev_fatal(dev, err, "ending transaction");
++
++ err = xenbus_switch_state(dev, XenbusStateConnected);
++ if (err)
++ xenbus_dev_fatal(dev, err, "%s: switching to Connected state",
++ dev->nodename);
++
++ return;
++ abort:
++ xenbus_transaction_end(xbt, 1);
++}
++
++
++static int connect_ring(struct backend_info *be)
++{
++ struct xenbus_device *dev = be->dev;
++ unsigned long ring_ref;
++ unsigned int evtchn;
++ char protocol[64] = "";
++ int err;
++
++ DPRINTK("%s", dev->otherend);
++
++ err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref,
++ "event-channel", "%u", &evtchn, NULL);
++ if (err) {
++ xenbus_dev_fatal(dev, err,
++ "reading %s/ring-ref and event-channel",
++ dev->otherend);
++ return err;
++ }
++
++ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
++ err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
++ "%63s", protocol, NULL);
++ if (err)
++ 
strcpy(protocol, "unspecified, assuming native"); ++ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) ++ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; ++ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) ++ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; ++ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) ++ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; ++ else { ++ xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); ++ return -1; ++ } ++ printk(KERN_INFO ++ "blkback: ring-ref %ld, event-channel %d, protocol %d (%s)\n", ++ ring_ref, evtchn, be->blkif->blk_protocol, protocol); ++ ++ /* Map the shared frame, irq etc. */ ++ err = blkif_map(be->blkif, ring_ref, evtchn); ++ if (err) { ++ xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u", ++ ring_ref, evtchn); ++ return err; ++ } ++ ++ return 0; ++} ++ ++ ++/* ** Driver Registration ** */ ++ ++ ++static const struct xenbus_device_id blkback_ids[] = { ++ { "vbd" }, ++ { "" } ++}; ++ ++ ++static struct xenbus_driver blkback = { ++ .name = "vbd", ++ .owner = THIS_MODULE, ++ .ids = blkback_ids, ++ .probe = blkback_probe, ++ .remove = blkback_remove, ++ .otherend_changed = frontend_changed ++}; ++ ++ ++int blkif_xenbus_init(void) ++{ ++ return xenbus_register_backend(&blkback); ++} +diff --git a/drivers/xen/blktap/Makefile b/drivers/xen/blktap/Makefile +new file mode 100644 +index 0000000..822b4e4 +--- /dev/null ++++ b/drivers/xen/blktap/Makefile +@@ -0,0 +1,3 @@ ++obj-$(CONFIG_XEN_BLKDEV_TAP) := blktap.o ++ ++blktap-objs := control.o ring.o device.o request.o sysfs.o +diff --git a/drivers/xen/blktap/blktap.h b/drivers/xen/blktap/blktap.h +new file mode 100644 +index 0000000..fe63fc9 +--- /dev/null ++++ b/drivers/xen/blktap/blktap.h +@@ -0,0 +1,209 @@ ++#ifndef _BLKTAP_H_ ++#define _BLKTAP_H_ ++ ++#include <linux/mm.h> ++#include <linux/fs.h> ++#include <linux/cdev.h> ++#include <linux/init.h> ++#include <linux/scatterlist.h> ++#include <xen/blkif.h> ++ ++extern int blktap_debug_level; ++extern int blktap_ring_major; ++extern int blktap_device_major; ++ ++#define BTPRINTK(level, tag, force, _f, _a...) \ ++ do { \ ++ if (blktap_debug_level > level && \ ++ (force || printk_ratelimit())) \ ++ printk(tag "%s: " _f, __func__, ##_a); \ ++ } while (0) ++ ++#define BTDBG(_f, _a...) BTPRINTK(8, KERN_DEBUG, 1, _f, ##_a) ++#define BTINFO(_f, _a...) BTPRINTK(0, KERN_INFO, 0, _f, ##_a) ++#define BTWARN(_f, _a...) BTPRINTK(0, KERN_WARNING, 0, _f, ##_a) ++#define BTERR(_f, _a...) BTPRINTK(0, KERN_ERR, 0, _f, ##_a) ++ ++#define MAX_BLKTAP_DEVICE 1024 ++ ++#define BLKTAP_DEVICE 4 ++#define BLKTAP_DEVICE_CLOSED 5 ++#define BLKTAP_SHUTDOWN_REQUESTED 8 ++ ++/* blktap IOCTLs: */ ++#define BLKTAP2_IOCTL_KICK_FE 1 ++#define BLKTAP2_IOCTL_ALLOC_TAP 200 ++#define BLKTAP2_IOCTL_FREE_TAP 201 ++#define BLKTAP2_IOCTL_CREATE_DEVICE 202 ++#define BLKTAP2_IOCTL_REMOVE_DEVICE 207 ++ ++#define BLKTAP2_MAX_MESSAGE_LEN 256 ++ ++#define BLKTAP2_RING_MESSAGE_CLOSE 3 ++ ++#define BLKTAP_REQUEST_FREE 0 ++#define BLKTAP_REQUEST_PENDING 1 ++ ++/* ++ * The maximum number of requests that can be outstanding at any time ++ * is determined by ++ * ++ * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] ++ * ++ * where mmap_alloc < MAX_DYNAMIC_MEM. ++ * ++ * TODO: ++ * mmap_alloc is initialised to 2 and should be adjustable on the fly via ++ * sysfs. 
++ */ ++#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE) ++#define MAX_DYNAMIC_MEM BLK_RING_SIZE ++#define MAX_PENDING_REQS BLK_RING_SIZE ++#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) ++#define MMAP_VADDR(_start, _req, _seg) \ ++ (_start + \ ++ ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ ++ ((_seg) * PAGE_SIZE)) ++ ++struct grant_handle_pair { ++ grant_handle_t kernel; ++ grant_handle_t user; ++}; ++#define INVALID_GRANT_HANDLE 0xFFFF ++ ++struct blktap_handle { ++ unsigned int ring; ++ unsigned int device; ++ unsigned int minor; ++}; ++ ++struct blktap_params { ++ char name[BLKTAP2_MAX_MESSAGE_LEN]; ++ unsigned long long capacity; ++ unsigned long sector_size; ++}; ++ ++struct blktap_device { ++ spinlock_t lock; ++ struct gendisk *gd; ++}; ++ ++struct blktap_ring { ++ struct task_struct *task; ++ ++ struct vm_area_struct *vma; ++ struct blkif_front_ring ring; ++ unsigned long ring_vstart; ++ unsigned long user_vstart; ++ ++ int n_pending; ++ struct blktap_request *pending[MAX_PENDING_REQS]; ++ ++ wait_queue_head_t poll_wait; ++ ++ dev_t devno; ++ struct device *dev; ++}; ++ ++struct blktap_statistics { ++ unsigned long st_print; ++ int st_rd_req; ++ int st_wr_req; ++ int st_oo_req; ++ int st_rd_sect; ++ int st_wr_sect; ++ s64 st_rd_cnt; ++ s64 st_rd_sum_usecs; ++ s64 st_rd_max_usecs; ++ s64 st_wr_cnt; ++ s64 st_wr_sum_usecs; ++ s64 st_wr_max_usecs; ++}; ++ ++struct blktap_request { ++ struct blktap *tap; ++ struct request *rq; ++ int usr_idx; ++ ++ int operation; ++ struct timeval time; ++ ++ struct scatterlist sg_table[BLKIF_MAX_SEGMENTS_PER_REQUEST]; ++ struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; ++ int nr_pages; ++}; ++ ++#define blktap_for_each_sg(_sg, _req, _i) \ ++ for (_sg = (_req)->sg_table, _i = 0; \ ++ _i < (_req)->nr_pages; \ ++ (_sg)++, (_i)++) ++ ++struct blktap { ++ int minor; ++ unsigned long dev_inuse; ++ ++ struct blktap_ring ring; ++ struct blktap_device device; ++ struct blktap_page_pool *pool; ++ ++ wait_queue_head_t remove_wait; ++ struct work_struct remove_work; ++ char name[BLKTAP2_MAX_MESSAGE_LEN]; ++ ++ struct blktap_statistics stats; ++}; ++ ++struct blktap_page_pool { ++ struct mempool_s *bufs; ++ spinlock_t lock; ++ struct kobject kobj; ++ wait_queue_head_t wait; ++}; ++ ++extern struct mutex blktap_lock; ++extern struct blktap **blktaps; ++extern int blktap_max_minor; ++ ++int blktap_control_destroy_tap(struct blktap *); ++size_t blktap_control_debug(struct blktap *, char *, size_t); ++ ++int blktap_ring_init(void); ++void blktap_ring_exit(void); ++size_t blktap_ring_debug(struct blktap *, char *, size_t); ++int blktap_ring_create(struct blktap *); ++int blktap_ring_destroy(struct blktap *); ++struct blktap_request *blktap_ring_make_request(struct blktap *); ++void blktap_ring_free_request(struct blktap *,struct blktap_request *); ++void blktap_ring_submit_request(struct blktap *, struct blktap_request *); ++int blktap_ring_map_request_segment(struct blktap *, struct blktap_request *, int); ++int blktap_ring_map_request(struct blktap *, struct blktap_request *); ++void blktap_ring_unmap_request(struct blktap *, struct blktap_request *); ++void blktap_ring_set_message(struct blktap *, int); ++void blktap_ring_kick_user(struct blktap *); ++ ++int blktap_sysfs_init(void); ++void blktap_sysfs_exit(void); ++int blktap_sysfs_create(struct blktap *); ++void blktap_sysfs_destroy(struct blktap *); ++ ++int blktap_device_init(void); ++void blktap_device_exit(void); ++size_t blktap_device_debug(struct 
blktap *, char *, size_t); ++int blktap_device_create(struct blktap *, struct blktap_params *); ++int blktap_device_destroy(struct blktap *); ++void blktap_device_destroy_sync(struct blktap *); ++void blktap_device_run_queue(struct blktap *); ++void blktap_device_end_request(struct blktap *, struct blktap_request *, int); ++ ++int blktap_page_pool_init(struct kobject *); ++void blktap_page_pool_exit(void); ++struct blktap_page_pool *blktap_page_pool_get(const char *); ++ ++size_t blktap_request_debug(struct blktap *, char *, size_t); ++struct blktap_request *blktap_request_alloc(struct blktap *); ++int blktap_request_get_pages(struct blktap *, struct blktap_request *, int); ++void blktap_request_free(struct blktap *, struct blktap_request *); ++void blktap_request_bounce(struct blktap *, struct blktap_request *, int, int); ++ ++ ++#endif +diff --git a/drivers/xen/blktap/control.c b/drivers/xen/blktap/control.c +new file mode 100644 +index 0000000..f339bba +--- /dev/null ++++ b/drivers/xen/blktap/control.c +@@ -0,0 +1,315 @@ ++#include <linux/module.h> ++#include <linux/sched.h> ++#include <linux/miscdevice.h> ++#include <linux/device.h> ++#include <asm/uaccess.h> ++ ++#include "blktap.h" ++ ++DEFINE_MUTEX(blktap_lock); ++ ++struct blktap **blktaps; ++int blktap_max_minor; ++static struct blktap_page_pool *default_pool; ++ ++static struct blktap * ++blktap_control_get_minor(void) ++{ ++ int minor; ++ struct blktap *tap; ++ ++ tap = kzalloc(sizeof(*tap), GFP_KERNEL); ++ if (unlikely(!tap)) ++ return NULL; ++ ++ mutex_lock(&blktap_lock); ++ ++ for (minor = 0; minor < blktap_max_minor; minor++) ++ if (!blktaps[minor]) ++ break; ++ ++ if (minor == MAX_BLKTAP_DEVICE) ++ goto fail; ++ ++ if (minor == blktap_max_minor) { ++ void *p; ++ int n; ++ ++ n = min(2 * blktap_max_minor, MAX_BLKTAP_DEVICE); ++ p = krealloc(blktaps, n * sizeof(blktaps[0]), GFP_KERNEL); ++ if (!p) ++ goto fail; ++ ++ blktaps = p; ++ minor = blktap_max_minor; ++ blktap_max_minor = n; ++ ++ memset(&blktaps[minor], 0, (n - minor) * sizeof(blktaps[0])); ++ } ++ ++ tap->minor = minor; ++ blktaps[minor] = tap; ++ ++ __module_get(THIS_MODULE); ++out: ++ mutex_unlock(&blktap_lock); ++ return tap; ++ ++fail: ++ mutex_unlock(&blktap_lock); ++ kfree(tap); ++ tap = NULL; ++ goto out; ++} ++ ++static void ++blktap_control_put_minor(struct blktap* tap) ++{ ++ blktaps[tap->minor] = NULL; ++ kfree(tap); ++ ++ module_put(THIS_MODULE); ++} ++ ++static struct blktap* ++blktap_control_create_tap(void) ++{ ++ struct blktap *tap; ++ int err; ++ ++ tap = blktap_control_get_minor(); ++ if (!tap) ++ return NULL; ++ ++ kobject_get(&default_pool->kobj); ++ tap->pool = default_pool; ++ ++ err = blktap_ring_create(tap); ++ if (err) ++ goto fail_tap; ++ ++ err = blktap_sysfs_create(tap); ++ if (err) ++ goto fail_ring; ++ ++ return tap; ++ ++fail_ring: ++ blktap_ring_destroy(tap); ++fail_tap: ++ blktap_control_put_minor(tap); ++ ++ return NULL; ++} ++ ++int ++blktap_control_destroy_tap(struct blktap *tap) ++{ ++ int err; ++ ++ err = blktap_ring_destroy(tap); ++ if (err) ++ return err; ++ ++ kobject_put(&tap->pool->kobj); ++ ++ blktap_sysfs_destroy(tap); ++ ++ blktap_control_put_minor(tap); ++ ++ return 0; ++} ++ ++static int ++blktap_control_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg) ++{ ++ struct blktap *tap; ++ ++ switch (cmd) { ++ case BLKTAP2_IOCTL_ALLOC_TAP: { ++ struct blktap_handle h; ++ void __user *ptr = (void __user*)arg; ++ ++ tap = blktap_control_create_tap(); ++ if (!tap) ++ return -ENOMEM; ++ ++ 
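The ALLOC_TAP case resumes just below with the copy back to userspace. Seen from the consumer's side, the contract of this ioctl is small: pass in a pointer, get back the three numbers needed to locate the per-tap ring and block nodes. A minimal userspace sketch, assuming the raw ioctl numbers from blktap.h, a struct mirroring blktap_handle, and a control-node path created by udev rules; the path and naming are assumptions, not something this patch defines:

/* Hypothetical consumer of BLKTAP2_IOCTL_ALLOC_TAP (sketch only). */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

struct blktap_handle {
	unsigned int ring;	/* major of the per-tap ring device */
	unsigned int device;	/* major of the per-tap block device */
	unsigned int minor;	/* minor shared by ring and block device */
};

int main(void)
{
	struct blktap_handle h;
	/* control node path is an assumption (depends on udev rules) */
	int fd = open("/dev/xen/blktap-2/control", O_RDWR);

	if (fd < 0 || ioctl(fd, 200 /* BLKTAP2_IOCTL_ALLOC_TAP */, &h))
		return 1;

	printf("tap %u: ring dev %u:%u, block dev %u:%u\n",
	       h.minor, h.ring, h.minor, h.device, h.minor);

	/* FREE_TAP takes the minor directly as the argument */
	ioctl(fd, 201 /* BLKTAP2_IOCTL_FREE_TAP */, (unsigned long)h.minor);
	close(fd);
	return 0;
}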
++ h.ring = blktap_ring_major;
++ h.device = blktap_device_major;
++ h.minor = tap->minor;
++
++ if (copy_to_user(ptr, &h, sizeof(h))) {
++ blktap_control_destroy_tap(tap);
++ return -EFAULT;
++ }
++
++ return 0;
++ }
++
++ case BLKTAP2_IOCTL_FREE_TAP: {
++ int minor = arg;
++
++ /* bound by the allocated minor map, not MAX_BLKTAP_DEVICE */
++ if (minor < 0 || minor >= blktap_max_minor)
++ return -EINVAL;
++
++ tap = blktaps[minor];
++ if (!tap)
++ return -ENODEV;
++
++ return blktap_control_destroy_tap(tap);
++ }
++ }
++
++ return -ENOIOCTLCMD;
++}
++
++static struct file_operations blktap_control_file_operations = {
++ .owner = THIS_MODULE,
++ .ioctl = blktap_control_ioctl,
++};
++
++static struct miscdevice blktap_control = {
++ .minor = MISC_DYNAMIC_MINOR,
++ .name = "blktap-control",
++ .fops = &blktap_control_file_operations,
++};
++
++static struct device *control_device;
++
++static ssize_t
++blktap_control_show_default_pool(struct device *device,
++ struct device_attribute *attr,
++ char *buf)
++{
++ return sprintf(buf, "%s", kobject_name(&default_pool->kobj));
++}
++
++static ssize_t
++blktap_control_store_default_pool(struct device *device,
++ struct device_attribute *attr,
++ const char *buf, size_t size)
++{
++ struct blktap_page_pool *pool, *tmp = default_pool;
++
++ pool = blktap_page_pool_get(buf);
++ if (IS_ERR(pool))
++ return PTR_ERR(pool);
++
++ default_pool = pool;
++ kobject_put(&tmp->kobj);
++
++ return size;
++}
++
++static DEVICE_ATTR(default_pool, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH,
++ blktap_control_show_default_pool,
++ blktap_control_store_default_pool);
++
++size_t
++blktap_control_debug(struct blktap *tap, char *buf, size_t size)
++{
++ char *s = buf, *end = buf + size;
++
++ s += snprintf(s, end - s,
++ "tap %u:%u name:'%s' flags:%#08lx\n",
++ MAJOR(tap->ring.devno), MINOR(tap->ring.devno),
++ tap->name, tap->dev_inuse);
++
++ return s - buf;
++}
++
++static int __init
++blktap_control_init(void)
++{
++ int err;
++
++ err = misc_register(&blktap_control);
++ if (err)
++ return err;
++
++ control_device = blktap_control.this_device;
++
++ blktap_max_minor = min(64, MAX_BLKTAP_DEVICE);
++ blktaps = kzalloc(blktap_max_minor * sizeof(blktaps[0]), GFP_KERNEL);
++ if (!blktaps) {
++ BTERR("failed to allocate blktap minor map");
++ return -ENOMEM;
++ }
++
++ err = blktap_page_pool_init(&control_device->kobj);
++ if (err)
++ return err;
++
++ /* blktap_page_pool_get() returns ERR_PTR, never NULL */
++ default_pool = blktap_page_pool_get("default");
++ if (IS_ERR(default_pool)) {
++ err = PTR_ERR(default_pool);
++ default_pool = NULL;
++ return err;
++ }
++
++ err = device_create_file(control_device, &dev_attr_default_pool);
++ if (err)
++ return err;
++
++ return 0;
++}
++
++static void
++blktap_control_exit(void)
++{
++ if (default_pool) {
++ kobject_put(&default_pool->kobj);
++ default_pool = NULL;
++ }
++
++ blktap_page_pool_exit();
++
++ if (blktaps) {
++ kfree(blktaps);
++ blktaps = NULL;
++ }
++
++ if (control_device) {
++ misc_deregister(&blktap_control);
++ control_device = NULL;
++ }
++}
++
++static void
++blktap_exit(void)
++{
++ blktap_control_exit();
++ blktap_ring_exit();
++ blktap_sysfs_exit();
++ blktap_device_exit();
++}
++
++static int __init
++blktap_init(void)
++{
++ int err;
++
++ err = blktap_device_init();
++ if (err)
++ goto fail;
++
++ err = blktap_ring_init();
++ if (err)
++ goto fail;
++
++ err = blktap_sysfs_init();
++ if (err)
++ goto fail;
++
++ err = blktap_control_init();
++ if (err)
++ goto fail;
++
++ return 0;
++
++fail:
++ blktap_exit();
++ return err;
++}
++
++module_init(blktap_init);
++module_exit(blktap_exit);
++MODULE_LICENSE("Dual BSD/GPL");
+diff --git a/drivers/xen/blktap/device.c b/drivers/xen/blktap/device.c
+new file mode 
100644 +index 0000000..fce2769 +--- /dev/null ++++ b/drivers/xen/blktap/device.c +@@ -0,0 +1,564 @@ ++#include <linux/fs.h> ++#include <linux/blkdev.h> ++#include <linux/cdrom.h> ++#include <linux/hdreg.h> ++#include <scsi/scsi.h> ++#include <scsi/scsi_ioctl.h> ++ ++#include "blktap.h" ++ ++int blktap_device_major; ++ ++#define dev_to_blktap(_dev) container_of(_dev, struct blktap, device) ++ ++static int ++blktap_device_open(struct block_device *bdev, fmode_t mode) ++{ ++ struct gendisk *disk = bdev->bd_disk; ++ struct blktap_device *tapdev = disk->private_data; ++ ++ if (!tapdev) ++ return -ENXIO; ++ ++ /* NB. we might have bounced a bd trylock by tapdisk. when ++ * failing for reasons not !tapdev, make sure to kick tapdisk ++ * out of destroy wait state again. */ ++ ++ return 0; ++} ++ ++static int ++blktap_device_release(struct gendisk *disk, fmode_t mode) ++{ ++ struct blktap_device *tapdev = disk->private_data; ++ struct block_device *bdev = bdget_disk(disk, 0); ++ struct blktap *tap = dev_to_blktap(tapdev); ++ ++ bdput(bdev); ++ ++ if (!bdev->bd_openers) { ++ set_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse); ++ blktap_ring_kick_user(tap); ++ } ++ ++ return 0; ++} ++ ++static int ++blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg) ++{ ++ /* We don't have real geometry info, but let's at least return ++ values consistent with the size of the device */ ++ sector_t nsect = get_capacity(bd->bd_disk); ++ sector_t cylinders = nsect; ++ ++ hg->heads = 0xff; ++ hg->sectors = 0x3f; ++ sector_div(cylinders, hg->heads * hg->sectors); ++ hg->cylinders = cylinders; ++ if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect) ++ hg->cylinders = 0xffff; ++ return 0; ++} ++ ++static int ++blktap_device_ioctl(struct block_device *bd, fmode_t mode, ++ unsigned command, unsigned long argument) ++{ ++ int i; ++ ++ switch (command) { ++ case CDROMMULTISESSION: ++ BTDBG("FIXME: support multisession CDs later\n"); ++ for (i = 0; i < sizeof(struct cdrom_multisession); i++) ++ if (put_user(0, (char __user *)(argument + i))) ++ return -EFAULT; ++ return 0; ++ ++ case SCSI_IOCTL_GET_IDLUN: ++ if (!access_ok(VERIFY_WRITE, argument, ++ sizeof(struct scsi_idlun))) ++ return -EFAULT; ++ ++ /* return 0 for now. */ ++ __put_user(0, &((struct scsi_idlun __user *)argument)->dev_id); ++ __put_user(0, ++ &((struct scsi_idlun __user *)argument)->host_unique_id); ++ return 0; ++ ++ default: ++ /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", ++ command);*/ ++ return -EINVAL; /* same return as native Linux */ ++ } ++ ++ return 0; ++} ++ ++static struct block_device_operations blktap_device_file_operations = { ++ .owner = THIS_MODULE, ++ .open = blktap_device_open, ++ .release = blktap_device_release, ++ .ioctl = blktap_device_ioctl, ++ .getgeo = blktap_device_getgeo ++}; ++ ++/* NB. __blktap holding the queue lock; blktap where unlocked */ ++ ++static inline struct request* ++__blktap_next_queued_rq(struct request_queue *q) ++{ ++ return blk_peek_request(q); ++} ++ ++static inline void ++__blktap_dequeue_rq(struct request *rq) ++{ ++ blk_start_request(rq); ++} ++ ++/* NB. 
err == 0 indicates success, failures < 0 */ ++ ++static inline void ++__blktap_end_queued_rq(struct request *rq, int err) ++{ ++ blk_start_request(rq); ++ __blk_end_request(rq, err, blk_rq_bytes(rq)); ++} ++ ++static inline void ++__blktap_end_rq(struct request *rq, int err) ++{ ++ __blk_end_request(rq, err, blk_rq_bytes(rq)); ++} ++ ++static inline void ++blktap_end_rq(struct request *rq, int err) ++{ ++ spin_lock_irq(rq->q->queue_lock); ++ __blktap_end_rq(rq, err); ++ spin_unlock_irq(rq->q->queue_lock); ++} ++ ++void ++blktap_device_end_request(struct blktap *tap, ++ struct blktap_request *request, ++ int error) ++{ ++ struct blktap_device *tapdev = &tap->device; ++ struct request *rq = request->rq; ++ ++ blktap_ring_unmap_request(tap, request); ++ ++ blktap_ring_free_request(tap, request); ++ ++ dev_dbg(disk_to_dev(tapdev->gd), ++ "end_request: op=%d error=%d bytes=%d\n", ++ rq_data_dir(rq), error, blk_rq_bytes(rq)); ++ ++ blktap_end_rq(rq, error); ++} ++ ++int ++blktap_device_make_request(struct blktap *tap, struct request *rq) ++{ ++ struct blktap_device *tapdev = &tap->device; ++ struct blktap_request *request; ++ int write, nsegs; ++ int err; ++ ++ request = blktap_ring_make_request(tap); ++ if (IS_ERR(request)) { ++ err = PTR_ERR(request); ++ request = NULL; ++ ++ if (err == -ENOSPC || err == -ENOMEM) ++ goto stop; ++ ++ goto fail; ++ } ++ ++ write = rq_data_dir(rq) == WRITE; ++ nsegs = blk_rq_map_sg(rq->q, rq, request->sg_table); ++ ++ dev_dbg(disk_to_dev(tapdev->gd), ++ "make_request: op=%c bytes=%d nsegs=%d\n", ++ write ? 'w' : 'r', blk_rq_bytes(rq), nsegs); ++ ++ request->rq = rq; ++ request->operation = write ? BLKIF_OP_WRITE : BLKIF_OP_READ; ++ ++ err = blktap_request_get_pages(tap, request, nsegs); ++ if (err) ++ goto stop; ++ ++ err = blktap_ring_map_request(tap, request); ++ if (err) ++ goto fail; ++ ++ blktap_ring_submit_request(tap, request); ++ ++ return 0; ++ ++stop: ++ tap->stats.st_oo_req++; ++ err = -EBUSY; ++ ++_out: ++ if (request) ++ blktap_ring_free_request(tap, request); ++ ++ return err; ++fail: ++ if (printk_ratelimit()) ++ dev_warn(disk_to_dev(tapdev->gd), ++ "make request: %d, failing\n", err); ++ goto _out; ++} ++ ++/* ++ * called from tapdisk context ++ */ ++void ++blktap_device_run_queue(struct blktap *tap) ++{ ++ struct blktap_device *tapdev = &tap->device; ++ struct request_queue *q; ++ struct request *rq; ++ int err; ++ ++ if (!tapdev->gd) ++ return; ++ ++ q = tapdev->gd->queue; ++ ++ spin_lock_irq(&tapdev->lock); ++ queue_flag_clear(QUEUE_FLAG_STOPPED, q); ++ ++ do { ++ rq = __blktap_next_queued_rq(q); ++ if (!rq) ++ break; ++ ++ if (!blk_fs_request(rq)) { ++ __blktap_end_queued_rq(rq, -EOPNOTSUPP); ++ continue; ++ } ++ ++ spin_unlock_irq(&tapdev->lock); ++ ++ err = blktap_device_make_request(tap, rq); ++ ++ spin_lock_irq(&tapdev->lock); ++ ++ if (err == -EBUSY) { ++ blk_stop_queue(q); ++ break; ++ } ++ ++ __blktap_dequeue_rq(rq); ++ ++ if (unlikely(err)) ++ __blktap_end_rq(rq, err); ++ } while (1); ++ ++ spin_unlock_irq(&tapdev->lock); ++} ++ ++static void ++blktap_device_do_request(struct request_queue *rq) ++{ ++ struct blktap_device *tapdev = rq->queuedata; ++ struct blktap *tap = dev_to_blktap(tapdev); ++ ++ blktap_ring_kick_user(tap); ++} ++ ++static void ++blktap_device_configure(struct blktap *tap, ++ struct blktap_params *params) ++{ ++ struct request_queue *rq; ++ struct blktap_device *dev = &tap->device; ++ ++ dev = &tap->device; ++ rq = dev->gd->queue; ++ ++ spin_lock_irq(&dev->lock); ++ ++ set_capacity(dev->gd, params->capacity); ++ ++ 
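The queue limits applied just below encode a single invariant: anything the block layer is allowed to merge must still fit in one blkif ring slot. A back-of-the-envelope sketch of that arithmetic, assuming 4096-byte pages and the usual 11-segment BLKIF_MAX_SEGMENTS_PER_REQUEST from the Xen block ABI (both values are assumptions about the common configuration, not defined by this hunk):

/* Sketch only: upper bound on one blktap request under these limits. */
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096	/* assumed PAGE_SIZE */
#define SKETCH_MAX_SEGS  11	/* assumed BLKIF_MAX_SEGMENTS_PER_REQUEST */

int main(void)
{
	/* segment_boundary = PAGE_SIZE - 1 and max_segment_size = PAGE_SIZE
	 * confine every scatterlist entry to a single page, so a request
	 * occupies at most SKETCH_MAX_SEGS pages: */
	unsigned bytes = SKETCH_MAX_SEGS * SKETCH_PAGE_SIZE;

	printf("max %u bytes (%u sectors) per request\n", bytes, bytes / 512);
	/* -> 45056 bytes (88 sectors), well under the 512-sector cap from
	 *    blk_queue_max_sectors(), so the segment count is what binds. */
	return 0;
}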
/* Hard sector size and max sectors impersonate the equiv. hardware. */
++ blk_queue_logical_block_size(rq, params->sector_size);
++ blk_queue_max_sectors(rq, 512);
++
++ /* Each segment in a request is up to an aligned page in size. */
++ blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
++ blk_queue_max_segment_size(rq, PAGE_SIZE);
++
++ /* Ensure a merged request will fit in a single I/O ring slot. */
++ blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++ blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++
++ /* Make sure buffer addresses are sector-aligned. */
++ blk_queue_dma_alignment(rq, 511);
++
++ /* We are reordering, but cacheless. */
++ blk_queue_ordered(rq, QUEUE_ORDERED_DRAIN, NULL);
++
++ spin_unlock_irq(&dev->lock);
++}
++
++static int
++blktap_device_validate_params(struct blktap *tap,
++ struct blktap_params *params)
++{
++ struct device *dev = tap->ring.dev;
++ int sector_order, name_sz;
++
++ /* compute up front: the fail path below uses name_sz */
++ name_sz = min(sizeof(params->name), sizeof(tap->name));
++
++ sector_order = ffs(params->sector_size) - 1;
++
++ if (sector_order < 9 ||
++ sector_order > 12 ||
++ params->sector_size != 1U<<sector_order)
++ goto fail;
++
++ if (!params->capacity ||
++ (params->capacity > ULLONG_MAX >> sector_order))
++ goto fail;
++
++ if (strnlen(params->name, name_sz) >= name_sz)
++ goto fail;
++
++ return 0;
++
++fail:
++ params->name[name_sz-1] = 0;
++ dev_err(dev, "capacity: %llu, sector-size: %lu, name: %s\n",
++ params->capacity, params->sector_size, params->name);
++ return -EINVAL;
++}
++
++int
++blktap_device_destroy(struct blktap *tap)
++{
++ struct blktap_device *tapdev = &tap->device;
++ struct block_device *bdev;
++ struct gendisk *gd;
++ int err;
++
++ gd = tapdev->gd;
++ if (!gd)
++ return 0;
++
++ bdev = bdget_disk(gd, 0);
++
++ err = !mutex_trylock(&bdev->bd_mutex);
++ if (err) {
++ /* NB. avoid a deadlock. the last opener syncs the
++ * bdev holding bd_mutex. 
*/ ++ err = -EBUSY; ++ goto out_nolock; ++ } ++ ++ if (bdev->bd_openers) { ++ err = -EBUSY; ++ goto out; ++ } ++ ++ del_gendisk(gd); ++ gd->private_data = NULL; ++ ++ blk_cleanup_queue(gd->queue); ++ ++ put_disk(gd); ++ tapdev->gd = NULL; ++ ++ clear_bit(BLKTAP_DEVICE, &tap->dev_inuse); ++ err = 0; ++out: ++ mutex_unlock(&bdev->bd_mutex); ++out_nolock: ++ bdput(bdev); ++ ++ return err; ++} ++ ++static void ++blktap_device_fail_queue(struct blktap *tap) ++{ ++ struct blktap_device *tapdev = &tap->device; ++ struct request_queue *q = tapdev->gd->queue; ++ ++ spin_lock_irq(&tapdev->lock); ++ queue_flag_clear(QUEUE_FLAG_STOPPED, q); ++ ++ do { ++ struct request *rq = __blktap_next_queued_rq(q); ++ if (!rq) ++ break; ++ ++ __blktap_end_queued_rq(rq, -EIO); ++ } while (1); ++ ++ spin_unlock_irq(&tapdev->lock); ++} ++ ++static int ++blktap_device_try_destroy(struct blktap *tap) ++{ ++ int err; ++ ++ err = blktap_device_destroy(tap); ++ if (err) ++ blktap_device_fail_queue(tap); ++ ++ return err; ++} ++ ++void ++blktap_device_destroy_sync(struct blktap *tap) ++{ ++ wait_event(tap->ring.poll_wait, ++ !blktap_device_try_destroy(tap)); ++} ++ ++int ++blktap_device_create(struct blktap *tap, struct blktap_params *params) ++{ ++ int minor, err; ++ struct gendisk *gd; ++ struct request_queue *rq; ++ struct blktap_device *tapdev; ++ ++ gd = NULL; ++ rq = NULL; ++ tapdev = &tap->device; ++ minor = tap->minor; ++ ++ if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse)) ++ return -EEXIST; ++ ++ if (blktap_device_validate_params(tap, params)) ++ return -EINVAL; ++ ++ gd = alloc_disk(1); ++ if (!gd) { ++ err = -ENOMEM; ++ goto fail; ++ } ++ ++ if (minor < 26) { ++ sprintf(gd->disk_name, "td%c", 'a' + minor % 26); ++ } else if (minor < (26 + 1) * 26) { ++ sprintf(gd->disk_name, "td%c%c", ++ 'a' + minor / 26 - 1,'a' + minor % 26); ++ } else { ++ const unsigned int m1 = (minor / 26 - 1) / 26 - 1; ++ const unsigned int m2 = (minor / 26 - 1) % 26; ++ const unsigned int m3 = minor % 26; ++ sprintf(gd->disk_name, "td%c%c%c", ++ 'a' + m1, 'a' + m2, 'a' + m3); ++ } ++ ++ gd->major = blktap_device_major; ++ gd->first_minor = minor; ++ gd->fops = &blktap_device_file_operations; ++ gd->private_data = tapdev; ++ ++ spin_lock_init(&tapdev->lock); ++ rq = blk_init_queue(blktap_device_do_request, &tapdev->lock); ++ if (!rq) { ++ err = -ENOMEM; ++ goto fail; ++ } ++ elevator_init(rq, "noop"); ++ ++ gd->queue = rq; ++ rq->queuedata = tapdev; ++ tapdev->gd = gd; ++ ++ blktap_device_configure(tap, params); ++ add_disk(gd); ++ ++ if (params->name[0]) ++ strncpy(tap->name, params->name, sizeof(tap->name)-1); ++ ++ set_bit(BLKTAP_DEVICE, &tap->dev_inuse); ++ ++ dev_info(disk_to_dev(gd), "sector-size: %u capacity: %llu\n", ++ queue_logical_block_size(rq), ++ (unsigned long long)get_capacity(gd)); ++ ++ return 0; ++ ++fail: ++ if (gd) ++ del_gendisk(gd); ++ if (rq) ++ blk_cleanup_queue(rq); ++ ++ return err; ++} ++ ++size_t ++blktap_device_debug(struct blktap *tap, char *buf, size_t size) ++{ ++ struct gendisk *disk = tap->device.gd; ++ struct request_queue *q; ++ struct block_device *bdev; ++ char *s = buf, *end = buf + size; ++ ++ if (!disk) ++ return 0; ++ ++ q = disk->queue; ++ ++ s += snprintf(s, end - s, ++ "disk capacity:%llu sector size:%u\n", ++ (unsigned long long)get_capacity(disk), ++ queue_logical_block_size(q)); ++ ++ s += snprintf(s, end - s, ++ "queue flags:%#lx plugged:%d stopped:%d empty:%d\n", ++ q->queue_flags, ++ blk_queue_plugged(q), blk_queue_stopped(q), ++ elv_queue_empty(q)); ++ ++ bdev = bdget_disk(disk, 0); ++ if 
(bdev) { ++ s += snprintf(s, end - s, ++ "bdev openers:%d closed:%d\n", ++ bdev->bd_openers, ++ test_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse)); ++ bdput(bdev); ++ } ++ ++ return s - buf; ++} ++ ++int __init ++blktap_device_init() ++{ ++ int major; ++ ++ /* Dynamically allocate a major for this device */ ++ major = register_blkdev(0, "tapdev"); ++ if (major < 0) { ++ BTERR("Couldn't register blktap device\n"); ++ return -ENOMEM; ++ } ++ ++ blktap_device_major = major; ++ BTINFO("blktap device major %d\n", major); ++ ++ return 0; ++} ++ ++void ++blktap_device_exit(void) ++{ ++ if (blktap_device_major) ++ unregister_blkdev(blktap_device_major, "tapdev"); ++} +diff --git a/drivers/xen/blktap/request.c b/drivers/xen/blktap/request.c +new file mode 100644 +index 0000000..9bef48c +--- /dev/null ++++ b/drivers/xen/blktap/request.c +@@ -0,0 +1,418 @@ ++#include <linux/mempool.h> ++#include <linux/spinlock.h> ++#include <linux/mutex.h> ++#include <linux/sched.h> ++#include <linux/device.h> ++ ++#include "blktap.h" ++ ++/* max pages per shared pool. just to prevent accidental dos. */ ++#define POOL_MAX_PAGES (256*BLKIF_MAX_SEGMENTS_PER_REQUEST) ++ ++/* default page pool size. when considering to shrink a shared pool, ++ * note that paused tapdisks may grab a whole lot of pages for a long ++ * time. */ ++#define POOL_DEFAULT_PAGES (2 * MMAP_PAGES) ++ ++/* max number of pages allocatable per request. */ ++#define POOL_MAX_REQUEST_PAGES BLKIF_MAX_SEGMENTS_PER_REQUEST ++ ++/* min request structs per pool. These grow dynamically. */ ++#define POOL_MIN_REQS BLK_RING_SIZE ++ ++static struct kset *pool_set; ++ ++#define kobj_to_pool(_kobj) \ ++ container_of(_kobj, struct blktap_page_pool, kobj) ++ ++static struct kmem_cache *request_cache; ++static mempool_t *request_pool; ++ ++static void ++__page_pool_wake(struct blktap_page_pool *pool) ++{ ++ mempool_t *mem = pool->bufs; ++ ++ /* ++ NB. slightly wasteful to always wait for a full segment ++ set. but this ensures the next disk makes ++ progress. presently, the repeated request struct ++ alloc/release cycles would otherwise keep everyone spinning. ++ */ ++ ++ if (mem->curr_nr >= POOL_MAX_REQUEST_PAGES) ++ wake_up(&pool->wait); ++} ++ ++int ++blktap_request_get_pages(struct blktap *tap, ++ struct blktap_request *request, int nr_pages) ++{ ++ struct blktap_page_pool *pool = tap->pool; ++ mempool_t *mem = pool->bufs; ++ struct page *page; ++ ++ BUG_ON(request->nr_pages != 0); ++ BUG_ON(nr_pages > POOL_MAX_REQUEST_PAGES); ++ ++ if (mem->curr_nr < nr_pages) ++ return -ENOMEM; ++ ++ /* NB. avoid thundering herds of tapdisks colliding. 
*/ ++ spin_lock(&pool->lock); ++ ++ if (mem->curr_nr < nr_pages) { ++ spin_unlock(&pool->lock); ++ return -ENOMEM; ++ } ++ ++ while (request->nr_pages < nr_pages) { ++ page = mempool_alloc(mem, GFP_NOWAIT); ++ BUG_ON(!page); ++ request->pages[request->nr_pages++] = page; ++ } ++ ++ spin_unlock(&pool->lock); ++ ++ return 0; ++} ++ ++static void ++blktap_request_put_pages(struct blktap *tap, ++ struct blktap_request *request) ++{ ++ struct blktap_page_pool *pool = tap->pool; ++ struct page *page; ++ ++ while (request->nr_pages) { ++ page = request->pages[--request->nr_pages]; ++ mempool_free(page, pool->bufs); ++ } ++} ++ ++size_t ++blktap_request_debug(struct blktap *tap, char *buf, size_t size) ++{ ++ struct blktap_page_pool *pool = tap->pool; ++ mempool_t *mem = pool->bufs; ++ char *s = buf, *end = buf + size; ++ ++ s += snprintf(buf, end - s, ++ "pool:%s pages:%d free:%d\n", ++ kobject_name(&pool->kobj), ++ mem->min_nr, mem->curr_nr); ++ ++ return s - buf; ++} ++ ++struct blktap_request* ++blktap_request_alloc(struct blktap *tap) ++{ ++ struct blktap_request *request; ++ ++ request = mempool_alloc(request_pool, GFP_NOWAIT); ++ if (request) ++ request->tap = tap; ++ ++ return request; ++} ++ ++void ++blktap_request_free(struct blktap *tap, ++ struct blktap_request *request) ++{ ++ blktap_request_put_pages(tap, request); ++ ++ mempool_free(request, request_pool); ++ ++ __page_pool_wake(tap->pool); ++} ++ ++void ++blktap_request_bounce(struct blktap *tap, ++ struct blktap_request *request, ++ int seg, int write) ++{ ++ struct scatterlist *sg = &request->sg_table[seg]; ++ void *s, *p; ++ ++ BUG_ON(seg >= request->nr_pages); ++ ++ s = sg_virt(sg); ++ p = page_address(request->pages[seg]) + sg->offset; ++ ++ if (write) ++ memcpy(p, s, sg->length); ++ else ++ memcpy(s, p, sg->length); ++} ++ ++static void ++blktap_request_ctor(void *obj) ++{ ++ struct blktap_request *request = obj; ++ ++ memset(request, 0, sizeof(*request)); ++ sg_init_table(request->sg_table, ARRAY_SIZE(request->sg_table)); ++} ++ ++static int ++blktap_page_pool_resize(struct blktap_page_pool *pool, int target) ++{ ++ mempool_t *bufs = pool->bufs; ++ int err; ++ ++ /* NB. mempool asserts min_nr >= 1 */ ++ target = max(1, target); ++ ++ err = mempool_resize(bufs, target, GFP_KERNEL); ++ if (err) ++ return err; ++ ++ __page_pool_wake(pool); ++ ++ return 0; ++} ++ ++struct pool_attribute { ++ struct attribute attr; ++ ++ ssize_t (*show)(struct blktap_page_pool *pool, ++ char *buf); ++ ++ ssize_t (*store)(struct blktap_page_pool *pool, ++ const char *buf, size_t count); ++}; ++ ++#define kattr_to_pool_attr(_kattr) \ ++ container_of(_kattr, struct pool_attribute, attr) ++ ++static ssize_t ++blktap_page_pool_show_size(struct blktap_page_pool *pool, ++ char *buf) ++{ ++ mempool_t *mem = pool->bufs; ++ return sprintf(buf, "%d", mem->min_nr); ++} ++ ++static ssize_t ++blktap_page_pool_store_size(struct blktap_page_pool *pool, ++ const char *buf, size_t size) ++{ ++ int target; ++ ++ /* ++ * NB. target fixup to avoid undesired results. less than a ++ * full segment set can wedge the disk. much more than a ++ * couple times the physical queue depth is rarely useful. ++ */ ++ ++ target = simple_strtoul(buf, NULL, 0); ++ target = max(POOL_MAX_REQUEST_PAGES, target); ++ target = min(target, POOL_MAX_PAGES); ++ ++ return blktap_page_pool_resize(pool, target) ? 
: size;
++}
++
++static struct pool_attribute blktap_page_pool_attr_size =
++ __ATTR(size, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH,
++ blktap_page_pool_show_size,
++ blktap_page_pool_store_size);
++
++static ssize_t
++blktap_page_pool_show_free(struct blktap_page_pool *pool,
++ char *buf)
++{
++ mempool_t *mem = pool->bufs;
++ return sprintf(buf, "%d", mem->curr_nr);
++}
++
++static struct pool_attribute blktap_page_pool_attr_free =
++ __ATTR(free, S_IRUSR|S_IRGRP|S_IROTH,
++ blktap_page_pool_show_free,
++ NULL);
++
++static struct attribute *blktap_page_pool_attrs[] = {
++ &blktap_page_pool_attr_size.attr,
++ &blktap_page_pool_attr_free.attr,
++ NULL,
++};
++
++static inline struct kobject*
++__blktap_kset_find_obj(struct kset *kset, const char *name)
++{
++ struct kobject *k;
++ struct kobject *ret = NULL;
++
++ spin_lock(&kset->list_lock);
++ list_for_each_entry(k, &kset->list, entry) {
++ if (kobject_name(k) && !strcmp(kobject_name(k), name)) {
++ ret = kobject_get(k);
++ break;
++ }
++ }
++ spin_unlock(&kset->list_lock);
++ return ret;
++}
++
++static ssize_t
++blktap_page_pool_show_attr(struct kobject *kobj, struct attribute *kattr,
++ char *buf)
++{
++ struct blktap_page_pool *pool = kobj_to_pool(kobj);
++ struct pool_attribute *attr = kattr_to_pool_attr(kattr);
++
++ if (attr->show)
++ return attr->show(pool, buf);
++
++ return -EIO;
++}
++
++static ssize_t
++blktap_page_pool_store_attr(struct kobject *kobj, struct attribute *kattr,
++ const char *buf, size_t size)
++{
++ struct blktap_page_pool *pool = kobj_to_pool(kobj);
++ struct pool_attribute *attr = kattr_to_pool_attr(kattr);
++
++ if (attr->store)
++ return attr->store(pool, buf, size);
++
++ return -EIO;
++}
++
++static struct sysfs_ops blktap_page_pool_sysfs_ops = {
++ .show = blktap_page_pool_show_attr,
++ .store = blktap_page_pool_store_attr,
++};
++
++static void
++blktap_page_pool_release(struct kobject *kobj)
++{
++ struct blktap_page_pool *pool = kobj_to_pool(kobj);
++ mempool_destroy(pool->bufs);
++ kfree(pool);
++}
++
++struct kobj_type blktap_page_pool_ktype = {
++ .release = blktap_page_pool_release,
++ .sysfs_ops = &blktap_page_pool_sysfs_ops,
++ .default_attrs = blktap_page_pool_attrs,
++};
++
++static void*
++__mempool_page_alloc(gfp_t gfp_mask, void *pool_data)
++{
++ struct page *page;
++
++ if (!(gfp_mask & __GFP_WAIT))
++ return NULL;
++
++ page = alloc_page(gfp_mask);
++ if (page)
++ SetPageReserved(page);
++
++ return page;
++}
++
++static void
++__mempool_page_free(void *element, void *pool_data)
++{
++ struct page *page = element;
++
++ ClearPageReserved(page);
++ put_page(page);
++}
++
++static struct kobject*
++blktap_page_pool_create(const char *name, int nr_pages)
++{
++ struct blktap_page_pool *pool;
++ int err;
++
++ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
++ if (!pool)
++ goto fail;
++
++ spin_lock_init(&pool->lock);
++ init_waitqueue_head(&pool->wait);
++
++ pool->bufs = mempool_create(nr_pages,
++ __mempool_page_alloc, __mempool_page_free,
++ pool);
++ if (!pool->bufs)
++ goto fail_pool;
++
++ kobject_init(&pool->kobj, &blktap_page_pool_ktype);
++ pool->kobj.kset = pool_set;
++ err = kobject_add(&pool->kobj, &pool_set->kobj, "%s", name);
++ if (err)
++ goto fail_bufs;
++
++ return &pool->kobj;
++
++fail_bufs:
++ mempool_destroy(pool->bufs);
++fail_pool:
++ kfree(pool);
++fail:
++ return NULL;
++}
++
++struct blktap_page_pool*
++blktap_page_pool_get(const char *name)
++{
++ struct kobject *kobj;
++
++ kobj = __blktap_kset_find_obj(pool_set, name);
++ if (!kobj)
++ kobj = 
blktap_page_pool_create(name, ++ POOL_DEFAULT_PAGES); ++ if (!kobj) ++ return ERR_PTR(-ENOMEM); ++ ++ return kobj_to_pool(kobj); ++} ++ ++int __init ++blktap_page_pool_init(struct kobject *parent) ++{ ++ request_cache = ++ kmem_cache_create("blktap-request", ++ sizeof(struct blktap_request), 0, ++ 0, blktap_request_ctor); ++ if (!request_cache) ++ return -ENOMEM; ++ ++ request_pool = ++ mempool_create_slab_pool(POOL_MIN_REQS, request_cache); ++ if (!request_pool) ++ return -ENOMEM; ++ ++ pool_set = kset_create_and_add("pools", NULL, parent); ++ if (!pool_set) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++void ++blktap_page_pool_exit(void) ++{ ++ if (pool_set) { ++ BUG_ON(!list_empty(&pool_set->list)); ++ kset_unregister(pool_set); ++ pool_set = NULL; ++ } ++ ++ if (request_pool) { ++ mempool_destroy(request_pool); ++ request_pool = NULL; ++ } ++ ++ if (request_cache) { ++ kmem_cache_destroy(request_cache); ++ request_cache = NULL; ++ } ++} +diff --git a/drivers/xen/blktap/ring.c b/drivers/xen/blktap/ring.c +new file mode 100644 +index 0000000..6b86be5 +--- /dev/null ++++ b/drivers/xen/blktap/ring.c +@@ -0,0 +1,550 @@ ++ ++#include <linux/device.h> ++#include <linux/signal.h> ++#include <linux/sched.h> ++#include <linux/poll.h> ++#include <linux/blkdev.h> ++ ++#include "blktap.h" ++ ++int blktap_ring_major; ++static struct cdev blktap_ring_cdev; ++ ++ /* ++ * BLKTAP - immediately before the mmap area, ++ * we have a bunch of pages reserved for shared memory rings. ++ */ ++#define RING_PAGES 1 ++ ++static void ++blktap_ring_read_response(struct blktap *tap, ++ const struct blkif_response *rsp) ++{ ++ struct blktap_ring *ring = &tap->ring; ++ struct blktap_request *request; ++ int usr_idx, err; ++ ++ request = NULL; ++ ++ usr_idx = rsp->id; ++ if (usr_idx < 0 || usr_idx >= MAX_PENDING_REQS) { ++ err = -ERANGE; ++ goto invalid; ++ } ++ ++ request = ring->pending[usr_idx]; ++ ++ if (!request) { ++ err = -ESRCH; ++ goto invalid; ++ } ++ ++ if (rsp->operation != request->operation) { ++ err = -EINVAL; ++ goto invalid; ++ } ++ ++ dev_dbg(ring->dev, ++ "request %d [%p] response: %d\n", ++ request->usr_idx, request, rsp->status); ++ ++ err = rsp->status == BLKIF_RSP_OKAY ? 
0 : -EIO;
++end_request:
++ blktap_device_end_request(tap, request, err);
++ return;
++
++invalid:
++ dev_warn(ring->dev,
++ "invalid response, idx:%d status:%d op:%d/%d: err %d\n",
++ usr_idx, rsp->status,
++ rsp->operation, request ? request->operation : -1,
++ err);
++ if (request)
++ goto end_request;
++}
++
++static void
++blktap_read_ring(struct blktap *tap)
++{
++ struct blktap_ring *ring = &tap->ring;
++ struct blkif_response rsp;
++ RING_IDX rc, rp;
++
++ down_read(&current->mm->mmap_sem);
++ if (!ring->vma) {
++ up_read(&current->mm->mmap_sem);
++ return;
++ }
++
++ /* for each outstanding message on the ring */
++ rp = ring->ring.sring->rsp_prod;
++ rmb();
++
++ for (rc = ring->ring.rsp_cons; rc != rp; rc++) {
++ memcpy(&rsp, RING_GET_RESPONSE(&ring->ring, rc), sizeof(rsp));
++ blktap_ring_read_response(tap, &rsp);
++ }
++
++ ring->ring.rsp_cons = rc;
++
++ up_read(&current->mm->mmap_sem);
++}
++
++static int blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
++{
++ return VM_FAULT_SIGBUS;
++}
++
++static void
++blktap_ring_fail_pending(struct blktap *tap)
++{
++ struct blktap_ring *ring = &tap->ring;
++ struct blktap_request *request;
++ int usr_idx;
++
++ for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
++ request = ring->pending[usr_idx];
++ if (!request)
++ continue;
++
++ blktap_device_end_request(tap, request, -EIO);
++ }
++}
++
++static void
++blktap_ring_vm_close(struct vm_area_struct *vma)
++{
++ struct blktap *tap = vma->vm_private_data;
++ struct blktap_ring *ring = &tap->ring;
++ struct page *page = virt_to_page(ring->ring.sring);
++
++ blktap_ring_fail_pending(tap);
++
++ zap_page_range(vma, vma->vm_start, PAGE_SIZE, NULL);
++ ClearPageReserved(page);
++ __free_page(page);
++
++ ring->vma = NULL;
++
++ if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++ blktap_control_destroy_tap(tap);
++}
++
++static struct vm_operations_struct blktap_ring_vm_operations = {
++ .close = blktap_ring_vm_close,
++ .fault = blktap_ring_fault,
++};
++
++int
++blktap_ring_map_segment(struct blktap *tap,
++ struct blktap_request *request,
++ int seg)
++{
++ struct blktap_ring *ring = &tap->ring;
++ unsigned long uaddr;
++
++ uaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg);
++ return vm_insert_page(ring->vma, uaddr, request->pages[seg]);
++}
++
++int
++blktap_ring_map_request(struct blktap *tap,
++ struct blktap_request *request)
++{
++ int seg, err = 0;
++ int write;
++
++ write = request->operation == BLKIF_OP_WRITE;
++
++ for (seg = 0; seg < request->nr_pages; seg++) {
++ if (write)
++ blktap_request_bounce(tap, request, seg, write);
++
++ err = blktap_ring_map_segment(tap, request, seg);
++ if (err)
++ break;
++ }
++
++ if (err)
++ blktap_ring_unmap_request(tap, request);
++
++ return err;
++}
++
++void
++blktap_ring_unmap_request(struct blktap *tap,
++ struct blktap_request *request)
++{
++ struct blktap_ring *ring = &tap->ring;
++ unsigned long uaddr;
++ unsigned size;
++ int seg, read;
++
++ uaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, 0);
++ size = request->nr_pages << PAGE_SHIFT;
++ read = request->operation == BLKIF_OP_READ;
++
++ if (read)
++ for (seg = 0; seg < request->nr_pages; seg++)
++ blktap_request_bounce(tap, request, seg, !read);
++
++ zap_page_range(ring->vma, uaddr, size, NULL);
++}
++
++void
++blktap_ring_free_request(struct blktap *tap,
++ struct blktap_request *request)
++{
++ struct blktap_ring *ring = &tap->ring;
++
++ ring->pending[request->usr_idx] = NULL;
++ ring->n_pending--;
++
++ blktap_request_free(tap, request);
++}
++
++struct 
++void
++blktap_ring_free_request(struct blktap *tap,
++			 struct blktap_request *request)
++{
++	struct blktap_ring *ring = &tap->ring;
++
++	ring->pending[request->usr_idx] = NULL;
++	ring->n_pending--;
++
++	blktap_request_free(tap, request);
++}
++
++struct blktap_request*
++blktap_ring_make_request(struct blktap *tap)
++{
++	struct blktap_ring *ring = &tap->ring;
++	struct blktap_request *request;
++	int usr_idx;
++
++	if (RING_FULL(&ring->ring))
++		return ERR_PTR(-ENOSPC);
++
++	request = blktap_request_alloc(tap);
++	if (!request)
++		return ERR_PTR(-ENOMEM);
++
++	for (usr_idx = 0; usr_idx < BLK_RING_SIZE; usr_idx++)
++		if (!ring->pending[usr_idx])
++			break;
++
++	BUG_ON(usr_idx >= BLK_RING_SIZE);
++
++	request->tap = tap;
++	request->usr_idx = usr_idx;
++
++	ring->pending[usr_idx] = request;
++	ring->n_pending++;
++
++	return request;
++}
++
++void
++blktap_ring_submit_request(struct blktap *tap,
++			   struct blktap_request *request)
++{
++	struct blktap_ring *ring = &tap->ring;
++	struct blkif_request *breq;
++	struct scatterlist *sg;
++	int i, nsecs = 0;
++
++	dev_dbg(ring->dev,
++		"request %d [%p] submit\n", request->usr_idx, request);
++
++	breq = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
++
++	breq->id = request->usr_idx;
++	breq->sector_number = blk_rq_pos(request->rq);
++	breq->handle = 0;
++	breq->operation = request->operation;
++	breq->nr_segments = request->nr_pages;
++
++	blktap_for_each_sg(sg, request, i) {
++		struct blkif_request_segment *seg = &breq->seg[i];
++		int first, count;
++
++		count = sg->length >> 9;
++		first = sg->offset >> 9;
++
++		seg->first_sect = first;
++		seg->last_sect = first + count - 1;
++
++		nsecs += count;
++	}
++
++	ring->ring.req_prod_pvt++;
++
++	do_gettimeofday(&request->time);
++
++	if (request->operation == BLKIF_OP_WRITE) {
++		tap->stats.st_wr_sect += nsecs;
++		tap->stats.st_wr_req++;
++	}
++
++	if (request->operation == BLKIF_OP_READ) {
++		tap->stats.st_rd_sect += nsecs;
++		tap->stats.st_rd_req++;
++	}
++}
++
++static int
++blktap_ring_open(struct inode *inode, struct file *filp)
++{
++	struct blktap *tap = NULL;
++	int minor;
++
++	minor = iminor(inode);
++
++	if (minor < blktap_max_minor)
++		tap = blktaps[minor];
++
++	if (!tap)
++		return -ENXIO;
++
++	if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++		return -ENXIO;
++
++	if (tap->ring.task)
++		return -EBUSY;
++
++	filp->private_data = tap;
++	tap->ring.task = current;
++
++	return 0;
++}
++
++static int
++blktap_ring_release(struct inode *inode, struct file *filp)
++{
++	struct blktap *tap = filp->private_data;
++
++	blktap_device_destroy_sync(tap);
++
++	tap->ring.task = NULL;
++
++	if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++		blktap_control_destroy_tap(tap);
++
++	return 0;
++}
++
++static int
++blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
++{
++	struct blktap *tap = filp->private_data;
++	struct blktap_ring *ring = &tap->ring;
++	struct blkif_sring *sring;
++	struct page *page = NULL;
++	int err;
++
++	if (ring->vma)
++		return -EBUSY;
++
++	page = alloc_page(GFP_KERNEL|__GFP_ZERO);
++	if (!page)
++		return -ENOMEM;
++
++	SetPageReserved(page);
++
++	err = vm_insert_page(vma, vma->vm_start, page);
++	if (err)
++		goto fail;
++
++	sring = page_address(page);
++	SHARED_RING_INIT(sring);
++	FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE);
++
++	ring->ring_vstart = vma->vm_start;
++	ring->user_vstart = ring->ring_vstart + PAGE_SIZE;
++
++	vma->vm_private_data = tap;
++
++	vma->vm_flags |= VM_DONTCOPY;
++	vma->vm_flags |= VM_RESERVED;
++
++	vma->vm_ops = &blktap_ring_vm_operations;
++
++	ring->vma = vma;
++	return 0;
++
++fail:
++	if (page) {
++		zap_page_range(vma, vma->vm_start, PAGE_SIZE, NULL);
++		ClearPageReserved(page);
++		__free_page(page);
++	}
++
++	return err;
++}
++
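blktap_ring_mmap() above and blktap_ring_ioctl()/blktap_ring_poll() below define the userspace contract: mmap the ring, poll() to let the kernel queue requests, service them, then hand completed responses back with BLKTAP2_IOCTL_KICK_FE. A minimal consumer loop might look like this sketch; RING_SIZE_BYTES and process_requests() are placeholders for the real tapdisk logic, and error handling is elided.

    /* Sketch only: assumes the BLKTAP2_* ioctl numbers from blktap.h. */
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <poll.h>

    static void run_ring(int fd)
    {
    	/* page 0 is the shared ring; per-request data pages follow it */
    	void *ring = mmap(NULL, RING_SIZE_BYTES, PROT_READ | PROT_WRITE,
    			  MAP_SHARED, fd, 0);
    	struct pollfd pfd = { .fd = fd, .events = POLLIN };

    	for (;;) {
    		poll(&pfd, 1, -1);		/* kernel queues requests here */
    		process_requests(ring);		/* do the I/O, write responses */
    		ioctl(fd, BLKTAP2_IOCTL_KICK_FE);  /* let the kernel reap them */
    	}
    }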
++static int
++blktap_ring_ioctl(struct inode *inode, struct file *filp,
++		  unsigned int cmd, unsigned long arg)
++{
++	struct blktap *tap = filp->private_data;
++	struct blktap_ring *ring = &tap->ring;
++
++	BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg);
++
++	if (!ring->vma || ring->vma->vm_mm != current->mm)
++		return -EACCES;
++
++	switch(cmd) {
++	case BLKTAP2_IOCTL_KICK_FE:
++
++		blktap_read_ring(tap);
++		return 0;
++
++	case BLKTAP2_IOCTL_CREATE_DEVICE: {
++		struct blktap_params params;
++		void __user *ptr = (void *)arg;
++
++		if (!arg)
++			return -EINVAL;
++
++		if (copy_from_user(&params, ptr, sizeof(params)))
++			return -EFAULT;
++
++		return blktap_device_create(tap, &params);
++	}
++
++	case BLKTAP2_IOCTL_REMOVE_DEVICE:
++
++		return blktap_device_destroy(tap);
++	}
++
++	return -ENOIOCTLCMD;
++}
++
++static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait)
++{
++	struct blktap *tap = filp->private_data;
++	struct blktap_ring *ring = &tap->ring;
++	int work;
++
++	poll_wait(filp, &tap->pool->wait, wait);
++	poll_wait(filp, &ring->poll_wait, wait);
++
++	down_read(&current->mm->mmap_sem);
++	if (ring->vma && tap->device.gd)
++		blktap_device_run_queue(tap);
++	up_read(&current->mm->mmap_sem);
++
++	work = ring->ring.req_prod_pvt - ring->ring.sring->req_prod;
++	RING_PUSH_REQUESTS(&ring->ring);
++
++	if (work ||
++	    ring->ring.sring->private.tapif_user.msg ||
++	    test_and_clear_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse))
++		return POLLIN | POLLRDNORM;
++
++	return 0;
++}
++
++static struct file_operations blktap_ring_file_operations = {
++	.owner   = THIS_MODULE,
++	.open    = blktap_ring_open,
++	.release = blktap_ring_release,
++	.ioctl   = blktap_ring_ioctl,
++	.mmap    = blktap_ring_mmap,
++	.poll    = blktap_ring_poll,
++};
++
++void
++blktap_ring_kick_user(struct blktap *tap)
++{
++	wake_up(&tap->ring.poll_wait);
++}
++
++int
++blktap_ring_destroy(struct blktap *tap)
++{
++	struct blktap_ring *ring = &tap->ring;
++
++	if (ring->task || ring->vma)
++		return -EBUSY;
++
++	return 0;
++}
++
++int
++blktap_ring_create(struct blktap *tap)
++{
++	struct blktap_ring *ring = &tap->ring;
++
++	init_waitqueue_head(&ring->poll_wait);
++	ring->devno = MKDEV(blktap_ring_major, tap->minor);
++
++	return 0;
++}
++
++size_t
++blktap_ring_debug(struct blktap *tap, char *buf, size_t size)
++{
++	struct blktap_ring *ring = &tap->ring;
++	char *s = buf, *end = buf + size;
++	int usr_idx;
++
++	s += snprintf(s, end - s,
++		      "begin pending:%d\n", ring->n_pending);
++
++	for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
++		struct blktap_request *request;
++		struct timeval *time;
++		int write;
++
++		request = ring->pending[usr_idx];
++		if (!request)
++			continue;
++
++		write = request->operation == BLKIF_OP_WRITE;
++		time = &request->time;
++
++		s += snprintf(s, end - s,
++			      "%02d: usr_idx:%02d "
++			      "op:%c nr_pages:%02d time:%lu.%06lu\n",
++			      usr_idx, request->usr_idx,
++			      write ? 
'W' : 'R', request->nr_pages, ++ time->tv_sec, time->tv_usec); ++ } ++ ++ s += snprintf(s, end - s, "end pending\n"); ++ ++ return s - buf; ++} ++ ++ ++int __init ++blktap_ring_init(void) ++{ ++ dev_t dev = 0; ++ int err; ++ ++ cdev_init(&blktap_ring_cdev, &blktap_ring_file_operations); ++ blktap_ring_cdev.owner = THIS_MODULE; ++ ++ err = alloc_chrdev_region(&dev, 0, MAX_BLKTAP_DEVICE, "blktap2"); ++ if (err < 0) { ++ BTERR("error registering ring devices: %d\n", err); ++ return err; ++ } ++ ++ err = cdev_add(&blktap_ring_cdev, dev, MAX_BLKTAP_DEVICE); ++ if (err) { ++ BTERR("error adding ring device: %d\n", err); ++ unregister_chrdev_region(dev, MAX_BLKTAP_DEVICE); ++ return err; ++ } ++ ++ blktap_ring_major = MAJOR(dev); ++ BTINFO("blktap ring major: %d\n", blktap_ring_major); ++ ++ return 0; ++} ++ ++void ++blktap_ring_exit(void) ++{ ++ if (!blktap_ring_major) ++ return; ++ ++ cdev_del(&blktap_ring_cdev); ++ unregister_chrdev_region(MKDEV(blktap_ring_major, 0), ++ MAX_BLKTAP_DEVICE); ++ ++ blktap_ring_major = 0; ++} +diff --git a/drivers/xen/blktap/sysfs.c b/drivers/xen/blktap/sysfs.c +new file mode 100644 +index 0000000..3c424af +--- /dev/null ++++ b/drivers/xen/blktap/sysfs.c +@@ -0,0 +1,288 @@ ++#include <linux/types.h> ++#include <linux/device.h> ++#include <linux/module.h> ++#include <linux/sched.h> ++#include <linux/genhd.h> ++#include <linux/blkdev.h> ++ ++#include "blktap.h" ++ ++int blktap_debug_level = 1; ++ ++static struct class *class; ++ ++static ssize_t ++blktap_sysfs_set_name(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) ++{ ++ struct blktap *tap; ++ ++ tap = dev_get_drvdata(dev); ++ if (!tap) ++ return 0; ++ ++ if (size >= BLKTAP2_MAX_MESSAGE_LEN) ++ return -ENAMETOOLONG; ++ ++ if (strnlen(buf, size) != size) ++ return -EINVAL; ++ ++ strcpy(tap->name, buf); ++ ++ return size; ++} ++ ++static ssize_t ++blktap_sysfs_get_name(struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ struct blktap *tap; ++ ssize_t size; ++ ++ tap = dev_get_drvdata(dev); ++ if (!tap) ++ return 0; ++ ++ if (tap->name[0]) ++ size = sprintf(buf, "%s\n", tap->name); ++ else ++ size = sprintf(buf, "%d\n", tap->minor); ++ ++ return size; ++} ++static DEVICE_ATTR(name, S_IRUGO|S_IWUSR, ++ blktap_sysfs_get_name, blktap_sysfs_set_name); ++ ++static void ++blktap_sysfs_remove_work(struct work_struct *work) ++{ ++ struct blktap *tap ++ = container_of(work, struct blktap, remove_work); ++ blktap_control_destroy_tap(tap); ++} ++ ++static ssize_t ++blktap_sysfs_remove_device(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t size) ++{ ++ struct blktap *tap; ++ int err; ++ ++ tap = dev_get_drvdata(dev); ++ if (!tap) ++ return size; ++ ++ if (test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) ++ goto wait; ++ ++ if (tap->ring.vma) { ++ struct blkif_sring *sring = tap->ring.ring.sring; ++ sring->private.tapif_user.msg = BLKTAP2_RING_MESSAGE_CLOSE; ++ blktap_ring_kick_user(tap); ++ } else { ++ INIT_WORK(&tap->remove_work, blktap_sysfs_remove_work); ++ schedule_work(&tap->remove_work); ++ } ++wait: ++ err = wait_event_interruptible(tap->remove_wait, ++ !dev_get_drvdata(dev)); ++ if (err) ++ return err; ++ ++ return size; ++} ++static DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device); ++ ++static ssize_t ++blktap_sysfs_debug_device(struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ struct blktap *tap; ++ char *s = buf, *end = buf + PAGE_SIZE; ++ ++ tap = dev_get_drvdata(dev); ++ if (!tap) ++ 
return 0; ++ ++ s += blktap_control_debug(tap, s, end - s); ++ ++ s += blktap_request_debug(tap, s, end - s); ++ ++ s += blktap_device_debug(tap, s, end - s); ++ ++ s += blktap_ring_debug(tap, s, end - s); ++ ++ return s - buf; ++} ++static DEVICE_ATTR(debug, S_IRUGO, blktap_sysfs_debug_device, NULL); ++ ++static ssize_t ++blktap_sysfs_show_task(struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ struct blktap *tap; ++ ssize_t rv = 0; ++ ++ tap = dev_get_drvdata(dev); ++ if (!tap) ++ return 0; ++ ++ if (tap->ring.task) ++ rv = sprintf(buf, "%d\n", tap->ring.task->pid); ++ ++ return rv; ++} ++static DEVICE_ATTR(task, S_IRUGO, blktap_sysfs_show_task, NULL); ++ ++static ssize_t ++blktap_sysfs_show_pool(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ struct blktap *tap = dev_get_drvdata(dev); ++ return sprintf(buf, "%s", kobject_name(&tap->pool->kobj)); ++} ++ ++static ssize_t ++blktap_sysfs_store_pool(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t size) ++{ ++ struct blktap *tap = dev_get_drvdata(dev); ++ struct blktap_page_pool *pool, *tmp = tap->pool; ++ ++ if (tap->device.gd) ++ return -EBUSY; ++ ++ pool = blktap_page_pool_get(buf); ++ if (IS_ERR(pool)) ++ return PTR_ERR(pool); ++ ++ tap->pool = pool; ++ kobject_put(&tmp->kobj); ++ ++ return size; ++} ++DEVICE_ATTR(pool, S_IRUSR|S_IWUSR, ++ blktap_sysfs_show_pool, blktap_sysfs_store_pool); ++ ++int ++blktap_sysfs_create(struct blktap *tap) ++{ ++ struct blktap_ring *ring = &tap->ring; ++ struct device *dev; ++ int err = 0; ++ ++ init_waitqueue_head(&tap->remove_wait); ++ ++ dev = device_create(class, NULL, ring->devno, ++ tap, "blktap%d", tap->minor); ++ if (IS_ERR(dev)) ++ err = PTR_ERR(dev); ++ if (!err) ++ err = device_create_file(dev, &dev_attr_name); ++ if (!err) ++ err = device_create_file(dev, &dev_attr_remove); ++ if (!err) ++ err = device_create_file(dev, &dev_attr_debug); ++ if (!err) ++ err = device_create_file(dev, &dev_attr_task); ++ if (!err) ++ err = device_create_file(dev, &dev_attr_pool); ++ if (!err) ++ ring->dev = dev; ++ else ++ device_unregister(dev); ++ ++ return err; ++} ++ ++void ++blktap_sysfs_destroy(struct blktap *tap) ++{ ++ struct blktap_ring *ring = &tap->ring; ++ struct device *dev; ++ ++ dev = ring->dev; ++ ++ if (!dev) ++ return; ++ ++ dev_set_drvdata(dev, NULL); ++ wake_up(&tap->remove_wait); ++ ++ device_unregister(dev); ++ ring->dev = NULL; ++} ++ ++static ssize_t ++blktap_sysfs_show_verbosity(struct class *class, char *buf) ++{ ++ return sprintf(buf, "%d\n", blktap_debug_level); ++} ++ ++static ssize_t ++blktap_sysfs_set_verbosity(struct class *class, const char *buf, size_t size) ++{ ++ int level; ++ ++ if (sscanf(buf, "%d", &level) == 1) { ++ blktap_debug_level = level; ++ return size; ++ } ++ ++ return -EINVAL; ++} ++static CLASS_ATTR(verbosity, S_IRUGO|S_IWUSR, ++ blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity); ++ ++static ssize_t ++blktap_sysfs_show_devices(struct class *class, char *buf) ++{ ++ int i, ret; ++ struct blktap *tap; ++ ++ mutex_lock(&blktap_lock); ++ ++ ret = 0; ++ for (i = 0; i < blktap_max_minor; i++) { ++ tap = blktaps[i]; ++ if (!tap) ++ continue; ++ ++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse)) ++ continue; ++ ++ ret += sprintf(buf + ret, "%d %s\n", tap->minor, tap->name); ++ } ++ ++ mutex_unlock(&blktap_lock); ++ ++ return ret; ++} ++static CLASS_ATTR(devices, S_IRUGO, blktap_sysfs_show_devices, NULL); ++ ++void ++blktap_sysfs_exit(void) ++{ ++ if (class) ++ class_destroy(class); ++} ++ 
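The files created here land under /sys/class/blktap2/ (the class name chosen in blktap_sysfs_init() below): per-tap blktapN/{name,remove,debug,task,pool} attributes plus the class-wide verbosity and devices files. Tearing down a tap from userspace then reduces to writing its remove attribute, as in this sketch (paths inferred from the device and class names in the code, not from documentation):

    /* Write anything to 'remove'; the write blocks in
     * blktap_sysfs_remove_device() until the tap is destroyed. */
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>

    static int blktap_remove(int minor)
    {
    	char path[64];
    	int fd, rc;

    	snprintf(path, sizeof(path),
    		 "/sys/class/blktap2/blktap%d/remove", minor);
    	fd = open(path, O_WRONLY);
    	if (fd < 0)
    		return -1;
    	rc = write(fd, "1", 1) == 1 ? 0 : -1;
    	close(fd);
    	return rc;
    }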
++int __init ++blktap_sysfs_init(void) ++{ ++ struct class *cls; ++ int err = 0; ++ ++ cls = class_create(THIS_MODULE, "blktap2"); ++ if (IS_ERR(cls)) ++ err = PTR_ERR(cls); ++ if (!err) ++ err = class_create_file(cls, &class_attr_verbosity); ++ if (!err) ++ err = class_create_file(cls, &class_attr_devices); ++ if (!err) ++ class = cls; ++ else ++ class_destroy(cls); ++ ++ return err; ++} +diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c +index bdfd584..6625ffe 100644 +--- a/drivers/xen/cpu_hotplug.c ++++ b/drivers/xen/cpu_hotplug.c +@@ -1,5 +1,6 @@ + #include <linux/notifier.h> + ++#include <xen/xen.h> + #include <xen/xenbus.h> + + #include <asm/xen/hypervisor.h> +diff --git a/drivers/xen/events.c b/drivers/xen/events.c +index 1417015..ac7b42f 100644 +--- a/drivers/xen/events.c ++++ b/drivers/xen/events.c +@@ -16,7 +16,7 @@ + * (typically dom0). + * 2. VIRQs, typically used for timers. These are per-cpu events. + * 3. IPIs. +- * 4. Hardware interrupts. Not supported at present. ++ * 4. PIRQs - Hardware interrupts. + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ +@@ -27,18 +27,32 @@ + #include <linux/module.h> + #include <linux/string.h> + #include <linux/bootmem.h> ++#include <linux/irqnr.h> ++#include <linux/pci_regs.h> ++#include <linux/pci.h> ++#include <linux/msi.h> + ++#include <asm/desc.h> + #include <asm/ptrace.h> + #include <asm/irq.h> + #include <asm/idle.h> ++#include <asm/io_apic.h> + #include <asm/sync_bitops.h> + #include <asm/xen/hypercall.h> + #include <asm/xen/hypervisor.h> ++#include <asm/xen/pci.h> + ++#include <xen/xen.h> ++#include <xen/hvm.h> + #include <xen/xen-ops.h> + #include <xen/events.h> + #include <xen/interface/xen.h> + #include <xen/interface/event_channel.h> ++#include <xen/interface/hvm/hvm_op.h> ++#include <xen/interface/hvm/params.h> ++#include <xen/page.h> ++ ++#include "../pci/msi.h" + + /* + * This lock protects updates to the following mapping and reference-count +@@ -67,7 +81,7 @@ enum xen_irq_type { + * event channel - irq->event channel mapping + * cpu - cpu this event channel is bound to + * index - type-specific information: +- * PIRQ - vector, with MSB being "needs EIO" ++ * PIRQ - with MSB being "needs EIO" + * VIRQ - virq number + * IPI - IPI vector + * EVTCHN - +@@ -83,20 +97,30 @@ struct irq_info + enum ipi_vector ipi; + struct { + unsigned short gsi; +- unsigned short vector; ++ unsigned char vector; ++ unsigned char flags; ++ uint16_t domid; + } pirq; + } u; + }; ++#define PIRQ_SHAREABLE (1 << 1) + +-static struct irq_info irq_info[NR_IRQS]; ++/* Bitmap indicating which PIRQs require Xen to be notified on unmask. */ ++static bool pirq_eoi_does_unmask; ++static unsigned long *pirq_needs_eoi_bits; + +-static int evtchn_to_irq[NR_EVENT_CHANNELS] = { +- [0 ... NR_EVENT_CHANNELS-1] = -1 +-}; ++static struct irq_info *irq_info; ++ ++static int *evtchn_to_irq; + struct cpu_evtchn_s { + unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG]; + }; +-static struct cpu_evtchn_s *cpu_evtchn_mask_p; ++ ++static __initdata struct cpu_evtchn_s init_evtchn_mask = { ++ .bits[0 ... 
(NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul, ++}; ++static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask; ++ + static inline unsigned long *cpu_evtchn_mask(int cpu) + { + return cpu_evtchn_mask_p[cpu].bits; +@@ -107,6 +131,7 @@ static inline unsigned long *cpu_evtchn_mask(int cpu) + + static struct irq_chip xen_dynamic_chip; + static struct irq_chip xen_percpu_chip; ++static struct irq_chip xen_pirq_chip; + + /* Constructor for packed IRQ information. */ + static struct irq_info mk_unbound_info(void) +@@ -136,7 +161,8 @@ static struct irq_info mk_pirq_info(unsigned short evtchn, + unsigned short gsi, unsigned short vector) + { + return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn, +- .cpu = 0, .u.pirq = { .gsi = gsi, .vector = vector } }; ++ .cpu = 0, .u.pirq = ++ { .gsi = gsi, .vector = vector, .domid = DOMID_SELF } }; + } + + /* +@@ -219,6 +245,15 @@ static unsigned int cpu_from_evtchn(unsigned int evtchn) + return ret; + } + ++static bool pirq_needs_eoi(unsigned irq) ++{ ++ struct irq_info *info = info_for_irq(irq); ++ ++ BUG_ON(info->type != IRQT_PIRQ); ++ ++ return test_bit(info->u.pirq.gsi, pirq_needs_eoi_bits); ++} ++ + static inline unsigned long active_evtchns(unsigned int cpu, + struct shared_info *sh, + unsigned int idx) +@@ -237,17 +272,17 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) + cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu)); + #endif + +- __clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq))); +- __set_bit(chn, cpu_evtchn_mask(cpu)); ++ clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq))); ++ set_bit(chn, cpu_evtchn_mask(cpu)); + + irq_info[irq].cpu = cpu; + } + + static void init_evtchn_cpu_bindings(void) + { ++ int i; + #ifdef CONFIG_SMP + struct irq_desc *desc; +- int i; + + /* By default all event channels notify CPU#0. */ + for_each_irq_desc(i, desc) { +@@ -255,7 +290,9 @@ static void init_evtchn_cpu_bindings(void) + } + #endif + +- memset(cpu_evtchn_mask(0), ~0, sizeof(struct cpu_evtchn_s)); ++ for_each_possible_cpu(i) ++ memset(cpu_evtchn_mask(i), ++ (i == 0) ? ~0 : 0, sizeof(struct cpu_evtchn_s)); + } + + static inline void clear_evtchn(int port) +@@ -300,6 +337,14 @@ static void mask_evtchn(int port) + sync_set_bit(port, &s->evtchn_mask[0]); + } + ++static void mask_irq(unsigned int irq) ++{ ++ int evtchn = evtchn_from_irq(irq); ++ ++ if (VALID_EVTCHN(evtchn)) ++ mask_evtchn(evtchn); ++} ++ + static void unmask_evtchn(int port) + { + struct shared_info *s = HYPERVISOR_shared_info; +@@ -330,26 +375,370 @@ static void unmask_evtchn(int port) + put_cpu(); + } + ++static void unmask_irq(unsigned int irq) ++{ ++ int evtchn = evtchn_from_irq(irq); ++ ++ if (VALID_EVTCHN(evtchn)) ++ unmask_evtchn(evtchn); ++} ++ ++static int get_nr_hw_irqs(void) ++{ ++ int ret = 1; ++ ++#ifdef CONFIG_X86_IO_APIC ++ ret = get_nr_irqs_gsi(); ++#endif ++ ++ return ret; ++} ++ + static int find_unbound_irq(void) + { + int irq; + struct irq_desc *desc; ++ int start = get_nr_hw_irqs(); + +- for (irq = 0; irq < nr_irqs; irq++) ++ if (start == nr_irqs) ++ goto no_irqs; ++ ++ /* nr_irqs is a magic value. 
Must not use it.*/ ++ for (irq = nr_irqs-1; irq > start; irq--) { ++ desc = irq_to_desc(irq); ++ /* only 0->15 have init'd desc; handle irq > 16 */ ++ if (desc == NULL) ++ break; ++ if (desc->chip == &no_irq_chip) ++ break; ++ if (desc->chip != &xen_dynamic_chip) ++ continue; + if (irq_info[irq].type == IRQT_UNBOUND) + break; ++ } + +- if (irq == nr_irqs) +- panic("No available IRQ to bind to: increase nr_irqs!\n"); ++ if (irq == start) ++ goto no_irqs; + +- desc = irq_to_desc_alloc_node(irq, 0); ++ desc = irq_to_desc_alloc_node(irq, -1); + if (WARN_ON(desc == NULL)) + return -1; + +- dynamic_irq_init(irq); ++ dynamic_irq_init_keep_chip_data(irq); ++ ++ return irq; ++ ++no_irqs: ++ panic("No available IRQ to bind to: increase nr_irqs!\n"); ++} ++ ++static bool identity_mapped_irq(unsigned irq) ++{ ++ /* identity map all the hardware irqs */ ++ return irq < get_nr_hw_irqs(); ++} ++ ++static void pirq_eoi(unsigned int irq) ++{ ++ struct irq_info *info = info_for_irq(irq); ++ struct physdev_eoi eoi = { .irq = info->u.pirq.gsi }; ++ bool need_eoi; ++ ++ need_eoi = pirq_needs_eoi(irq); ++ ++ if (!need_eoi || !pirq_eoi_does_unmask) ++ unmask_evtchn(info->evtchn); ++ ++ if (need_eoi) { ++ int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); ++ WARN_ON(rc); ++ } ++} ++ ++static void pirq_query_unmask(int irq) ++{ ++ struct physdev_irq_status_query irq_status; ++ struct irq_info *info = info_for_irq(irq); ++ ++ if (pirq_eoi_does_unmask) ++ return; ++ ++ BUG_ON(info->type != IRQT_PIRQ); ++ ++ irq_status.irq = info->u.pirq.gsi; ++ if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) ++ irq_status.flags = 0; ++ ++ clear_bit(info->u.pirq.gsi, pirq_needs_eoi_bits); ++ if (irq_status.flags & XENIRQSTAT_needs_eoi) ++ set_bit(info->u.pirq.gsi, pirq_needs_eoi_bits); ++} ++ ++static bool probing_irq(int irq) ++{ ++ struct irq_desc *desc = irq_to_desc(irq); ++ ++ return desc && desc->action == NULL; ++} ++ ++static unsigned int startup_pirq(unsigned int irq) ++{ ++ struct evtchn_bind_pirq bind_pirq; ++ struct irq_info *info = info_for_irq(irq); ++ int evtchn = evtchn_from_irq(irq); ++ int rc; ++ ++ BUG_ON(info->type != IRQT_PIRQ); ++ ++ if (VALID_EVTCHN(evtchn)) ++ goto out; ++ ++ bind_pirq.pirq = info->u.pirq.gsi; ++ /* NB. We are happy to share unless we are probing. */ ++ bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ? 
++ BIND_PIRQ__WILL_SHARE : 0; ++ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); ++ if (rc != 0) { ++ if (!probing_irq(irq)) ++ printk(KERN_INFO "Failed to obtain physical IRQ %d" \ ++ " (GSI:%d)\n", irq, info->u.pirq.gsi); ++ return 0; ++ } ++ evtchn = bind_pirq.port; ++ ++ pirq_query_unmask(irq); ++ ++ evtchn_to_irq[evtchn] = irq; ++ bind_evtchn_to_cpu(evtchn, 0); ++ info->evtchn = evtchn; ++ ++ out: ++ pirq_eoi(irq); ++ ++ return 0; ++} ++ ++static void shutdown_pirq(unsigned int irq) ++{ ++ struct evtchn_close close; ++ struct irq_info *info = info_for_irq(irq); ++ int evtchn = evtchn_from_irq(irq); ++ ++ BUG_ON(info->type != IRQT_PIRQ); ++ ++ if (!VALID_EVTCHN(evtchn)) ++ return; ++ ++ mask_evtchn(evtchn); ++ ++ close.port = evtchn; ++ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) ++ BUG(); ++ ++ bind_evtchn_to_cpu(evtchn, 0); ++ evtchn_to_irq[evtchn] = -1; ++ info->evtchn = 0; ++} ++ ++static void ack_pirq(unsigned int irq) ++{ ++ move_masked_irq(irq); ++ ++ pirq_eoi(irq); ++} ++ ++static void end_pirq(unsigned int irq) ++{ ++ int evtchn = evtchn_from_irq(irq); ++ struct irq_desc *desc = irq_to_desc(irq); ++ ++ if (WARN_ON(!desc)) ++ return; ++ ++ if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) == ++ (IRQ_DISABLED|IRQ_PENDING)) { ++ shutdown_pirq(irq); ++ } else if (VALID_EVTCHN(evtchn)) { ++ pirq_eoi(irq); ++ } ++} ++ ++static int find_irq_by_gsi(unsigned gsi) ++{ ++ int irq; ++ ++ for (irq = 0; irq < nr_irqs; irq++) { ++ struct irq_info *info = info_for_irq(irq); ++ ++ if (info == NULL || info->type != IRQT_PIRQ) ++ continue; ++ ++ if (gsi_from_irq(irq) == gsi) ++ return irq; ++ } ++ ++ return -1; ++} ++ ++/* ++ * Allocate a physical irq, along with a vector. We don't assign an ++ * event channel until the irq actually started up. Return an ++ * existing irq if we've already got one for the gsi. ++ */ ++int xen_allocate_pirq(unsigned gsi, int shareable, char *name) ++{ ++ int irq; ++ struct physdev_irq irq_op; ++ ++ spin_lock(&irq_mapping_update_lock); ++ ++ irq = find_irq_by_gsi(gsi); ++ if (irq != -1) { ++ printk(KERN_INFO "xen_allocate_pirq: returning irq %d for gsi %u\n", ++ irq, gsi); ++ goto out; /* XXX need refcount? */ ++ } ++ ++ /* If we are a PV guest, we don't have GSIs (no ACPI passed). Therefore ++ * we are using the !xen_initial_domain() to drop in the function.*/ ++ if (identity_mapped_irq(gsi) || !xen_initial_domain()) { ++ irq = gsi; ++ irq_to_desc_alloc_node(irq, 0); ++ dynamic_irq_init(irq); ++ } else ++ irq = find_unbound_irq(); ++ ++ set_irq_chip_and_handler_name(irq, &xen_pirq_chip, ++ handle_fasteoi_irq, name); ++ ++ irq_op.irq = gsi; ++ irq_op.vector = 0; ++ ++ /* Only the privileged domain can do this. For non-priv, the pcifront ++ * driver provides a PCI bus that does the call to do exactly ++ * this in the priv domain. */ ++ if (xen_initial_domain() && ++ HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { ++ dynamic_irq_cleanup(irq); ++ irq = -ENOSPC; ++ goto out; ++ } ++ ++ irq_info[irq] = mk_pirq_info(0, gsi, irq_op.vector); ++ irq_info[irq].u.pirq.flags |= shareable ? 
PIRQ_SHAREABLE : 0; ++ ++out: ++ spin_unlock(&irq_mapping_update_lock); ++ ++ return irq; ++} ++ ++#ifdef CONFIG_PCI_MSI ++int xen_destroy_irq(int irq) ++{ ++ struct irq_desc *desc; ++ struct physdev_unmap_pirq unmap_irq; ++ struct irq_info *info = info_for_irq(irq); ++ int rc = -ENOENT; ++ ++ spin_lock(&irq_mapping_update_lock); ++ ++ desc = irq_to_desc(irq); ++ if (!desc) ++ goto out; ++ ++ if (xen_initial_domain()) { ++ unmap_irq.pirq = info->u.pirq.gsi; ++ unmap_irq.domid = info->u.pirq.domid; ++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); ++ if (rc) { ++ printk(KERN_WARNING "unmap irq failed %d\n", rc); ++ goto out; ++ } ++ } ++ irq_info[irq] = mk_unbound_info(); ++ ++ dynamic_irq_cleanup(irq); ++ ++out: ++ spin_unlock(&irq_mapping_update_lock); ++ return rc; ++} ++ ++#ifdef CONFIG_PCI_XEN ++int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) ++{ ++ int irq = 0; ++ struct physdev_map_pirq map_irq; ++ int rc; ++ domid_t domid; ++ int pos; ++ u32 table_offset, bir; ++ ++ domid = rc = xen_find_device_domain_owner(dev); ++ if (rc < 0) ++ domid = DOMID_SELF; ++ ++ memset(&map_irq, 0, sizeof(map_irq)); ++ map_irq.domid = domid; ++ map_irq.type = MAP_PIRQ_TYPE_MSI; ++ map_irq.index = -1; ++ map_irq.pirq = -1; ++ map_irq.bus = dev->bus->number; ++ map_irq.devfn = dev->devfn; ++ ++ if (type == PCI_CAP_ID_MSIX) { ++ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); ++ ++ pci_read_config_dword(dev, msix_table_offset_reg(pos), ++ &table_offset); ++ bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK); ++ ++ map_irq.table_base = pci_resource_start(dev, bir); ++ map_irq.entry_nr = msidesc->msi_attrib.entry_nr; ++ } ++ ++ spin_lock(&irq_mapping_update_lock); ++ ++ irq = find_unbound_irq(); ++ ++ if (irq == -1) ++ goto out; ++ ++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); ++ if (rc) { ++ printk(KERN_WARNING "xen map irq failed %d\n", rc); ++ ++ dynamic_irq_cleanup(irq); ++ ++ irq = -1; ++ goto out; ++ } ++ irq_info[irq] = mk_pirq_info(0, map_irq.pirq, map_irq.index); ++ if (domid) ++ irq_info[irq].u.pirq.domid = domid; ++ ++ set_irq_chip_and_handler_name(irq, &xen_pirq_chip, ++ handle_fasteoi_irq, ++ (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi"); + ++out: ++ spin_unlock(&irq_mapping_update_lock); + return irq; + } ++#endif ++#endif ++ ++int xen_vector_from_irq(unsigned irq) ++{ ++ return vector_from_irq(irq); ++} ++ ++int xen_gsi_from_irq(unsigned irq) ++{ ++ return gsi_from_irq(irq); ++} ++EXPORT_SYMBOL_GPL(xen_gsi_from_irq); + + int bind_evtchn_to_irq(unsigned int evtchn) + { +@@ -363,7 +752,7 @@ int bind_evtchn_to_irq(unsigned int evtchn) + irq = find_unbound_irq(); + + set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, +- handle_edge_irq, "event"); ++ handle_fasteoi_irq, "event"); + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_evtchn_info(evtchn); +@@ -410,8 +799,23 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) + return irq; + } + ++static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, ++ unsigned int remote_port) ++{ ++ struct evtchn_bind_interdomain bind_interdomain; ++ int err; ++ ++ bind_interdomain.remote_dom = remote_domain; ++ bind_interdomain.remote_port = remote_port; ++ ++ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, ++ &bind_interdomain); + +-static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) ++ return err ? 
: bind_evtchn_to_irq(bind_interdomain.local_port); ++} ++ ++ ++int bind_virq_to_irq(unsigned int virq, unsigned int cpu) + { + struct evtchn_bind_virq bind_virq; + int evtchn, irq; +@@ -421,6 +825,11 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) + irq = per_cpu(virq_to_irq, cpu)[virq]; + + if (irq == -1) { ++ irq = find_unbound_irq(); ++ ++ set_irq_chip_and_handler_name(irq, &xen_percpu_chip, ++ handle_percpu_irq, "virq"); ++ + bind_virq.virq = virq; + bind_virq.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, +@@ -428,11 +837,6 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) + BUG(); + evtchn = bind_virq.port; + +- irq = find_unbound_irq(); +- +- set_irq_chip_and_handler_name(irq, &xen_percpu_chip, +- handle_percpu_irq, "virq"); +- + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_virq_info(evtchn, virq); + +@@ -505,6 +909,29 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn, + } + EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); + ++int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, ++ unsigned int remote_port, ++ irq_handler_t handler, ++ unsigned long irqflags, ++ const char *devname, ++ void *dev_id) ++{ ++ int irq, retval; ++ ++ irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port); ++ if (irq < 0) ++ return irq; ++ ++ retval = request_irq(irq, handler, irqflags, devname, dev_id); ++ if (retval != 0) { ++ unbind_from_irq(irq); ++ return retval; ++ } ++ ++ return irq; ++} ++EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler); ++ + int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, + irq_handler_t handler, + unsigned long irqflags, const char *devname, void *dev_id) +@@ -564,41 +991,75 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) + { + struct shared_info *sh = HYPERVISOR_shared_info; + int cpu = smp_processor_id(); ++ unsigned long *cpu_evtchn = cpu_evtchn_mask(cpu); + int i; + unsigned long flags; + static DEFINE_SPINLOCK(debug_lock); ++ struct vcpu_info *v; + + spin_lock_irqsave(&debug_lock, flags); + +- printk("vcpu %d\n ", cpu); ++ printk("\nvcpu %d\n ", cpu); + + for_each_online_cpu(i) { +- struct vcpu_info *v = per_cpu(xen_vcpu, i); +- printk("%d: masked=%d pending=%d event_sel %08lx\n ", i, +- (get_irq_regs() && i == cpu) ? xen_irqs_disabled(get_irq_regs()) : v->evtchn_upcall_mask, +- v->evtchn_upcall_pending, +- v->evtchn_pending_sel); +- } +- printk("pending:\n "); +- for(i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) +- printk("%08lx%s", sh->evtchn_pending[i], +- i % 8 == 0 ? "\n " : " "); +- printk("\nmasks:\n "); +- for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) +- printk("%08lx%s", sh->evtchn_mask[i], +- i % 8 == 0 ? "\n " : " "); +- +- printk("\nunmasked:\n "); +- for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) +- printk("%08lx%s", sh->evtchn_pending[i] & ~sh->evtchn_mask[i], +- i % 8 == 0 ? "\n " : " "); ++ int pending; ++ v = per_cpu(xen_vcpu, i); ++ pending = (get_irq_regs() && i == cpu) ++ ? xen_irqs_disabled(get_irq_regs()) ++ : v->evtchn_upcall_mask; ++ printk("%d: masked=%d pending=%d event_sel %0*lx\n ", i, ++ pending, v->evtchn_upcall_pending, ++ (int)(sizeof(v->evtchn_pending_sel)*2), ++ v->evtchn_pending_sel); ++ } ++ v = per_cpu(xen_vcpu, cpu); ++ ++ printk("\npending:\n "); ++ for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) ++ printk("%0*lx%s", (int)sizeof(sh->evtchn_pending[0])*2, ++ sh->evtchn_pending[i], ++ i % 8 == 0 ? 
"\n " : " "); ++ printk("\nglobal mask:\n "); ++ for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) ++ printk("%0*lx%s", ++ (int)(sizeof(sh->evtchn_mask[0])*2), ++ sh->evtchn_mask[i], ++ i % 8 == 0 ? "\n " : " "); ++ ++ printk("\nglobally unmasked:\n "); ++ for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) ++ printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), ++ sh->evtchn_pending[i] & ~sh->evtchn_mask[i], ++ i % 8 == 0 ? "\n " : " "); ++ ++ printk("\nlocal cpu%d mask:\n ", cpu); ++ for (i = (NR_EVENT_CHANNELS/BITS_PER_LONG)-1; i >= 0; i--) ++ printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2), ++ cpu_evtchn[i], ++ i % 8 == 0 ? "\n " : " "); ++ ++ printk("\nlocally unmasked:\n "); ++ for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { ++ unsigned long pending = sh->evtchn_pending[i] ++ & ~sh->evtchn_mask[i] ++ & cpu_evtchn[i]; ++ printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), ++ pending, i % 8 == 0 ? "\n " : " "); ++ } + + printk("\npending list:\n"); +- for(i = 0; i < NR_EVENT_CHANNELS; i++) { ++ for (i = 0; i < NR_EVENT_CHANNELS; i++) { + if (sync_test_bit(i, sh->evtchn_pending)) { +- printk(" %d: event %d -> irq %d\n", ++ int word_idx = i / BITS_PER_LONG; ++ printk(" %d: event %d -> irq %d%s%s%s\n", + cpu_from_evtchn(i), i, +- evtchn_to_irq[i]); ++ evtchn_to_irq[i], ++ sync_test_bit(word_idx, &v->evtchn_pending_sel) ++ ? "" : " l2-clear", ++ !sync_test_bit(i, sh->evtchn_mask) ++ ? "" : " globally-masked", ++ sync_test_bit(i, cpu_evtchn) ++ ? "" : " locally-masked"); + } + } + +@@ -618,17 +1079,13 @@ static DEFINE_PER_CPU(unsigned, xed_nesting_count); + * a bitset of words which contain pending event bits. The second + * level is a bitset of pending events themselves. + */ +-void xen_evtchn_do_upcall(struct pt_regs *regs) ++static void __xen_evtchn_do_upcall(struct pt_regs *regs) + { + int cpu = get_cpu(); +- struct pt_regs *old_regs = set_irq_regs(regs); + struct shared_info *s = HYPERVISOR_shared_info; + struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); + unsigned count; + +- exit_idle(); +- irq_enter(); +- + do { + unsigned long pending_words; + +@@ -651,9 +1108,16 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) + int bit_idx = __ffs(pending_bits); + int port = (word_idx * BITS_PER_LONG) + bit_idx; + int irq = evtchn_to_irq[port]; ++ struct irq_desc *desc; + +- if (irq != -1) +- handle_irq(irq, regs); ++ mask_evtchn(port); ++ clear_evtchn(port); ++ ++ if (irq != -1) { ++ desc = irq_to_desc(irq); ++ if (desc) ++ generic_handle_irq_desc(irq, desc); ++ } + } + } + +@@ -661,14 +1125,32 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) + + count = __get_cpu_var(xed_nesting_count); + __get_cpu_var(xed_nesting_count) = 0; +- } while(count != 1); ++ } while (count != 1 || vcpu_info->evtchn_upcall_pending); + + out: ++ ++ put_cpu(); ++} ++ ++void xen_evtchn_do_upcall(struct pt_regs *regs) ++{ ++ struct pt_regs *old_regs = set_irq_regs(regs); ++ ++ exit_idle(); ++ irq_enter(); ++ ++ __xen_evtchn_do_upcall(regs); ++ + irq_exit(); + set_irq_regs(old_regs); ++} + +- put_cpu(); ++void xen_hvm_evtchn_do_upcall(void) ++{ ++ struct pt_regs *regs = get_irq_regs(); ++ __xen_evtchn_do_upcall(regs); + } ++EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall); + + /* Rebind a new event channel to an existing irq. 
*/ + void rebind_evtchn_irq(int evtchn, int irq) +@@ -705,7 +1187,10 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) + struct evtchn_bind_vcpu bind_vcpu; + int evtchn = evtchn_from_irq(irq); + +- if (!VALID_EVTCHN(evtchn)) ++ /* events delivered via platform PCI interrupts are always ++ * routed to vcpu 0 */ ++ if (!VALID_EVTCHN(evtchn) || ++ (xen_hvm_domain() && !xen_have_vector_callback)) + return -1; + + /* Send future instances of this interrupt to other vcpu. */ +@@ -746,33 +1231,18 @@ int resend_irq_on_evtchn(unsigned int irq) + return 1; + } + +-static void enable_dynirq(unsigned int irq) +-{ +- int evtchn = evtchn_from_irq(irq); +- +- if (VALID_EVTCHN(evtchn)) +- unmask_evtchn(evtchn); +-} +- +-static void disable_dynirq(unsigned int irq) +-{ +- int evtchn = evtchn_from_irq(irq); +- +- if (VALID_EVTCHN(evtchn)) +- mask_evtchn(evtchn); +-} +- + static void ack_dynirq(unsigned int irq) + { + int evtchn = evtchn_from_irq(irq); ++ struct irq_desc *desc = irq_to_desc(irq); + +- move_native_irq(irq); ++ move_masked_irq(irq); + +- if (VALID_EVTCHN(evtchn)) +- clear_evtchn(evtchn); ++ if (VALID_EVTCHN(evtchn) && !(desc->status & IRQ_DISABLED)) ++ unmask_evtchn(evtchn); + } + +-static int retrigger_dynirq(unsigned int irq) ++static int retrigger_irq(unsigned int irq) + { + int evtchn = evtchn_from_irq(irq); + struct shared_info *sh = HYPERVISOR_shared_info; +@@ -814,9 +1284,6 @@ static void restore_cpu_virqs(unsigned int cpu) + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_virq_info(evtchn, virq); + bind_evtchn_to_cpu(evtchn, cpu); +- +- /* Ready for use. */ +- unmask_evtchn(evtchn); + } + } + +@@ -842,10 +1309,6 @@ static void restore_cpu_ipis(unsigned int cpu) + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_ipi_info(evtchn, ipi); + bind_evtchn_to_cpu(evtchn, cpu); +- +- /* Ready for use. */ +- unmask_evtchn(evtchn); +- + } + } + +@@ -857,7 +1320,7 @@ void xen_clear_irq_pending(int irq) + if (VALID_EVTCHN(evtchn)) + clear_evtchn(evtchn); + } +- ++EXPORT_SYMBOL(xen_clear_irq_pending); + void xen_set_irq_pending(int irq) + { + int evtchn = evtchn_from_irq(irq); +@@ -877,9 +1340,9 @@ bool xen_test_irq_pending(int irq) + return ret; + } + +-/* Poll waiting for an irq to become pending. In the usual case, the ++/* Poll waiting for an irq to become pending with timeout. In the usual case, the + irq will be disabled so it won't deliver an interrupt. */ +-void xen_poll_irq(int irq) ++void xen_poll_irq_timeout(int irq, u64 timeout) + { + evtchn_port_t evtchn = evtchn_from_irq(irq); + +@@ -887,17 +1350,38 @@ void xen_poll_irq(int irq) + struct sched_poll poll; + + poll.nr_ports = 1; +- poll.timeout = 0; ++ poll.timeout = timeout; + set_xen_guest_handle(poll.ports, &evtchn); + + if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0) + BUG(); + } + } ++EXPORT_SYMBOL(xen_poll_irq_timeout); ++/* Poll waiting for an irq to become pending. In the usual case, the ++ irq will be disabled so it won't deliver an interrupt. */ ++void xen_poll_irq(int irq) ++{ ++ xen_poll_irq_timeout(irq, 0 /* no timeout */); ++} ++ ++/* Check whether the IRQ line is shared with other guests. 
*/ ++int xen_ignore_irq(int irq) ++{ ++ struct irq_info *info = info_for_irq(irq); ++ struct physdev_irq_status_query irq_status = { .irq = ++ info->u.pirq.gsi }; ++ ++ if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) ++ return 0; ++ return !(irq_status.flags & XENIRQSTAT_shared); ++} ++EXPORT_SYMBOL_GPL(xen_ignore_irq); + + void xen_irq_resume(void) + { + unsigned int cpu, irq, evtchn; ++ struct irq_desc *desc; + + init_evtchn_cpu_bindings(); + +@@ -916,37 +1400,134 @@ void xen_irq_resume(void) + restore_cpu_virqs(cpu); + restore_cpu_ipis(cpu); + } ++ ++ /* ++ * Unmask any IRQF_NO_SUSPEND IRQs which are enabled. These ++ * are not handled by the IRQ core. ++ */ ++ for_each_irq_desc(irq, desc) { ++ if (!desc->action || !(desc->action->flags & IRQF_NO_SUSPEND)) ++ continue; ++ if (desc->status & IRQ_DISABLED) ++ continue; ++ ++ evtchn = evtchn_from_irq(irq); ++ if (evtchn == -1) ++ continue; ++ ++ unmask_evtchn(evtchn); ++ } ++ ++ if (pirq_eoi_does_unmask) { ++ struct physdev_pirq_eoi_gmfn eoi_gmfn; ++ ++ eoi_gmfn.gmfn = virt_to_mfn(pirq_needs_eoi_bits); ++ if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) != 0) { ++ /* Could recover by reverting to old method...? */ ++ BUG(); ++ } ++ } + } + + static struct irq_chip xen_dynamic_chip __read_mostly = { + .name = "xen-dyn", + +- .disable = disable_dynirq, +- .mask = disable_dynirq, +- .unmask = enable_dynirq, ++ .disable = mask_irq, ++ .mask = mask_irq, ++ .unmask = unmask_irq, + +- .ack = ack_dynirq, ++ .eoi = ack_dynirq, + .set_affinity = set_affinity_irq, +- .retrigger = retrigger_dynirq, ++ .retrigger = retrigger_irq, + }; + + static struct irq_chip xen_percpu_chip __read_mostly = { + .name = "xen-percpu", + +- .disable = disable_dynirq, +- .mask = disable_dynirq, +- .unmask = enable_dynirq, ++ .disable = mask_irq, ++ .mask = mask_irq, ++ .unmask = unmask_irq, + + .ack = ack_dynirq, + }; + ++static struct irq_chip xen_pirq_chip __read_mostly = { ++ .name = "xen-pirq", ++ ++ .startup = startup_pirq, ++ .shutdown = shutdown_pirq, ++ ++ .enable = pirq_eoi, ++ .unmask = unmask_irq, ++ ++ .disable = mask_irq, ++ .mask = mask_irq, ++ ++ .eoi = ack_pirq, ++ .end = end_pirq, ++ ++ .set_affinity = set_affinity_irq, ++ ++ .retrigger = retrigger_irq, ++}; ++ ++int xen_set_callback_via(uint64_t via) ++{ ++ struct xen_hvm_param a; ++ a.domid = DOMID_SELF; ++ a.index = HVM_PARAM_CALLBACK_IRQ; ++ a.value = via; ++ return HYPERVISOR_hvm_op(HVMOP_set_param, &a); ++} ++EXPORT_SYMBOL_GPL(xen_set_callback_via); ++ ++#ifdef CONFIG_XEN_PVHVM ++/* Vector callbacks are better than PCI interrupts to receive event ++ * channel notifications because we can receive vector callbacks on any ++ * vcpu and we don't need PCI support or APIC interactions. 
*/ ++void xen_callback_vector(void) ++{ ++ int rc; ++ uint64_t callback_via; ++ if (xen_have_vector_callback) { ++ callback_via = HVM_CALLBACK_VECTOR(XEN_HVM_EVTCHN_CALLBACK); ++ rc = xen_set_callback_via(callback_via); ++ if (rc) { ++ printk(KERN_ERR "Request for Xen HVM callback vector" ++ " failed.\n"); ++ xen_have_vector_callback = 0; ++ return; ++ } ++ printk(KERN_INFO "Xen HVM callback vector for event delivery is " ++ "enabled\n"); ++ alloc_intr_gate(XEN_HVM_EVTCHN_CALLBACK, xen_hvm_callback_vector); ++ } ++} ++#else ++void xen_callback_vector(void) {} ++#endif ++ + void __init xen_init_IRQ(void) + { + int i; ++ struct physdev_pirq_eoi_gmfn eoi_gmfn; ++ int nr_pirqs = NR_IRQS; + + cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s), + GFP_KERNEL); +- BUG_ON(cpu_evtchn_mask_p == NULL); ++ irq_info = kcalloc(nr_irqs, sizeof(*irq_info), GFP_KERNEL); ++ ++ evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), ++ GFP_KERNEL); ++ for(i = 0; i < NR_EVENT_CHANNELS; i++) ++ evtchn_to_irq[i] = -1; ++ ++ i = get_order(sizeof(unsigned long) * BITS_TO_LONGS(nr_pirqs)); ++ pirq_needs_eoi_bits = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, i); ++ ++ eoi_gmfn.gmfn = virt_to_mfn(pirq_needs_eoi_bits); ++ if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) == 0) ++ pirq_eoi_does_unmask = true; + + init_evtchn_cpu_bindings(); + +@@ -954,5 +1535,11 @@ void __init xen_init_IRQ(void) + for (i = 0; i < NR_EVENT_CHANNELS; i++) + mask_evtchn(i); + +- irq_ctx_init(smp_processor_id()); ++ if (xen_hvm_domain()) { ++ xen_callback_vector(); ++ native_init_IRQ(); ++ } else { ++ irq_ctx_init(smp_processor_id()); ++ xen_setup_pirqs(); ++ } + } +diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c +index 79bedba..b82666a 100644 +--- a/drivers/xen/evtchn.c ++++ b/drivers/xen/evtchn.c +@@ -48,6 +48,8 @@ + #include <linux/gfp.h> + #include <linux/mutex.h> + #include <linux/cpu.h> ++ ++#include <xen/xen.h> + #include <xen/events.h> + #include <xen/evtchn.h> + #include <asm/xen/hypervisor.h> +@@ -68,10 +70,36 @@ struct per_user_data { + const char *name; + }; + +-/* Who's bound to each port? */ +-static struct per_user_data *port_user[NR_EVENT_CHANNELS]; ++/* ++ * Who's bound to each port? This is logically an array of struct ++ * per_user_data *, but we encode the current enabled-state in bit 0. 
++ */ ++static unsigned long *port_user; + static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */ + ++static inline struct per_user_data *get_port_user(unsigned port) ++{ ++ return (struct per_user_data *)(port_user[port] & ~1); ++} ++ ++static inline void set_port_user(unsigned port, struct per_user_data *u) ++{ ++ port_user[port] = (unsigned long)u; ++} ++ ++static inline bool get_port_enabled(unsigned port) ++{ ++ return port_user[port] & 1; ++} ++ ++static inline void set_port_enabled(unsigned port, bool enabled) ++{ ++ if (enabled) ++ port_user[port] |= 1; ++ else ++ port_user[port] &= ~1; ++} ++ + irqreturn_t evtchn_interrupt(int irq, void *data) + { + unsigned int port = (unsigned long)data; +@@ -79,9 +107,14 @@ irqreturn_t evtchn_interrupt(int irq, void *data) + + spin_lock(&port_user_lock); + +- u = port_user[port]; ++ u = get_port_user(port); ++ ++ WARN(!get_port_enabled(port), ++ "Interrupt for port %d, but apparently not enabled; per-user %p\n", ++ port, u); + + disable_irq_nosync(irq); ++ set_port_enabled(port, false); + + if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) { + u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port; +@@ -91,9 +124,8 @@ irqreturn_t evtchn_interrupt(int irq, void *data) + kill_fasync(&u->evtchn_async_queue, + SIGIO, POLL_IN); + } +- } else { ++ } else + u->ring_overflow = 1; +- } + + spin_unlock(&port_user_lock); + +@@ -197,9 +229,18 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, + goto out; + + spin_lock_irq(&port_user_lock); +- for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) +- if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u)) +- enable_irq(irq_from_evtchn(kbuf[i])); ++ ++ for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) { ++ unsigned port = kbuf[i]; ++ ++ if (port < NR_EVENT_CHANNELS && ++ get_port_user(port) == u && ++ !get_port_enabled(port)) { ++ set_port_enabled(port, true); ++ enable_irq(irq_from_evtchn(port)); ++ } ++ } ++ + spin_unlock_irq(&port_user_lock); + + rc = count; +@@ -221,8 +262,9 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port) + * interrupt handler yet, and our caller has already + * serialized bind operations.) 
+ */ +- BUG_ON(port_user[port] != NULL); +- port_user[port] = u; ++ BUG_ON(get_port_user(port) != NULL); ++ set_port_user(port, u); ++ set_port_enabled(port, true); /* start enabled */ + + rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED, + u->name, (void *)(unsigned long)port); +@@ -238,10 +280,7 @@ static void evtchn_unbind_from_user(struct per_user_data *u, int port) + + unbind_from_irqhandler(irq, (void *)(unsigned long)port); + +- /* make sure we unbind the irq handler before clearing the port */ +- barrier(); +- +- port_user[port] = NULL; ++ set_port_user(port, NULL); + } + + static long evtchn_ioctl(struct file *file, +@@ -332,15 +371,17 @@ static long evtchn_ioctl(struct file *file, + spin_lock_irq(&port_user_lock); + + rc = -ENOTCONN; +- if (port_user[unbind.port] != u) { ++ if (get_port_user(unbind.port) != u) { + spin_unlock_irq(&port_user_lock); + break; + } + +- evtchn_unbind_from_user(u, unbind.port); ++ disable_irq(irq_from_evtchn(unbind.port)); + + spin_unlock_irq(&port_user_lock); + ++ evtchn_unbind_from_user(u, unbind.port); ++ + rc = 0; + break; + } +@@ -354,7 +395,7 @@ static long evtchn_ioctl(struct file *file, + + if (notify.port >= NR_EVENT_CHANNELS) { + rc = -EINVAL; +- } else if (port_user[notify.port] != u) { ++ } else if (get_port_user(notify.port) != u) { + rc = -ENOTCONN; + } else { + notify_remote_via_evtchn(notify.port); +@@ -443,14 +484,21 @@ static int evtchn_release(struct inode *inode, struct file *filp) + free_page((unsigned long)u->ring); + + for (i = 0; i < NR_EVENT_CHANNELS; i++) { +- if (port_user[i] != u) ++ if (get_port_user(i) != u) + continue; + +- evtchn_unbind_from_user(port_user[i], i); ++ disable_irq(irq_from_evtchn(i)); + } + + spin_unlock_irq(&port_user_lock); + ++ for (i = 0; i < NR_EVENT_CHANNELS; i++) { ++ if (get_port_user(i) != u) ++ continue; ++ ++ evtchn_unbind_from_user(get_port_user(i), i); ++ } ++ + kfree(u->name); + kfree(u); + +@@ -470,7 +518,7 @@ static const struct file_operations evtchn_fops = { + + static struct miscdevice evtchn_miscdev = { + .minor = MISC_DYNAMIC_MINOR, +- .name = "evtchn", ++ .name = "xen/evtchn", + .fops = &evtchn_fops, + }; + static int __init evtchn_init(void) +@@ -480,8 +528,11 @@ static int __init evtchn_init(void) + if (!xen_domain()) + return -ENODEV; + ++ port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL); ++ if (port_user == NULL) ++ return -ENOMEM; ++ + spin_lock_init(&port_user_lock); +- memset(port_user, 0, sizeof(port_user)); + + /* Create '/dev/misc/evtchn'. */ + err = misc_register(&evtchn_miscdev); +@@ -497,6 +548,9 @@ static int __init evtchn_init(void) + + static void __exit evtchn_cleanup(void) + { ++ kfree(port_user); ++ port_user = NULL; ++ + misc_deregister(&evtchn_miscdev); + } + +diff --git a/drivers/xen/features.c b/drivers/xen/features.c +index 99eda16..9e2b64f 100644 +--- a/drivers/xen/features.c ++++ b/drivers/xen/features.c +@@ -18,7 +18,7 @@ + u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; + EXPORT_SYMBOL_GPL(xen_features); + +-void xen_setup_features(void) ++void __init xen_setup_features(void) + { + struct xen_feature_info fi; + int i, j; +diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c +new file mode 100644 +index 0000000..a33e443 +--- /dev/null ++++ b/drivers/xen/gntdev.c +@@ -0,0 +1,645 @@ ++/****************************************************************************** ++ * gntdev.c ++ * ++ * Device for accessing (in user-space) pages that have been granted by other ++ * domains. 
++ * ++ * Copyright (c) 2006-2007, D G Murray. ++ * (c) 2009 Gerd Hoffmann <kraxel@redhat.com> ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#include <linux/module.h> ++#include <linux/kernel.h> ++#include <linux/init.h> ++#include <linux/miscdevice.h> ++#include <linux/fs.h> ++#include <linux/mm.h> ++#include <linux/mman.h> ++#include <linux/mmu_notifier.h> ++#include <linux/types.h> ++#include <linux/uaccess.h> ++#include <linux/sched.h> ++#include <linux/spinlock.h> ++ ++#include <xen/xen.h> ++#include <xen/grant_table.h> ++#include <xen/gntdev.h> ++#include <asm/xen/hypervisor.h> ++#include <asm/xen/hypercall.h> ++#include <asm/xen/page.h> ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, " ++ "Gerd Hoffmann <kraxel@redhat.com>"); ++MODULE_DESCRIPTION("User-space granted page access driver"); ++ ++static int debug = 0; ++module_param(debug, int, 0644); ++static int limit = 1024; ++module_param(limit, int, 0644); ++ ++struct gntdev_priv { ++ struct list_head maps; ++ uint32_t used; ++ uint32_t limit; ++ spinlock_t lock; ++ struct mm_struct *mm; ++ struct mmu_notifier mn; ++}; ++ ++struct grant_map { ++ struct list_head next; ++ struct gntdev_priv *priv; ++ struct vm_area_struct *vma; ++ int index; ++ int count; ++ int flags; ++ int is_mapped; ++ struct ioctl_gntdev_grant_ref *grants; ++ struct gnttab_map_grant_ref *map_ops; ++ struct gnttab_unmap_grant_ref *unmap_ops; ++}; ++ ++/* ------------------------------------------------------------------ */ ++ ++static void gntdev_print_maps(struct gntdev_priv *priv, ++ char *text, int text_index) ++{ ++ struct grant_map *map; ++ ++ printk("%s: maps list (priv %p, usage %d/%d)\n", ++ __FUNCTION__, priv, priv->used, priv->limit); ++ list_for_each_entry(map, &priv->maps, next) ++ printk(" index %2d, count %2d %s\n", ++ map->index, map->count, ++ map->index == text_index && text ? 
text : ""); ++} ++ ++static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) ++{ ++ struct grant_map *add; ++ ++ add = kzalloc(sizeof(struct grant_map), GFP_KERNEL); ++ if (NULL == add) ++ return NULL; ++ ++ add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL); ++ add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL); ++ add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL); ++ if (NULL == add->grants || ++ NULL == add->map_ops || ++ NULL == add->unmap_ops) ++ goto err; ++ ++ add->index = 0; ++ add->count = count; ++ add->priv = priv; ++ ++ if (add->count + priv->used > priv->limit) ++ goto err; ++ ++ return add; ++ ++err: ++ kfree(add->grants); ++ kfree(add->map_ops); ++ kfree(add->unmap_ops); ++ kfree(add); ++ return NULL; ++} ++ ++static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add) ++{ ++ struct grant_map *map; ++ ++ list_for_each_entry(map, &priv->maps, next) { ++ if (add->index + add->count < map->index) { ++ list_add_tail(&add->next, &map->next); ++ goto done; ++ } ++ add->index = map->index + map->count; ++ } ++ list_add_tail(&add->next, &priv->maps); ++ ++done: ++ priv->used += add->count; ++ if (debug) ++ gntdev_print_maps(priv, "[new]", add->index); ++} ++ ++static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, int index, ++ int count) ++{ ++ struct grant_map *map; ++ ++ list_for_each_entry(map, &priv->maps, next) { ++ if (map->index != index) ++ continue; ++ if (map->count != count) ++ continue; ++ return map; ++ } ++ return NULL; ++} ++ ++static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv, ++ unsigned long vaddr) ++{ ++ struct grant_map *map; ++ ++ list_for_each_entry(map, &priv->maps, next) { ++ if (!map->vma) ++ continue; ++ if (vaddr < map->vma->vm_start) ++ continue; ++ if (vaddr >= map->vma->vm_end) ++ continue; ++ return map; ++ } ++ return NULL; ++} ++ ++static int gntdev_del_map(struct grant_map *map) ++{ ++ int i; ++ ++ if (map->vma) ++ return -EBUSY; ++ for (i = 0; i < map->count; i++) ++ if (map->unmap_ops[i].handle) ++ return -EBUSY; ++ ++ map->priv->used -= map->count; ++ list_del(&map->next); ++ return 0; ++} ++ ++static void gntdev_free_map(struct grant_map *map) ++{ ++ if (!map) ++ return; ++ kfree(map->grants); ++ kfree(map->map_ops); ++ kfree(map->unmap_ops); ++ kfree(map); ++} ++ ++/* ------------------------------------------------------------------ */ ++ ++static int find_grant_ptes(pte_t *pte, pgtable_t token, unsigned long addr, void *data) ++{ ++ struct grant_map *map = data; ++ unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT; ++ u64 pte_maddr; ++ ++ BUG_ON(pgnr >= map->count); ++ pte_maddr = (u64)pfn_to_mfn(page_to_pfn(token)) << PAGE_SHIFT; ++ pte_maddr += (unsigned long)pte & ~PAGE_MASK; ++ gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, map->flags, ++ map->grants[pgnr].ref, ++ map->grants[pgnr].domid); ++ gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, map->flags, ++ 0 /* handle */); ++ return 0; ++} ++ ++static int map_grant_pages(struct grant_map *map) ++{ ++ int i, err = 0; ++ ++ if (debug) ++ printk("%s: map %d+%d\n", __FUNCTION__, map->index, map->count); ++ err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, ++ map->map_ops, map->count); ++ if (WARN_ON(err)) ++ return err; ++ ++ for (i = 0; i < map->count; i++) { ++ if (map->map_ops[i].status) ++ err = -EINVAL; ++ map->unmap_ops[i].handle = map->map_ops[i].handle; ++ } ++ return err; ++} ++ ++static int unmap_grant_pages(struct grant_map 
*map, int offset, int pages) ++{ ++ int i, err = 0; ++ ++ if (debug) ++ printk("%s: map %d+%d [%d+%d]\n", __FUNCTION__, ++ map->index, map->count, offset, pages); ++ err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ++ map->unmap_ops + offset, pages); ++ if (WARN_ON(err)) ++ return err; ++ ++ for (i = 0; i < pages; i++) { ++ if (map->unmap_ops[offset+i].status) ++ err = -EINVAL; ++ map->unmap_ops[offset+i].handle = 0; ++ } ++ return err; ++} ++ ++/* ------------------------------------------------------------------ */ ++ ++static void gntdev_vma_close(struct vm_area_struct *vma) ++{ ++ struct grant_map *map = vma->vm_private_data; ++ ++ if (debug) ++ printk("%s\n", __FUNCTION__); ++ map->is_mapped = 0; ++ map->vma = NULL; ++ vma->vm_private_data = NULL; ++} ++ ++static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ++{ ++ if (debug) ++ printk("%s: vaddr %p, pgoff %ld (shouldn't happen)\n", ++ __FUNCTION__, vmf->virtual_address, vmf->pgoff); ++ vmf->flags = VM_FAULT_ERROR; ++ return 0; ++} ++ ++static struct vm_operations_struct gntdev_vmops = { ++ .close = gntdev_vma_close, ++ .fault = gntdev_vma_fault, ++}; ++ ++/* ------------------------------------------------------------------ */ ++ ++static void mn_invl_range_start(struct mmu_notifier *mn, ++ struct mm_struct *mm, ++ unsigned long start, unsigned long end) ++{ ++ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); ++ struct grant_map *map; ++ unsigned long mstart, mend; ++ int err; ++ ++ spin_lock(&priv->lock); ++ list_for_each_entry(map, &priv->maps, next) { ++ if (!map->vma) ++ continue; ++ if (!map->is_mapped) ++ continue; ++ if (map->vma->vm_start >= end) ++ continue; ++ if (map->vma->vm_end <= start) ++ continue; ++ mstart = max(start, map->vma->vm_start); ++ mend = min(end, map->vma->vm_end); ++ if (debug) ++ printk("%s: map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", ++ __FUNCTION__, map->index, map->count, ++ map->vma->vm_start, map->vma->vm_end, ++ start, end, mstart, mend); ++ err = unmap_grant_pages(map, ++ (mstart - map->vma->vm_start) >> PAGE_SHIFT, ++ (mend - mstart) >> PAGE_SHIFT); ++ WARN_ON(err); ++ } ++ spin_unlock(&priv->lock); ++} ++ ++static void mn_invl_page(struct mmu_notifier *mn, ++ struct mm_struct *mm, ++ unsigned long address) ++{ ++ mn_invl_range_start(mn, mm, address, address + PAGE_SIZE); ++} ++ ++static void mn_release(struct mmu_notifier *mn, ++ struct mm_struct *mm) ++{ ++ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); ++ struct grant_map *map; ++ int err; ++ ++ spin_lock(&priv->lock); ++ list_for_each_entry(map, &priv->maps, next) { ++ if (!map->vma) ++ continue; ++ if (debug) ++ printk("%s: map %d+%d (%lx %lx)\n", ++ __FUNCTION__, map->index, map->count, ++ map->vma->vm_start, map->vma->vm_end); ++ err = unmap_grant_pages(map, 0, map->count); ++ WARN_ON(err); ++ } ++ spin_unlock(&priv->lock); ++} ++ ++struct mmu_notifier_ops gntdev_mmu_ops = { ++ .release = mn_release, ++ .invalidate_page = mn_invl_page, ++ .invalidate_range_start = mn_invl_range_start, ++}; ++ ++/* ------------------------------------------------------------------ */ ++ ++static int gntdev_open(struct inode *inode, struct file *flip) ++{ ++ struct gntdev_priv *priv; ++ ++ priv = kzalloc(sizeof(*priv), GFP_KERNEL); ++ if (!priv) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&priv->maps); ++ spin_lock_init(&priv->lock); ++ priv->limit = limit; ++ ++ priv->mm = get_task_mm(current); ++ if (!priv->mm) { ++ kfree(priv); ++ return -ENOMEM; ++ } ++ priv->mn.ops = 
&gntdev_mmu_ops; ++ mmu_notifier_register(&priv->mn, priv->mm); ++ mmput(priv->mm); ++ ++ flip->private_data = priv; ++ if (debug) ++ printk("%s: priv %p\n", __FUNCTION__, priv); ++ ++ return 0; ++} ++ ++static int gntdev_release(struct inode *inode, struct file *flip) ++{ ++ struct gntdev_priv *priv = flip->private_data; ++ struct grant_map *map; ++ int err; ++ ++ if (debug) ++ printk("%s: priv %p\n", __FUNCTION__, priv); ++ ++ spin_lock(&priv->lock); ++ while (!list_empty(&priv->maps)) { ++ map = list_entry(priv->maps.next, struct grant_map, next); ++ err = gntdev_del_map(map); ++ if (WARN_ON(err)) ++ gntdev_free_map(map); ++ ++ } ++ spin_unlock(&priv->lock); ++ ++ mmu_notifier_unregister(&priv->mn, priv->mm); ++ kfree(priv); ++ return 0; ++} ++ ++static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, ++ struct ioctl_gntdev_map_grant_ref __user *u) ++{ ++ struct ioctl_gntdev_map_grant_ref op; ++ struct grant_map *map; ++ int err; ++ ++ if (copy_from_user(&op, u, sizeof(op)) != 0) ++ return -EFAULT; ++ if (debug) ++ printk("%s: priv %p, add %d\n", __FUNCTION__, priv, ++ op.count); ++ if (unlikely(op.count <= 0)) ++ return -EINVAL; ++ if (unlikely(op.count > priv->limit)) ++ return -EINVAL; ++ ++ err = -ENOMEM; ++ map = gntdev_alloc_map(priv, op.count); ++ if (!map) ++ return err; ++ if (copy_from_user(map->grants, &u->refs, ++ sizeof(map->grants[0]) * op.count) != 0) { ++ gntdev_free_map(map); ++ return err; ++ } ++ ++ spin_lock(&priv->lock); ++ gntdev_add_map(priv, map); ++ op.index = map->index << PAGE_SHIFT; ++ spin_unlock(&priv->lock); ++ ++ if (copy_to_user(u, &op, sizeof(op)) != 0) { ++ spin_lock(&priv->lock); ++ gntdev_del_map(map); ++ spin_unlock(&priv->lock); ++ gntdev_free_map(map); ++ return err; ++ } ++ return 0; ++} ++ ++static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, ++ struct ioctl_gntdev_unmap_grant_ref __user *u) ++{ ++ struct ioctl_gntdev_unmap_grant_ref op; ++ struct grant_map *map; ++ int err = -EINVAL; ++ ++ if (copy_from_user(&op, u, sizeof(op)) != 0) ++ return -EFAULT; ++ if (debug) ++ printk("%s: priv %p, del %d+%d\n", __FUNCTION__, priv, ++ (int)op.index, (int)op.count); ++ ++ spin_lock(&priv->lock); ++ map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); ++ if (map) ++ err = gntdev_del_map(map); ++ spin_unlock(&priv->lock); ++ if (!err) ++ gntdev_free_map(map); ++ return err; ++} ++ ++static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv, ++ struct ioctl_gntdev_get_offset_for_vaddr __user *u) ++{ ++ struct ioctl_gntdev_get_offset_for_vaddr op; ++ struct grant_map *map; ++ ++ if (copy_from_user(&op, u, sizeof(op)) != 0) ++ return -EFAULT; ++ if (debug) ++ printk("%s: priv %p, offset for vaddr %lx\n", __FUNCTION__, priv, ++ (unsigned long)op.vaddr); ++ ++ spin_lock(&priv->lock); ++ map = gntdev_find_map_vaddr(priv, op.vaddr); ++ if (map == NULL || ++ map->vma->vm_start != op.vaddr) { ++ spin_unlock(&priv->lock); ++ return -EINVAL; ++ } ++ op.offset = map->index << PAGE_SHIFT; ++ op.count = map->count; ++ spin_unlock(&priv->lock); ++ ++ if (copy_to_user(u, &op, sizeof(op)) != 0) ++ return -EFAULT; ++ return 0; ++} ++ ++static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv, ++ struct ioctl_gntdev_set_max_grants __user *u) ++{ ++ struct ioctl_gntdev_set_max_grants op; ++ ++ if (copy_from_user(&op, u, sizeof(op)) != 0) ++ return -EFAULT; ++ if (debug) ++ printk("%s: priv %p, limit %d\n", __FUNCTION__, priv, op.count); ++ if (op.count > limit) ++ return -EINVAL; ++ ++ spin_lock(&priv->lock); ++ 
priv->limit = op.count; ++ spin_unlock(&priv->lock); ++ return 0; ++} ++ ++static long gntdev_ioctl(struct file *flip, ++ unsigned int cmd, unsigned long arg) ++{ ++ struct gntdev_priv *priv = flip->private_data; ++ void __user *ptr = (void __user *)arg; ++ ++ switch (cmd) { ++ case IOCTL_GNTDEV_MAP_GRANT_REF: ++ return gntdev_ioctl_map_grant_ref(priv, ptr); ++ ++ case IOCTL_GNTDEV_UNMAP_GRANT_REF: ++ return gntdev_ioctl_unmap_grant_ref(priv, ptr); ++ ++ case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR: ++ return gntdev_ioctl_get_offset_for_vaddr(priv, ptr); ++ ++ case IOCTL_GNTDEV_SET_MAX_GRANTS: ++ return gntdev_ioctl_set_max_grants(priv, ptr); ++ ++ default: ++ if (debug) ++ printk("%s: priv %p, unknown cmd %x\n", ++ __FUNCTION__, priv, cmd); ++ return -ENOIOCTLCMD; ++ } ++ ++ return 0; ++} ++ ++static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) ++{ ++ struct gntdev_priv *priv = flip->private_data; ++ int index = vma->vm_pgoff; ++ int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; ++ struct grant_map *map; ++ int err = -EINVAL; ++ ++ if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) ++ return -EINVAL; ++ ++ if (debug) ++ printk("%s: map %d+%d at %lx (pgoff %lx)\n", __FUNCTION__, ++ index, count, vma->vm_start, vma->vm_pgoff); ++ ++ spin_lock(&priv->lock); ++ map = gntdev_find_map_index(priv, index, count); ++ if (!map) ++ goto unlock_out; ++ if (map->vma) ++ goto unlock_out; ++ if (priv->mm != vma->vm_mm) { ++ printk("%s: Huh? Other mm?\n", __FUNCTION__); ++ goto unlock_out; ++ } ++ ++ vma->vm_ops = &gntdev_vmops; ++ ++ vma->vm_flags |= VM_RESERVED; ++ vma->vm_flags |= VM_DONTCOPY; ++ vma->vm_flags |= VM_DONTEXPAND; ++ ++ vma->vm_private_data = map; ++ map->vma = vma; ++ ++ map->flags = GNTMAP_host_map | GNTMAP_application_map | GNTMAP_contains_pte; ++ if (!(vma->vm_flags & VM_WRITE)) ++ map->flags |= GNTMAP_readonly; ++ ++ err = apply_to_page_range(vma->vm_mm, vma->vm_start, ++ vma->vm_end - vma->vm_start, ++ find_grant_ptes, map); ++ if (err) { ++ goto unlock_out; ++ if (debug) ++ printk("%s: find_grant_ptes() failure.\n", __FUNCTION__); ++ } ++ ++ err = map_grant_pages(map); ++ if (err) { ++ goto unlock_out; ++ if (debug) ++ printk("%s: map_grant_pages() failure.\n", __FUNCTION__); ++ } ++ map->is_mapped = 1; ++ ++unlock_out: ++ spin_unlock(&priv->lock); ++ return err; ++} ++ ++static const struct file_operations gntdev_fops = { ++ .owner = THIS_MODULE, ++ .open = gntdev_open, ++ .release = gntdev_release, ++ .mmap = gntdev_mmap, ++ .unlocked_ioctl = gntdev_ioctl ++}; ++ ++static struct miscdevice gntdev_miscdev = { ++ .minor = MISC_DYNAMIC_MINOR, ++ .name = "xen/gntdev", ++ .fops = &gntdev_fops, ++}; ++ ++/* ------------------------------------------------------------------ */ ++ ++static int __init gntdev_init(void) ++{ ++ int err; ++ ++ if (!xen_domain()) ++ return -ENODEV; ++ ++ err = misc_register(&gntdev_miscdev); ++ if (err != 0) { ++ printk(KERN_ERR "Could not register gntdev device\n"); ++ return err; ++ } ++ return 0; ++} ++ ++static void __exit gntdev_exit(void) ++{ ++ misc_deregister(&gntdev_miscdev); ++} ++ ++module_init(gntdev_init); ++module_exit(gntdev_exit); ++ ++/* ------------------------------------------------------------------ */ +diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c +index 7d8f531..09bb742 100644 +--- a/drivers/xen/grant-table.c ++++ b/drivers/xen/grant-table.c +@@ -36,10 +36,13 @@ + #include <linux/mm.h> + #include <linux/vmalloc.h> + #include <linux/uaccess.h> ++#include <linux/io.h> + ++#include 
<xen/xen.h> + #include <xen/interface/xen.h> + #include <xen/page.h> + #include <xen/grant_table.h> ++#include <xen/interface/memory.h> + #include <asm/xen/hypercall.h> + + #include <asm/pgtable.h> +@@ -57,6 +60,8 @@ static unsigned int boot_max_nr_grant_frames; + static int gnttab_free_count; + static grant_ref_t gnttab_free_head; + static DEFINE_SPINLOCK(gnttab_list_lock); ++unsigned long xen_hvm_resume_frames; ++EXPORT_SYMBOL_GPL(xen_hvm_resume_frames); + + static struct grant_entry *shared; + +@@ -431,7 +436,7 @@ static unsigned int __max_nr_grant_frames(void) + return query.max_nr_frames; + } + +-static inline unsigned int max_nr_grant_frames(void) ++unsigned int gnttab_max_grant_frames(void) + { + unsigned int xen_max = __max_nr_grant_frames(); + +@@ -439,6 +444,7 @@ static inline unsigned int max_nr_grant_frames(void) + return boot_max_nr_grant_frames; + return xen_max; + } ++EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); + + static int gnttab_map(unsigned int start_idx, unsigned int end_idx) + { +@@ -447,6 +453,30 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) + unsigned int nr_gframes = end_idx + 1; + int rc; + ++ if (xen_hvm_domain()) { ++ struct xen_add_to_physmap xatp; ++ unsigned int i = end_idx; ++ rc = 0; ++ /* ++ * Loop backwards, so that the first hypercall has the largest ++ * index, ensuring that the table will grow only once. ++ */ ++ do { ++ xatp.domid = DOMID_SELF; ++ xatp.idx = i; ++ xatp.space = XENMAPSPACE_grant_table; ++ xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i; ++ rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); ++ if (rc != 0) { ++ printk(KERN_WARNING ++ "grant table add_to_physmap failed, err=%d\n", rc); ++ break; ++ } ++ } while (i-- > start_idx); ++ ++ return rc; ++ } ++ + frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC); + if (!frames) + return -ENOMEM; +@@ -463,7 +493,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) + + BUG_ON(rc || setup.status); + +- rc = arch_gnttab_map_shared(frames, nr_gframes, max_nr_grant_frames(), ++ rc = arch_gnttab_map_shared(frames, nr_gframes, gnttab_max_grant_frames(), + &shared); + BUG_ON(rc); + +@@ -472,11 +502,127 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) + return 0; + } + ++static void gnttab_page_free(struct page *page, unsigned int order) ++{ ++ BUG_ON(order); ++ ClearPageForeign(page); ++ gnttab_reset_grant_page(page); ++ put_page(page); ++} ++ ++/* ++ * Must not be called with IRQs off. This should only be used on the ++ * slow path. ++ * ++ * Copy a foreign granted page to local memory. ++ */ ++int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep) ++{ ++ struct gnttab_unmap_and_replace unmap; ++ struct mmu_update mmu; ++ struct page *page; ++ struct page *new_page; ++ void *new_addr; ++ void *addr; ++ unsigned long pfn; ++ unsigned long mfn; ++ unsigned long new_mfn; ++ int err; ++ ++ page = *pagep; ++ if (!get_page_unless_zero(page)) ++ return -ENOENT; ++ ++ err = -ENOMEM; ++ new_page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); ++ if (!new_page) ++ goto out; ++ ++ new_addr = page_address(new_page); ++ addr = page_address(page); ++ memcpy(new_addr, addr, PAGE_SIZE); ++ ++ pfn = page_to_pfn(page); ++ mfn = pfn_to_mfn(pfn); ++ new_mfn = virt_to_mfn(new_addr); ++ ++ /* Make seq visible before checking page_mapped. */ ++ smp_mb(); ++ ++ /* Has the page been DMA-mapped? 
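
The gnttab_map() hunk above grows an HVM guest's grant table by issuing XENMEM_add_to_physmap calls from the highest frame index downwards, so the hypervisor resizes the table once instead of on every call. A stubbed illustration of just that ordering, with the hypercall replaced by a hypothetical printf helper:

#include <stdio.h>

/* Stand-in for HYPERVISOR_memory_op(XENMEM_add_to_physmap, ...). */
static int add_to_physmap(unsigned int idx)
{
	printf("XENMEM_add_to_physmap idx=%u\n", idx);
	return 0;
}

static int map_frames(unsigned int start_idx, unsigned int end_idx)
{
	unsigned int i = end_idx;
	int rc;

	/* Same loop shape as the patch: the largest index goes first. */
	do {
		rc = add_to_physmap(i);
		if (rc != 0)
			break;
	} while (i-- > start_idx);

	return rc;
}

int main(void)
{
	return map_frames(2, 5);	/* prints indices 5, 4, 3, 2 */
}
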
*/ ++ if (unlikely(page_mapped(page))) { ++ put_page(new_page); ++ err = -EBUSY; ++ goto out; ++ } ++ ++ if (!xen_feature(XENFEAT_auto_translated_physmap)) ++ set_phys_to_machine(pfn, new_mfn); ++ ++ unmap.host_addr = (unsigned long)addr; ++ unmap.new_addr = (unsigned long)new_addr; ++ unmap.handle = ref; ++ ++ err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace, ++ &unmap, 1); ++ BUG_ON(err); ++ BUG_ON(unmap.status); ++ ++ if (!xen_feature(XENFEAT_auto_translated_physmap)) { ++ set_phys_to_machine(page_to_pfn(new_page), INVALID_P2M_ENTRY); ++ ++ mmu.ptr = PFN_PHYS(new_mfn) | MMU_MACHPHYS_UPDATE; ++ mmu.val = pfn; ++ err = HYPERVISOR_mmu_update(&mmu, 1, NULL, DOMID_SELF); ++ BUG_ON(err); ++ } ++ ++ new_page->mapping = page->mapping; ++ SetPageForeign(new_page, _PageForeignDestructor(page)); ++ if (PageReserved(page)) ++ SetPageReserved(new_page); ++ *pagep = new_page; ++ ++ SetPageForeign(page, gnttab_page_free); ++ ClearPageReserved(page); ++ page->mapping = NULL; ++ ++out: ++ put_page(page); ++ return err; ++} ++EXPORT_SYMBOL_GPL(gnttab_copy_grant_page); ++ ++void gnttab_reset_grant_page(struct page *page) ++{ ++ init_page_count(page); ++ reset_page_mapcount(page); ++} ++EXPORT_SYMBOL_GPL(gnttab_reset_grant_page); ++ + int gnttab_resume(void) + { +- if (max_nr_grant_frames() < nr_grant_frames) ++ unsigned int max_nr_gframes; ++ ++ max_nr_gframes = gnttab_max_grant_frames(); ++ if (max_nr_gframes < nr_grant_frames) + return -ENOSYS; +- return gnttab_map(0, nr_grant_frames - 1); ++ ++ if (xen_pv_domain()) ++ return gnttab_map(0, nr_grant_frames - 1); ++ ++ if (!shared) { ++ shared = ioremap(xen_hvm_resume_frames, PAGE_SIZE * max_nr_gframes); ++ if (shared == NULL) { ++ printk(KERN_WARNING ++ "Failed to ioremap gnttab share frames!"); ++ return -ENOMEM; ++ } ++ } ++ ++ gnttab_map(0, nr_grant_frames - 1); ++ ++ return 0; + } + + int gnttab_suspend(void) +@@ -493,7 +639,7 @@ static int gnttab_expand(unsigned int req_entries) + cur = nr_grant_frames; + extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) / + GREFS_PER_GRANT_FRAME); +- if (cur + extra > max_nr_grant_frames()) ++ if (cur + extra > gnttab_max_grant_frames()) + return -ENOSPC; + + rc = gnttab_map(cur, cur + extra - 1); +@@ -503,15 +649,12 @@ static int gnttab_expand(unsigned int req_entries) + return rc; + } + +-static int __devinit gnttab_init(void) ++int gnttab_init(void) + { + int i; + unsigned int max_nr_glist_frames, nr_glist_frames; + unsigned int nr_init_grefs; + +- if (!xen_domain()) +- return -ENODEV; +- + nr_grant_frames = 1; + boot_max_nr_grant_frames = __max_nr_grant_frames(); + +@@ -554,5 +697,18 @@ static int __devinit gnttab_init(void) + kfree(gnttab_list); + return -ENOMEM; + } ++EXPORT_SYMBOL_GPL(gnttab_init); ++ ++static int __devinit __gnttab_init(void) ++{ ++ /* Delay grant-table initialization in the PV on HVM case */ ++ if (xen_hvm_domain()) ++ return 0; ++ ++ if (!xen_pv_domain()) ++ return -ENODEV; ++ ++ return gnttab_init(); ++} + +-core_initcall(gnttab_init); ++core_initcall(__gnttab_init); +diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c +index 5d42d55..0b50906 100644 +--- a/drivers/xen/manage.c ++++ b/drivers/xen/manage.c +@@ -8,6 +8,7 @@ + #include <linux/stop_machine.h> + #include <linux/freezer.h> + ++#include <xen/xen.h> + #include <xen/xenbus.h> + #include <xen/grant_table.h> + #include <xen/events.h> +@@ -32,10 +33,30 @@ enum shutdown_state { + static enum shutdown_state shutting_down = SHUTDOWN_INVALID; + + #ifdef CONFIG_PM_SLEEP +-static int xen_suspend(void *data) ++static int 
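
gnttab_expand(), in the context lines of this hunk, sizes its growth with a ceiling division over GREFS_PER_GRANT_FRAME. A standalone check of that arithmetic, assuming 512 v1 grant entries (8 bytes each) per 4 KiB frame:

#include <assert.h>

#define GREFS_PER_GRANT_FRAME 512	/* assumed: 4096 / 8 */

/* Mirrors: extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) /
 *                   GREFS_PER_GRANT_FRAME); */
static unsigned int extra_frames(unsigned int req_entries)
{
	return (req_entries + (GREFS_PER_GRANT_FRAME - 1)) /
	       GREFS_PER_GRANT_FRAME;
}

int main(void)
{
	assert(extra_frames(1)   == 1);
	assert(extra_frames(512) == 1);
	assert(extra_frames(513) == 2);	/* one entry over: a whole new frame */
	return 0;
}
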
xen_hvm_suspend(void *data) + { ++ struct sched_shutdown r = { .reason = SHUTDOWN_suspend }; + int *cancelled = data; ++ ++ BUG_ON(!irqs_disabled()); ++ ++ *cancelled = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r); ++ ++ xen_hvm_post_suspend(*cancelled); ++ gnttab_resume(); ++ ++ if (!*cancelled) { ++ xen_irq_resume(); ++ xen_timer_resume(); ++ } ++ ++ return 0; ++} ++ ++static int xen_suspend(void *data) ++{ + int err; ++ int *cancelled = data; + + BUG_ON(!irqs_disabled()); + +@@ -111,7 +132,10 @@ static void do_suspend(void) + goto out_resume; + } + +- err = stop_machine(xen_suspend, &cancelled, cpumask_of(0)); ++ if (xen_hvm_domain()) ++ err = stop_machine(xen_hvm_suspend, &cancelled, cpumask_of(0)); ++ else ++ err = stop_machine(xen_suspend, &cancelled, cpumask_of(0)); + + dpm_resume_noirq(PMSG_RESUME); + +@@ -260,7 +284,19 @@ static int shutdown_event(struct notifier_block *notifier, + return NOTIFY_DONE; + } + +-static int __init setup_shutdown_event(void) ++static int __init __setup_shutdown_event(void) ++{ ++ /* Delay initialization in the PV on HVM case */ ++ if (xen_hvm_domain()) ++ return 0; ++ ++ if (!xen_pv_domain()) ++ return -ENODEV; ++ ++ return xen_setup_shutdown_event(); ++} ++ ++int xen_setup_shutdown_event(void) + { + static struct notifier_block xenstore_notifier = { + .notifier_call = shutdown_event +@@ -269,5 +305,6 @@ static int __init setup_shutdown_event(void) + + return 0; + } ++EXPORT_SYMBOL_GPL(xen_setup_shutdown_event); + +-subsys_initcall(setup_shutdown_event); ++subsys_initcall(__setup_shutdown_event); +diff --git a/drivers/xen/mce.c b/drivers/xen/mce.c +new file mode 100644 +index 0000000..da566a5 +--- /dev/null ++++ b/drivers/xen/mce.c +@@ -0,0 +1,216 @@ ++/****************************************************************************** ++ * mce.c ++ * Add Machine Check event Logging support in DOM0 ++ * ++ * Driver for receiving and logging machine check event ++ * ++ * Copyright (c) 2008, 2009 Intel Corporation ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ */ ++ ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/types.h> ++#include <linux/kernel.h> ++#include <xen/interface/xen.h> ++#include <asm/xen/hypervisor.h> ++#include <xen/events.h> ++#include <xen/interface/vcpu.h> ++#include <asm/xen/hypercall.h> ++#include <asm/mce.h> ++#include <xen/xen.h> ++ ++static mc_info_t *g_mi; ++static mcinfo_logical_cpu_t *g_physinfo; ++static uint32_t ncpus; ++ ++static int convert_log(struct mc_info *mi) ++{ ++ struct mcinfo_common *mic = NULL; ++ struct mcinfo_global *mc_global; ++ struct mcinfo_bank *mc_bank; ++ struct mce m; ++ int i, found = 0; ++ ++ x86_mcinfo_lookup(&mic, mi, MC_TYPE_GLOBAL); ++ WARN_ON(!mic); ++ ++ mce_setup(&m); ++ mc_global = (struct mcinfo_global *)mic; ++ m.mcgstatus = mc_global->mc_gstatus; ++ m.apicid = mc_global->mc_apicid; ++ for (i = 0; i < ncpus; i++) { ++ if (g_physinfo[i].mc_apicid == m.apicid) { ++ found = 1; ++ break; ++ } ++ } ++ WARN_ON(!found); ++ ++ m.socketid = g_physinfo[i].mc_chipid; ++ m.cpu = m.extcpu = g_physinfo[i].mc_cpunr; ++ m.cpuvendor = (__u8)g_physinfo[i].mc_vendor; ++ m.mcgcap = g_physinfo[i].mc_msrvalues[0].value; ++ x86_mcinfo_lookup(&mic, mi, MC_TYPE_BANK); ++ do { ++ if (mic == NULL || mic->size == 0) ++ break; ++ if (mic->type == MC_TYPE_BANK) { ++ mc_bank = (struct mcinfo_bank *)mic; ++ m.misc = mc_bank->mc_misc; ++ m.status = mc_bank->mc_status; ++ m.addr = mc_bank->mc_addr; ++ m.tsc = mc_bank->mc_tsc; ++ m.bank = mc_bank->mc_bank; ++ m.finished = 1; ++ /*log this record*/ ++ mce_log(&m); ++ } ++ mic = x86_mcinfo_next(mic); ++ } while (1); ++ ++ return 0; ++} ++ ++/*pv_ops domain mce virq handler, logging physical mce error info*/ ++static irqreturn_t mce_dom_interrupt(int irq, void *dev_id) ++{ ++ xen_mc_t mc_op; ++ int result = 0; ++ ++ mc_op.cmd = XEN_MC_fetch; ++ mc_op.interface_version = XEN_MCA_INTERFACE_VERSION; ++ set_xen_guest_handle(mc_op.u.mc_fetch.data, g_mi); ++urgent: ++ mc_op.u.mc_fetch.flags = XEN_MC_URGENT; ++ result = HYPERVISOR_mca(&mc_op); ++ if (result || mc_op.u.mc_fetch.flags & XEN_MC_NODATA || ++ mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED) ++ goto nonurgent; ++ else { ++ result = convert_log(g_mi); ++ if (result) ++ goto end; ++ /* After fetching the error event log entry from DOM0, ++ * we need to dec the refcnt and release the entry. ++ * The entry is reserved and inc refcnt when filling ++ * the error log entry. ++ */ ++ mc_op.u.mc_fetch.flags = XEN_MC_URGENT | XEN_MC_ACK; ++ result = HYPERVISOR_mca(&mc_op); ++ goto urgent; ++ } ++nonurgent: ++ mc_op.u.mc_fetch.flags = XEN_MC_NONURGENT; ++ result = HYPERVISOR_mca(&mc_op); ++ if (result || mc_op.u.mc_fetch.flags & XEN_MC_NODATA || ++ mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED) ++ goto end; ++ else { ++ result = convert_log(g_mi); ++ if (result) ++ goto end; ++ /* After fetching the error event log entry from DOM0, ++ * we need to dec the refcnt and release the entry. The ++ * entry is reserved and inc refcnt when filling the ++ * error log entry. 
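
mce_dom_interrupt() above drains the hypervisor's record queue in two passes, urgent then non-urgent, and after logging each record re-issues the fetch with XEN_MC_ACK to drop the reference the hypervisor took when filling the entry. A toy simulation of that fetch/log/ack cycle over a stubbed queue (plain C, not the hypercall interface):

#include <stdio.h>

enum { MC_OK, MC_NODATA };

static int queue[] = { 101, 102, 103 };	/* pretend MCE records */
static unsigned int head;

/* Stand-in for HYPERVISOR_mca(XEN_MC_fetch): 'ack' releases the
 * entry that the previous plain fetch returned. */
static int mc_fetch(int ack, int *rec)
{
	if (ack) {
		head++;
		return MC_OK;
	}
	if (head >= sizeof(queue) / sizeof(queue[0]))
		return MC_NODATA;
	*rec = queue[head];
	return MC_OK;
}

int main(void)
{
	int rec;

	while (mc_fetch(0, &rec) == MC_OK) {		/* fetch */
		printf("log mce record %d\n", rec);	/* convert_log() */
		mc_fetch(1, &rec);			/* fetch again, ACK set */
	}
	return 0;
}
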
++ */ ++ mc_op.u.mc_fetch.flags = XEN_MC_NONURGENT | XEN_MC_ACK; ++ result = HYPERVISOR_mca(&mc_op); ++ goto nonurgent; ++ } ++end: ++ return IRQ_HANDLED; ++} ++ ++static int bind_virq_for_mce(void) ++{ ++ int ret; ++ xen_mc_t mc_op; ++ ++ g_mi = kmalloc(sizeof(struct mc_info), GFP_KERNEL); ++ ++ if (!g_mi) ++ return -ENOMEM; ++ ++ /* Fetch physical CPU Numbers */ ++ mc_op.cmd = XEN_MC_physcpuinfo; ++ mc_op.interface_version = XEN_MCA_INTERFACE_VERSION; ++ set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); ++ ret = HYPERVISOR_mca(&mc_op); ++ if (ret) { ++ printk(KERN_ERR "MCE_DOM0_LOG: Fail to get physical CPU numbers\n"); ++ kfree(g_mi); ++ return ret; ++ } ++ ++ /* Fetch each CPU Physical Info for later reference*/ ++ ncpus = mc_op.u.mc_physcpuinfo.ncpus; ++ g_physinfo = kmalloc(sizeof(struct mcinfo_logical_cpu)*ncpus, ++ GFP_KERNEL); ++ if (!g_physinfo) { ++ kfree(g_mi); ++ return -ENOMEM; ++ } ++ set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); ++ ret = HYPERVISOR_mca(&mc_op); ++ if (ret) { ++ printk(KERN_ERR "MCE_DOM0_LOG: Fail to get physical CPUs info\n"); ++ kfree(g_mi); ++ kfree(g_physinfo); ++ return ret; ++ } ++ ++ ret = bind_virq_to_irqhandler(VIRQ_MCA, 0, ++ mce_dom_interrupt, 0, "mce", NULL); ++ ++ if (ret < 0) { ++ printk(KERN_ERR "MCE_DOM0_LOG: bind_virq for DOM0 failed\n"); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static int __init mcelog_init(void) ++{ ++ /* Only DOM0 is responsible for MCE logging */ ++ if (xen_initial_domain()) ++ return bind_virq_for_mce(); ++ ++ return 0; ++} ++ ++ ++static void __exit mcelog_cleanup(void) ++{ ++ kfree(g_mi); ++ kfree(g_physinfo); ++} ++module_init(mcelog_init); ++module_exit(mcelog_cleanup); ++ ++MODULE_LICENSE("GPL"); +diff --git a/drivers/xen/netback/Makefile b/drivers/xen/netback/Makefile +new file mode 100644 +index 0000000..e346e81 +--- /dev/null ++++ b/drivers/xen/netback/Makefile +@@ -0,0 +1,3 @@ ++obj-$(CONFIG_XEN_NETDEV_BACKEND) := xen-netback.o ++ ++xen-netback-y := netback.o xenbus.o interface.o +diff --git a/drivers/xen/netback/common.h b/drivers/xen/netback/common.h +new file mode 100644 +index 0000000..feacf5f +--- /dev/null ++++ b/drivers/xen/netback/common.h +@@ -0,0 +1,329 @@ ++/****************************************************************************** ++ * arch/xen/drivers/netif/backend/common.h ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#ifndef __NETIF__BACKEND__COMMON_H__ ++#define __NETIF__BACKEND__COMMON_H__ ++ ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/interrupt.h> ++#include <linux/slab.h> ++#include <linux/ip.h> ++#include <linux/in.h> ++#include <linux/netdevice.h> ++#include <linux/etherdevice.h> ++#include <linux/wait.h> ++#include <linux/sched.h> ++ ++#include <xen/interface/io/netif.h> ++#include <asm/io.h> ++#include <asm/pgalloc.h> ++#include <xen/interface/grant_table.h> ++#include <xen/grant_table.h> ++#include <xen/xenbus.h> ++ ++#define DPRINTK(_f, _a...) \ ++ pr_debug("(file=%s, line=%d) " _f, \ ++ __FILE__ , __LINE__ , ## _a ) ++#define IPRINTK(fmt, args...) \ ++ printk(KERN_INFO "xen_net: " fmt, ##args) ++#define WPRINTK(fmt, args...) \ ++ printk(KERN_WARNING "xen_net: " fmt, ##args) ++ ++struct xen_netif { ++ /* Unique identifier for this interface. */ ++ domid_t domid; ++ int group; ++ unsigned int handle; ++ ++ u8 fe_dev_addr[6]; ++ ++ /* Physical parameters of the comms window. */ ++ grant_handle_t tx_shmem_handle; ++ grant_ref_t tx_shmem_ref; ++ grant_handle_t rx_shmem_handle; ++ grant_ref_t rx_shmem_ref; ++ unsigned int irq; ++ ++ /* The shared rings and indexes. */ ++ struct xen_netif_tx_back_ring tx; ++ struct xen_netif_rx_back_ring rx; ++ struct vm_struct *tx_comms_area; ++ struct vm_struct *rx_comms_area; ++ ++ /* Flags that must not be set in dev->features */ ++ int features_disabled; ++ ++ /* Frontend feature information. */ ++ u8 can_sg:1; ++ u8 gso:1; ++ u8 gso_prefix:1; ++ u8 csum:1; ++ u8 smart_poll:1; ++ ++ /* Internal feature information. */ ++ u8 can_queue:1; /* can queue packets for receiver? */ ++ ++ /* Allow netif_be_start_xmit() to peek ahead in the rx request ++ * ring. This is a prediction of what rx_req_cons will be once ++ * all queued skbs are put on the ring. */ ++ RING_IDX rx_req_cons_peek; ++ ++ /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */ ++ unsigned long credit_bytes; ++ unsigned long credit_usec; ++ unsigned long remaining_credit; ++ struct timer_list credit_timeout; ++ ++ /* Enforce draining of the transmit queue. */ ++ struct timer_list tx_queue_timeout; ++ ++ /* Statistics */ ++ int nr_copied_skbs; ++ ++ /* Miscellaneous private stuff. */ ++ struct list_head list; /* scheduling list */ ++ atomic_t refcnt; ++ struct net_device *dev; ++ struct net_device_stats stats; ++ ++ unsigned int carrier; ++ ++ wait_queue_head_t waiting_to_free; ++}; ++ ++/* ++ * Implement our own carrier flag: the network stack's version causes delays ++ * when the carrier is re-enabled (in particular, dev_activate() may not ++ * immediately be called, which can cause packet loss; also the etherbridge ++ * can be rather lazy in activating its port). 
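
The credit_bytes/credit_usec/remaining_credit trio above implements per-vif transmit shaping; the code that consumes and replenishes the credit lives in netback.c, beyond this excerpt, so what follows is only a generic token-bucket sketch of the idea, not the patch's implementation:

#include <stdbool.h>
#include <stdio.h>

struct shaper {
	unsigned long credit_bytes;	/* bytes allowed per window   */
	unsigned long credit_usec;	/* window length              */
	unsigned long remaining;	/* credit left in this window */
	unsigned long window_start;	/* timestamp, in usec         */
};

static bool may_send(struct shaper *s, unsigned long now, unsigned long len)
{
	if (now - s->window_start >= s->credit_usec) {	/* new window: refill */
		s->window_start = now;
		s->remaining = s->credit_bytes;
	}
	if (len > s->remaining)
		return false;		/* over budget: caller must delay */
	s->remaining -= len;
	return true;
}

int main(void)
{
	struct shaper s = { .credit_bytes = 1500, .credit_usec = 1000,
			    .remaining = 1500 };

	printf("%d\n", may_send(&s, 10, 1000));		/* 1: fits        */
	printf("%d\n", may_send(&s, 20, 1000));		/* 0: budget gone */
	printf("%d\n", may_send(&s, 1010, 1000));	/* 1: new window  */
	return 0;
}
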
++ */ ++#define netback_carrier_on(netif) ((netif)->carrier = 1) ++#define netback_carrier_off(netif) ((netif)->carrier = 0) ++#define netback_carrier_ok(netif) ((netif)->carrier) ++ ++enum { ++ NETBK_DONT_COPY_SKB, ++ NETBK_DELAYED_COPY_SKB, ++ NETBK_ALWAYS_COPY_SKB, ++}; ++ ++extern int netbk_copy_skb_mode; ++ ++/* Function pointers into netback accelerator plugin modules */ ++struct netback_accel_hooks { ++ struct module *owner; ++ int (*probe)(struct xenbus_device *dev); ++ int (*remove)(struct xenbus_device *dev); ++}; ++ ++/* Structure to track the state of a netback accelerator plugin */ ++struct netback_accelerator { ++ struct list_head link; ++ int id; ++ char *eth_name; ++ atomic_t use_count; ++ struct netback_accel_hooks *hooks; ++}; ++ ++struct backend_info { ++ struct xenbus_device *dev; ++ struct xen_netif *netif; ++ enum xenbus_state frontend_state; ++ struct xenbus_watch hotplug_status_watch; ++ int have_hotplug_status_watch:1; ++ ++ /* State relating to the netback accelerator */ ++ void *netback_accel_priv; ++ /* The accelerator that this backend is currently using */ ++ struct netback_accelerator *accelerator; ++}; ++ ++#define NETBACK_ACCEL_VERSION 0x00010001 ++ ++/* ++ * Connect an accelerator plugin module to netback. Returns zero on ++ * success, < 0 on error, > 0 (with highest version number supported) ++ * if version mismatch. ++ */ ++extern int netback_connect_accelerator(unsigned version, ++ int id, const char *eth_name, ++ struct netback_accel_hooks *hooks); ++/* Disconnect a previously connected accelerator plugin module */ ++extern void netback_disconnect_accelerator(int id, const char *eth_name); ++ ++ ++extern ++void netback_probe_accelerators(struct backend_info *be, ++ struct xenbus_device *dev); ++extern ++void netback_remove_accelerators(struct backend_info *be, ++ struct xenbus_device *dev); ++extern ++void netif_accel_init(void); ++ ++ ++#define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE) ++#define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE) ++ ++void netif_disconnect(struct xen_netif *netif); ++ ++void netif_set_features(struct xen_netif *netif); ++struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle); ++int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref, ++ unsigned long rx_ring_ref, unsigned int evtchn); ++ ++static inline void netif_get(struct xen_netif *netif) ++{ ++ atomic_inc(&netif->refcnt); ++} ++ ++static inline void netif_put(struct xen_netif *netif) ++{ ++ if (atomic_dec_and_test(&netif->refcnt)) ++ wake_up(&netif->waiting_to_free); ++} ++ ++int netif_xenbus_init(void); ++ ++#define netif_schedulable(netif) \ ++ (netif_running((netif)->dev) && netback_carrier_ok(netif)) ++ ++void netif_schedule_work(struct xen_netif *netif); ++void netif_deschedule_work(struct xen_netif *netif); ++ ++int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev); ++struct net_device_stats *netif_be_get_stats(struct net_device *dev); ++irqreturn_t netif_be_int(int irq, void *dev_id); ++ ++static inline int netbk_can_queue(struct net_device *dev) ++{ ++ struct xen_netif *netif = netdev_priv(dev); ++ return netif->can_queue; ++} ++ ++static inline int netbk_can_sg(struct net_device *dev) ++{ ++ struct xen_netif *netif = netdev_priv(dev); ++ return netif->can_sg; ++} ++ ++struct pending_tx_info { ++ struct xen_netif_tx_request req; ++ struct xen_netif *netif; ++}; ++typedef unsigned int pending_ring_idx_t; ++ ++struct netbk_rx_meta { ++ int id; ++ int 
size; ++ int gso_size; ++}; ++ ++struct netbk_tx_pending_inuse { ++ struct list_head list; ++ unsigned long alloc_time; ++}; ++ ++#define MAX_PENDING_REQS 256 ++ ++#define MAX_BUFFER_OFFSET PAGE_SIZE ++ ++/* extra field used in struct page */ ++union page_ext { ++ struct { ++#if BITS_PER_LONG < 64 ++#define IDX_WIDTH 8 ++#define GROUP_WIDTH (BITS_PER_LONG - IDX_WIDTH) ++ unsigned int group:GROUP_WIDTH; ++ unsigned int idx:IDX_WIDTH; ++#else ++ unsigned int group, idx; ++#endif ++ } e; ++ void *mapping; ++}; ++ ++struct xen_netbk { ++ union { ++ struct { ++ struct tasklet_struct net_tx_tasklet; ++ struct tasklet_struct net_rx_tasklet; ++ } tasklet; ++ ++ struct { ++ wait_queue_head_t netbk_action_wq; ++ struct task_struct *task; ++ } kthread; ++ }; ++ ++ struct sk_buff_head rx_queue; ++ struct sk_buff_head tx_queue; ++ ++ struct timer_list net_timer; ++ struct timer_list netbk_tx_pending_timer; ++ ++ struct page **mmap_pages; ++ ++ pending_ring_idx_t pending_prod; ++ pending_ring_idx_t pending_cons; ++ pending_ring_idx_t dealloc_prod; ++ pending_ring_idx_t dealloc_cons; ++ ++ struct list_head pending_inuse_head; ++ struct list_head net_schedule_list; ++ ++ /* Protect the net_schedule_list in netif. */ ++ spinlock_t net_schedule_list_lock; ++ ++ atomic_t netfront_count; ++ ++ struct pending_tx_info pending_tx_info[MAX_PENDING_REQS]; ++ struct netbk_tx_pending_inuse pending_inuse[MAX_PENDING_REQS]; ++ struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS]; ++ struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS]; ++ ++ grant_handle_t grant_tx_handle[MAX_PENDING_REQS]; ++ u16 pending_ring[MAX_PENDING_REQS]; ++ u16 dealloc_ring[MAX_PENDING_REQS]; ++ ++ /* ++ * Each head or fragment can be up to 4096 bytes. Given ++ * MAX_BUFFER_OFFSET of 4096 the worst case is that each ++ * head/fragment uses 2 copy operation. ++ */ ++ struct gnttab_copy grant_copy_op[2*NET_RX_RING_SIZE]; ++ unsigned char rx_notify[NR_IRQS]; ++ u16 notify_list[NET_RX_RING_SIZE]; ++ struct netbk_rx_meta meta[2*NET_RX_RING_SIZE]; ++}; ++ ++extern struct xen_netbk *xen_netbk; ++extern int xen_netbk_group_nr; ++ ++#endif /* __NETIF__BACKEND__COMMON_H__ */ +diff --git a/drivers/xen/netback/interface.c b/drivers/xen/netback/interface.c +new file mode 100644 +index 0000000..2e8508a +--- /dev/null ++++ b/drivers/xen/netback/interface.c +@@ -0,0 +1,475 @@ ++/****************************************************************************** ++ * arch/xen/drivers/netif/backend/interface.c ++ * ++ * Network-device interface management. ++ * ++ * Copyright (c) 2004-2005, Keir Fraser ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. 
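
The page_ext union above is netback's trick for tagging the pages it hands out: a (group, idx) pair is packed into the otherwise unused page->mapping pointer, telling the lookup code later which struct xen_netbk and which pending request a page belongs to. The setter in netback.c stores group + 1, so an untouched NULL mapping can never decode as group 0. A standalone round-trip of the encoding:

#include <assert.h>
#include <stdio.h>

union page_ext {
	struct {
#if __SIZEOF_LONG__ < 8		/* the patch tests BITS_PER_LONG < 64 */
		unsigned int group : 24;
		unsigned int idx : 8;
#else
		unsigned int group, idx;
#endif
	} e;
	void *mapping;
};

int main(void)
{
	/* Encode group 3, pending index 200, with the +1 bias. */
	union page_ext set = { .e = { .group = 3 + 1, .idx = 200 } };
	void *mapping = set.mapping;	/* what would land in page->mapping */

	/* Decode, as netif_get_page_ext() does. */
	union page_ext get = { .mapping = mapping };
	assert(get.e.group - 1 == 3);
	assert(get.e.idx == 200);
	printf("group=%u idx=%u\n", get.e.group - 1, get.e.idx);
	return 0;
}
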
++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include "common.h" ++#include <linux/ethtool.h> ++#include <linux/rtnetlink.h> ++ ++#include <xen/events.h> ++#include <asm/xen/hypercall.h> ++ ++/* ++ * Module parameter 'queue_length': ++ * ++ * Enables queuing in the network stack when a client has run out of receive ++ * descriptors. Although this feature can improve receive bandwidth by avoiding ++ * packet loss, it can also result in packets sitting in the 'tx_queue' for ++ * unbounded time. This is bad if those packets hold onto foreign resources. ++ * For example, consider a packet that holds onto resources belonging to the ++ * guest for which it is queued (e.g., packet received on vif1.0, destined for ++ * vif1.1 which is not activated in the guest): in this situation the guest ++ * will never be destroyed, unless vif1.1 is taken down. To avoid this, we ++ * run a timer (tx_queue_timeout) to drain the queue when the interface is ++ * blocked. ++ */ ++static unsigned long netbk_queue_length = 32; ++module_param_named(queue_length, netbk_queue_length, ulong, 0644); ++ ++static void netbk_add_netif(struct xen_netbk *netbk, int group_nr, ++ struct xen_netif *netif) ++{ ++ int i; ++ int min_netfront_count; ++ int min_group = 0; ++ min_netfront_count = atomic_read(&netbk[0].netfront_count); ++ for (i = 0; i < group_nr; i++) { ++ int netfront_count = atomic_read(&netbk[i].netfront_count); ++ if (netfront_count < min_netfront_count) { ++ min_group = i; ++ min_netfront_count = netfront_count; ++ } ++ } ++ ++ netif->group = min_group; ++ atomic_inc(&netbk[netif->group].netfront_count); ++} ++ ++static void netbk_remove_netif(struct xen_netbk *netbk, struct xen_netif *netif) ++{ ++ atomic_dec(&netbk[netif->group].netfront_count); ++} ++ ++static void __netif_up(struct xen_netif *netif) ++{ ++ netbk_add_netif(xen_netbk, xen_netbk_group_nr, netif); ++ enable_irq(netif->irq); ++ netif_schedule_work(netif); ++} ++ ++static void __netif_down(struct xen_netif *netif) ++{ ++ disable_irq(netif->irq); ++ netif_deschedule_work(netif); ++ netbk_remove_netif(xen_netbk, netif); ++} ++ ++static int net_open(struct net_device *dev) ++{ ++ struct xen_netif *netif = netdev_priv(dev); ++ if (netback_carrier_ok(netif)) { ++ __netif_up(netif); ++ netif_start_queue(dev); ++ } ++ return 0; ++} ++ ++static int net_close(struct net_device *dev) ++{ ++ struct xen_netif *netif = netdev_priv(dev); ++ if (netback_carrier_ok(netif)) ++ __netif_down(netif); ++ netif_stop_queue(dev); ++ return 0; ++} ++ ++static int netbk_change_mtu(struct net_device *dev, int mtu) ++{ ++ int max = netbk_can_sg(dev) ? 
65535 - ETH_HLEN : ETH_DATA_LEN; ++ ++ if (mtu > max) ++ return -EINVAL; ++ dev->mtu = mtu; ++ return 0; ++} ++ ++void netif_set_features(struct xen_netif *netif) ++{ ++ struct net_device *dev = netif->dev; ++ int features = dev->features; ++ ++ if (netif->can_sg) ++ features |= NETIF_F_SG; ++ if (netif->gso || netif->gso_prefix) ++ features |= NETIF_F_TSO; ++ if (netif->csum) ++ features |= NETIF_F_IP_CSUM; ++ ++ features &= ~(netif->features_disabled); ++ ++ if (!(features & NETIF_F_SG) && dev->mtu > ETH_DATA_LEN) ++ dev->mtu = ETH_DATA_LEN; ++ ++ dev->features = features; ++} ++ ++static int netbk_set_tx_csum(struct net_device *dev, u32 data) ++{ ++ struct xen_netif *netif = netdev_priv(dev); ++ if (data) { ++ if (!netif->csum) ++ return -ENOSYS; ++ netif->features_disabled &= ~NETIF_F_IP_CSUM; ++ } else { ++ netif->features_disabled |= NETIF_F_IP_CSUM; ++ } ++ ++ netif_set_features(netif); ++ return 0; ++} ++ ++static int netbk_set_sg(struct net_device *dev, u32 data) ++{ ++ struct xen_netif *netif = netdev_priv(dev); ++ if (data) { ++ if (!netif->can_sg) ++ return -ENOSYS; ++ netif->features_disabled &= ~NETIF_F_SG; ++ } else { ++ netif->features_disabled |= NETIF_F_SG; ++ } ++ ++ netif_set_features(netif); ++ return 0; ++} ++ ++static int netbk_set_tso(struct net_device *dev, u32 data) ++{ ++ struct xen_netif *netif = netdev_priv(dev); ++ if (data) { ++ if (!netif->gso && !netif->gso_prefix) ++ return -ENOSYS; ++ netif->features_disabled &= ~NETIF_F_TSO; ++ } else { ++ netif->features_disabled |= NETIF_F_TSO; ++ } ++ ++ netif_set_features(netif); ++ return 0; ++} ++ ++static void netbk_get_drvinfo(struct net_device *dev, ++ struct ethtool_drvinfo *info) ++{ ++ strcpy(info->driver, "netbk"); ++ strcpy(info->bus_info, dev_name(dev->dev.parent)); ++} ++ ++static const struct netif_stat { ++ char name[ETH_GSTRING_LEN]; ++ u16 offset; ++} netbk_stats[] = { ++ { "copied_skbs", offsetof(struct xen_netif, nr_copied_skbs) }, ++}; ++ ++static int netbk_get_sset_count(struct net_device *dev, int string_set) ++{ ++ switch (string_set) { ++ case ETH_SS_STATS: ++ return ARRAY_SIZE(netbk_stats); ++ default: ++ return -EINVAL; ++ } ++} ++ ++static void netbk_get_ethtool_stats(struct net_device *dev, ++ struct ethtool_stats *stats, u64 * data) ++{ ++ void *netif = netdev_priv(dev); ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(netbk_stats); i++) ++ data[i] = *(int *)(netif + netbk_stats[i].offset); ++} ++ ++static void netbk_get_strings(struct net_device *dev, u32 stringset, u8 * data) ++{ ++ int i; ++ ++ switch (stringset) { ++ case ETH_SS_STATS: ++ for (i = 0; i < ARRAY_SIZE(netbk_stats); i++) ++ memcpy(data + i * ETH_GSTRING_LEN, ++ netbk_stats[i].name, ETH_GSTRING_LEN); ++ break; ++ } ++} ++ ++static struct ethtool_ops network_ethtool_ops = ++{ ++ .get_drvinfo = netbk_get_drvinfo, ++ ++ .get_tx_csum = ethtool_op_get_tx_csum, ++ .set_tx_csum = netbk_set_tx_csum, ++ .get_sg = ethtool_op_get_sg, ++ .set_sg = netbk_set_sg, ++ .get_tso = ethtool_op_get_tso, ++ .set_tso = netbk_set_tso, ++ .get_link = ethtool_op_get_link, ++ ++ .get_sset_count = netbk_get_sset_count, ++ .get_ethtool_stats = netbk_get_ethtool_stats, ++ .get_strings = netbk_get_strings, ++}; ++ ++static struct net_device_ops netback_ops = ++{ ++ .ndo_start_xmit = netif_be_start_xmit, ++ .ndo_get_stats = netif_be_get_stats, ++ .ndo_open = net_open, ++ .ndo_stop = net_close, ++ .ndo_change_mtu = netbk_change_mtu, ++}; ++ ++struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle) ++{ ++ int err = 0; ++ struct 
net_device *dev; ++ struct xen_netif *netif; ++ char name[IFNAMSIZ] = {}; ++ ++ snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle); ++ dev = alloc_netdev(sizeof(struct xen_netif), name, ether_setup); ++ if (dev == NULL) { ++ DPRINTK("Could not create netif: out of memory\n"); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ SET_NETDEV_DEV(dev, parent); ++ ++ netif = netdev_priv(dev); ++ memset(netif, 0, sizeof(*netif)); ++ netif->domid = domid; ++ netif->group = -1; ++ netif->handle = handle; ++ netif->can_sg = 1; ++ netif->csum = 1; ++ atomic_set(&netif->refcnt, 1); ++ init_waitqueue_head(&netif->waiting_to_free); ++ netif->dev = dev; ++ INIT_LIST_HEAD(&netif->list); ++ ++ netback_carrier_off(netif); ++ ++ netif->credit_bytes = netif->remaining_credit = ~0UL; ++ netif->credit_usec = 0UL; ++ init_timer(&netif->credit_timeout); ++ /* Initialize 'expires' now: it's used to track the credit window. */ ++ netif->credit_timeout.expires = jiffies; ++ ++ init_timer(&netif->tx_queue_timeout); ++ ++ dev->netdev_ops = &netback_ops; ++ netif_set_features(netif); ++ SET_ETHTOOL_OPS(dev, &network_ethtool_ops); ++ ++ dev->tx_queue_len = netbk_queue_length; ++ ++ /* ++ * Initialise a dummy MAC address. We choose the numerically ++ * largest non-broadcast address to prevent the address getting ++ * stolen by an Ethernet bridge for STP purposes. ++ * (FE:FF:FF:FF:FF:FF) ++ */ ++ memset(dev->dev_addr, 0xFF, ETH_ALEN); ++ dev->dev_addr[0] &= ~0x01; ++ ++ rtnl_lock(); ++ err = register_netdevice(dev); ++ rtnl_unlock(); ++ if (err) { ++ DPRINTK("Could not register new net device %s: err=%d\n", ++ dev->name, err); ++ free_netdev(dev); ++ return ERR_PTR(err); ++ } ++ ++ DPRINTK("Successfully created netif\n"); ++ return netif; ++} ++ ++static int map_frontend_pages( ++ struct xen_netif *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref) ++{ ++ struct gnttab_map_grant_ref op; ++ ++ gnttab_set_map_op(&op, (unsigned long)netif->tx_comms_area->addr, ++ GNTMAP_host_map, tx_ring_ref, netif->domid); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) ++ BUG(); ++ ++ if (op.status) { ++ DPRINTK(" Gnttab failure mapping tx_ring_ref!\n"); ++ return op.status; ++ } ++ ++ netif->tx_shmem_ref = tx_ring_ref; ++ netif->tx_shmem_handle = op.handle; ++ ++ gnttab_set_map_op(&op, (unsigned long)netif->rx_comms_area->addr, ++ GNTMAP_host_map, rx_ring_ref, netif->domid); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) ++ BUG(); ++ ++ if (op.status) { ++ struct gnttab_unmap_grant_ref unop; ++ ++ gnttab_set_unmap_op(&unop, ++ (unsigned long)netif->tx_comms_area->addr, ++ GNTMAP_host_map, netif->tx_shmem_handle); ++ HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unop, 1); ++ DPRINTK(" Gnttab failure mapping rx_ring_ref!\n"); ++ return op.status; ++ } ++ ++ netif->rx_shmem_ref = rx_ring_ref; ++ netif->rx_shmem_handle = op.handle; ++ ++ return 0; ++} ++ ++static void unmap_frontend_pages(struct xen_netif *netif) ++{ ++ struct gnttab_unmap_grant_ref op; ++ ++ gnttab_set_unmap_op(&op, (unsigned long)netif->tx_comms_area->addr, ++ GNTMAP_host_map, netif->tx_shmem_handle); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) ++ BUG(); ++ ++ gnttab_set_unmap_op(&op, (unsigned long)netif->rx_comms_area->addr, ++ GNTMAP_host_map, netif->rx_shmem_handle); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) ++ BUG(); ++} ++ ++int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref, ++ unsigned long rx_ring_ref, unsigned int evtchn) ++{ ++ int err = -ENOMEM; 
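
netif_alloc() above seeds every vif with the same dummy address, and its comment explains the choice: a bridge running STP conventionally adopts the lowest MAC among its ports, so the numerically largest unicast address, FE:FF:FF:FF:FF:FF, can never be stolen as the bridge's own. The two-line derivation, standalone:

#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char mac[6];

	memset(mac, 0xFF, sizeof(mac));	/* start from the all-ones address */
	mac[0] &= ~0x01;		/* clear the group bit: unicast */

	for (int i = 0; i < 6; i++)
		printf("%02X%c", mac[i], i < 5 ? ':' : '\n');	/* FE:FF:... */
	return 0;
}
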
++ struct xen_netif_tx_sring *txs; ++ struct xen_netif_rx_sring *rxs; ++ ++ /* Already connected through? */ ++ if (netif->irq) ++ return 0; ++ ++ netif->tx_comms_area = alloc_vm_area(PAGE_SIZE); ++ if (netif->tx_comms_area == NULL) ++ return -ENOMEM; ++ netif->rx_comms_area = alloc_vm_area(PAGE_SIZE); ++ if (netif->rx_comms_area == NULL) ++ goto err_rx; ++ ++ err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref); ++ if (err) ++ goto err_map; ++ ++ err = bind_interdomain_evtchn_to_irqhandler( ++ netif->domid, evtchn, netif_be_int, 0, ++ netif->dev->name, netif); ++ if (err < 0) ++ goto err_hypervisor; ++ netif->irq = err; ++ disable_irq(netif->irq); ++ ++ txs = (struct xen_netif_tx_sring *)netif->tx_comms_area->addr; ++ BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE); ++ ++ rxs = (struct xen_netif_rx_sring *) ++ ((char *)netif->rx_comms_area->addr); ++ BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE); ++ ++ netif->rx_req_cons_peek = 0; ++ ++ netif_get(netif); ++ ++ rtnl_lock(); ++ netback_carrier_on(netif); ++ if (netif_running(netif->dev)) ++ __netif_up(netif); ++ rtnl_unlock(); ++ ++ return 0; ++err_hypervisor: ++ unmap_frontend_pages(netif); ++err_map: ++ free_vm_area(netif->rx_comms_area); ++err_rx: ++ free_vm_area(netif->tx_comms_area); ++ return err; ++} ++ ++void netif_disconnect(struct xen_netif *netif) ++{ ++ if (netback_carrier_ok(netif)) { ++ rtnl_lock(); ++ netback_carrier_off(netif); ++ netif_carrier_off(netif->dev); /* discard queued packets */ ++ if (netif_running(netif->dev)) ++ __netif_down(netif); ++ rtnl_unlock(); ++ netif_put(netif); ++ } ++ ++ atomic_dec(&netif->refcnt); ++ wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0); ++ ++ del_timer_sync(&netif->credit_timeout); ++ del_timer_sync(&netif->tx_queue_timeout); ++ ++ if (netif->irq) ++ unbind_from_irqhandler(netif->irq, netif); ++ ++ unregister_netdev(netif->dev); ++ ++ if (netif->tx.sring) { ++ unmap_frontend_pages(netif); ++ free_vm_area(netif->tx_comms_area); ++ free_vm_area(netif->rx_comms_area); ++ } ++ ++ free_netdev(netif->dev); ++} +diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c +new file mode 100644 +index 0000000..c448675 +--- /dev/null ++++ b/drivers/xen/netback/netback.c +@@ -0,0 +1,1902 @@ ++/****************************************************************************** ++ * drivers/xen/netback/netback.c ++ * ++ * Back-end of the driver for virtual network devices. This portion of the ++ * driver exports a 'unified' network-device interface that can be accessed ++ * by any operating system that implements a compatible front end. 
A ++ * reference front-end implementation can be found in: ++ * drivers/xen/netfront/netfront.c ++ * ++ * Copyright (c) 2002-2005, K A Fraser ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include "common.h" ++ ++#include <linux/tcp.h> ++#include <linux/udp.h> ++#include <linux/kthread.h> ++ ++#include <xen/balloon.h> ++#include <xen/events.h> ++#include <xen/interface/memory.h> ++ ++#include <asm/xen/hypercall.h> ++#include <asm/xen/page.h> ++ ++/*define NETBE_DEBUG_INTERRUPT*/ ++ ++struct xen_netbk *xen_netbk; ++int xen_netbk_group_nr; ++ ++static void netif_idx_release(struct xen_netbk *netbk, u16 pending_idx); ++static void make_tx_response(struct xen_netif *netif, ++ struct xen_netif_tx_request *txp, ++ s8 st); ++static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif, ++ u16 id, ++ s8 st, ++ u16 offset, ++ u16 size, ++ u16 flags); ++ ++static void net_tx_action(unsigned long data); ++ ++static void net_rx_action(unsigned long data); ++ ++static inline unsigned long idx_to_pfn(struct xen_netbk *netbk, ++ unsigned int idx) ++{ ++ return page_to_pfn(netbk->mmap_pages[idx]); ++} ++ ++static inline unsigned long idx_to_kaddr(struct xen_netbk *netbk, ++ unsigned int idx) ++{ ++ return (unsigned long)pfn_to_kaddr(idx_to_pfn(netbk, idx)); ++} ++ ++/* extra field used in struct page */ ++static inline void netif_set_page_ext(struct page *pg, unsigned int group, ++ unsigned int idx) ++{ ++ union page_ext ext = { .e = { .group = group + 1, .idx = idx } }; ++ ++ BUILD_BUG_ON(sizeof(ext) > sizeof(ext.mapping)); ++ pg->mapping = ext.mapping; ++} ++ ++static inline int netif_get_page_ext(struct page *pg, unsigned int *_group, unsigned int *_idx) ++{ ++ union page_ext ext = { .mapping = pg->mapping }; ++ struct xen_netbk *netbk; ++ unsigned int group, idx; ++ ++ if (!PageForeign(pg)) ++ return 0; ++ ++ group = ext.e.group - 1; ++ ++ if (group < 0 || group >= xen_netbk_group_nr) ++ return 0; ++ ++ netbk = &xen_netbk[group]; ++ ++ if (netbk->mmap_pages == NULL) ++ return 0; ++ ++ idx = ext.e.idx; ++ ++ if ((idx < 0) || (idx >= MAX_PENDING_REQS)) ++ return 0; ++ ++ if (netbk->mmap_pages[idx] != pg) ++ return 0; ++ ++ 
*_group = group; ++ *_idx = idx; ++ ++ return 1; ++} ++ ++/* ++ * This is the amount of packet we copy rather than map, so that the ++ * guest can't fiddle with the contents of the headers while we do ++ * packet processing on them (netfilter, routing, etc). 72 is enough ++ * to cover TCP+IP headers including options. ++ */ ++#define PKT_PROT_LEN 72 ++ ++static inline pending_ring_idx_t pending_index(unsigned i) ++{ ++ return i & (MAX_PENDING_REQS-1); ++} ++ ++static inline pending_ring_idx_t nr_pending_reqs(struct xen_netbk *netbk) ++{ ++ return MAX_PENDING_REQS - ++ netbk->pending_prod + netbk->pending_cons; ++} ++ ++/* Setting this allows the safe use of this driver without netloop. */ ++static int MODPARM_copy_skb = 1; ++module_param_named(copy_skb, MODPARM_copy_skb, bool, 0); ++MODULE_PARM_DESC(copy_skb, "Copy data received from netfront without netloop"); ++ ++int netbk_copy_skb_mode; ++ ++static int MODPARM_netback_kthread; ++module_param_named(netback_kthread, MODPARM_netback_kthread, bool, 0); ++MODULE_PARM_DESC(netback_kthread, "Use kernel thread to replace tasklet"); ++ ++/* ++ * Netback bottom half handler. ++ * dir indicates the data direction. ++ * rx: 1, tx: 0. ++ */ ++static inline void xen_netbk_bh_handler(struct xen_netbk *netbk, int dir) ++{ ++ if (MODPARM_netback_kthread) ++ wake_up(&netbk->kthread.netbk_action_wq); ++ else if (dir) ++ tasklet_schedule(&netbk->tasklet.net_rx_tasklet); ++ else ++ tasklet_schedule(&netbk->tasklet.net_tx_tasklet); ++} ++ ++static inline void maybe_schedule_tx_action(struct xen_netbk *netbk) ++{ ++ smp_mb(); ++ if ((nr_pending_reqs(netbk) < (MAX_PENDING_REQS/2)) && ++ !list_empty(&netbk->net_schedule_list)) ++ xen_netbk_bh_handler(netbk, 0); ++} ++ ++static struct sk_buff *netbk_copy_skb(struct sk_buff *skb) ++{ ++ struct skb_shared_info *ninfo; ++ struct sk_buff *nskb; ++ unsigned long offset; ++ int ret; ++ int len; ++ int headlen; ++ ++ BUG_ON(skb_shinfo(skb)->frag_list != NULL); ++ ++ nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC | __GFP_NOWARN); ++ if (unlikely(!nskb)) ++ goto err; ++ ++ skb_reserve(nskb, NET_SKB_PAD + NET_IP_ALIGN); ++ headlen = skb_end_pointer(nskb) - nskb->data; ++ if (headlen > skb_headlen(skb)) ++ headlen = skb_headlen(skb); ++ ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen); ++ BUG_ON(ret); ++ ++ ninfo = skb_shinfo(nskb); ++ ninfo->gso_size = skb_shinfo(skb)->gso_size; ++ ninfo->gso_type = skb_shinfo(skb)->gso_type; ++ ++ offset = headlen; ++ len = skb->len - headlen; ++ ++ nskb->len = skb->len; ++ nskb->data_len = len; ++ nskb->truesize += len; ++ ++ while (len) { ++ struct page *page; ++ int copy; ++ int zero; ++ ++ if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) { ++ dump_stack(); ++ goto err_free; ++ } ++ ++ copy = len >= PAGE_SIZE ? PAGE_SIZE : len; ++ zero = len >= PAGE_SIZE ? 
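
pending_index() and nr_pending_reqs() above work only because MAX_PENDING_REQS is a power of two: the producer and consumer counters grow without bound, get masked into the ring on access, and unsigned subtraction keeps the outstanding count correct across wraparound. A standalone check:

#include <assert.h>
#include <limits.h>

#define MAX_PENDING_REQS 256

typedef unsigned int pending_ring_idx_t;

static pending_ring_idx_t pending_index(unsigned int i)
{
	return i & (MAX_PENDING_REQS - 1);	/* mask, not modulo */
}

static pending_ring_idx_t nr_pending_reqs(pending_ring_idx_t prod,
					   pending_ring_idx_t cons)
{
	return MAX_PENDING_REQS - prod + cons;
}

int main(void)
{
	assert(pending_index(258) == 2);	/* wraps into the ring */

	/* Counters straddling UINT_MAX still give the right answer. */
	pending_ring_idx_t cons = UINT_MAX - 5u;
	pending_ring_idx_t prod = cons + 10u;	/* wraps past zero */
	assert(nr_pending_reqs(prod, cons) == MAX_PENDING_REQS - 10);
	return 0;
}
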
0 : __GFP_ZERO; ++ ++ page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero); ++ if (unlikely(!page)) ++ goto err_free; ++ ++ ret = skb_copy_bits(skb, offset, page_address(page), copy); ++ BUG_ON(ret); ++ ++ ninfo->frags[ninfo->nr_frags].page = page; ++ ninfo->frags[ninfo->nr_frags].page_offset = 0; ++ ninfo->frags[ninfo->nr_frags].size = copy; ++ ninfo->nr_frags++; ++ ++ offset += copy; ++ len -= copy; ++ } ++ ++#ifdef NET_SKBUFF_DATA_USES_OFFSET ++ offset = 0; ++#else ++ offset = nskb->data - skb->data; ++#endif ++ ++ nskb->transport_header = skb->transport_header + offset; ++ nskb->network_header = skb->network_header + offset; ++ nskb->mac_header = skb->mac_header + offset; ++ ++ return nskb; ++ ++ err_free: ++ kfree_skb(nskb); ++ err: ++ return NULL; ++} ++ ++static inline int netbk_max_required_rx_slots(struct xen_netif *netif) ++{ ++ if (netif->can_sg || netif->gso || netif->gso_prefix) ++ return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */ ++ return 1; /* all in one */ ++} ++ ++static inline int netbk_queue_full(struct xen_netif *netif) ++{ ++ RING_IDX peek = netif->rx_req_cons_peek; ++ RING_IDX needed = netbk_max_required_rx_slots(netif); ++ ++ return ((netif->rx.sring->req_prod - peek) < needed) || ++ ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed); ++} ++ ++static void tx_queue_callback(unsigned long data) ++{ ++ struct xen_netif *netif = (struct xen_netif *)data; ++ if (netif_schedulable(netif)) ++ netif_wake_queue(netif->dev); ++} ++ ++/* Figure out how many ring slots we're going to need to send @skb to ++ the guest. */ ++static unsigned count_skb_slots(struct sk_buff *skb, struct xen_netif *netif) ++{ ++ unsigned count; ++ unsigned copy_off; ++ unsigned i; ++ ++ copy_off = 0; ++ count = 1; ++ ++ BUG_ON(offset_in_page(skb->data) + skb_headlen(skb) > MAX_BUFFER_OFFSET); ++ ++ copy_off = skb_headlen(skb); ++ ++ if (skb_shinfo(skb)->gso_size) ++ count++; ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ unsigned long size = skb_shinfo(skb)->frags[i].size; ++ unsigned long bytes; ++ while (size > 0) { ++ BUG_ON(copy_off > MAX_BUFFER_OFFSET); ++ ++ /* These checks are the same as in netbk_gop_frag_copy */ ++ if (copy_off == MAX_BUFFER_OFFSET ++ || ((copy_off + size > MAX_BUFFER_OFFSET) && (size <= MAX_BUFFER_OFFSET) && copy_off)) { ++ count++; ++ copy_off = 0; ++ } ++ ++ bytes = size; ++ if (copy_off + bytes > MAX_BUFFER_OFFSET) ++ bytes = MAX_BUFFER_OFFSET - copy_off; ++ ++ copy_off += bytes; ++ size -= bytes; ++ } ++ } ++ return count; ++} ++ ++int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct xen_netif *netif = netdev_priv(dev); ++ struct xen_netbk *netbk; ++ ++ BUG_ON(skb->dev != dev); ++ ++ if (netif->group == -1) ++ goto drop; ++ ++ netbk = &xen_netbk[netif->group]; ++ ++ /* Drop the packet if the target domain has no receive buffers. */ ++ if (unlikely(!netif_schedulable(netif) || netbk_queue_full(netif))) ++ goto drop; ++ ++ /* ++ * XXX For now we also copy skbuffs whose head crosses a page ++ * boundary, because netbk_gop_skb can't handle them. ++ */ ++ if ((skb_headlen(skb) + offset_in_page(skb->data)) >= PAGE_SIZE) { ++ struct sk_buff *nskb = netbk_copy_skb(skb); ++ if ( unlikely(nskb == NULL) ) ++ goto drop; ++ /* Copy only the header fields we use in this driver. */ ++ nskb->dev = skb->dev; ++ nskb->ip_summed = skb->ip_summed; ++ dev_kfree_skb(skb); ++ skb = nskb; ++ } ++ ++ /* Reserve ring slots for the worst-case number of ++ * fragments. 
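
count_skb_slots() above predicts how many receive-ring slots a frame will occupy before any grant copies are issued; it must agree exactly with the boundary test netbk_gop_frag_copy() applies later, or the peek-ahead accounting breaks. A user-space rendering of the same loop, with head length and frag sizes as plain numbers and MAX_BUFFER_OFFSET taken to be one 4 KiB page as in common.h:

#include <stdio.h>

#define MAX_BUFFER_OFFSET 4096u

static unsigned int count_slots(unsigned int headlen, int gso,
				const unsigned int *frag, int nfrags)
{
	unsigned int count = 1, copy_off = headlen;

	if (gso)
		count++;			/* extra slot for GSO info */

	for (int i = 0; i < nfrags; i++) {
		unsigned int size = frag[i];
		while (size > 0) {
			unsigned int bytes;

			/* same boundary test as netbk_gop_frag_copy() */
			if (copy_off == MAX_BUFFER_OFFSET ||
			    (copy_off + size > MAX_BUFFER_OFFSET &&
			     size <= MAX_BUFFER_OFFSET && copy_off)) {
				count++;
				copy_off = 0;
			}
			bytes = size;
			if (copy_off + bytes > MAX_BUFFER_OFFSET)
				bytes = MAX_BUFFER_OFFSET - copy_off;
			copy_off += bytes;
			size -= bytes;
		}
	}
	return count;
}

int main(void)
{
	/* A GSO frame: 128-byte header plus two 4096-byte frags. */
	unsigned int frags[] = { 4096, 4096 };

	/* head + GSO slot + one slot per page-sized frag = 4 */
	printf("slots = %u\n", count_slots(128, 1, frags, 2));
	return 0;
}
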
*/ ++ netif->rx_req_cons_peek += count_skb_slots(skb, netif); ++ netif_get(netif); ++ ++ if (netbk_can_queue(dev) && netbk_queue_full(netif)) { ++ netif->rx.sring->req_event = netif->rx_req_cons_peek + ++ netbk_max_required_rx_slots(netif); ++ mb(); /* request notification /then/ check & stop the queue */ ++ if (netbk_queue_full(netif)) { ++ netif_stop_queue(dev); ++ /* ++ * Schedule 500ms timeout to restart the queue, thus ++ * ensuring that an inactive queue will be drained. ++ * Packets will be immediately be dropped until more ++ * receive buffers become available (see ++ * netbk_queue_full() check above). ++ */ ++ netif->tx_queue_timeout.data = (unsigned long)netif; ++ netif->tx_queue_timeout.function = tx_queue_callback; ++ mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2); ++ } ++ } ++ skb_queue_tail(&netbk->rx_queue, skb); ++ ++ xen_netbk_bh_handler(netbk, 1); ++ ++ return 0; ++ ++ drop: ++ netif->stats.tx_dropped++; ++ dev_kfree_skb(skb); ++ return 0; ++} ++ ++struct netrx_pending_operations { ++ unsigned copy_prod, copy_cons; ++ unsigned meta_prod, meta_cons; ++ struct gnttab_copy *copy; ++ struct netbk_rx_meta *meta; ++ int copy_off; ++ grant_ref_t copy_gref; ++}; ++ ++/* Set up the grant operations for this fragment. If it's a flipping ++ interface, we also set up the unmap request from here. */ ++ ++static void netbk_gop_frag_copy(struct xen_netif *netif, ++ struct netrx_pending_operations *npo, ++ struct page *page, unsigned long size, ++ unsigned long offset, int head) ++{ ++ struct gnttab_copy *copy_gop; ++ struct netbk_rx_meta *meta; ++ /* ++ * These variables a used iff netif_get_page_ext returns true, ++ * in which case they are guaranteed to be initialized. ++ */ ++ unsigned int uninitialized_var(group), uninitialized_var(idx); ++ int foreign = netif_get_page_ext(page, &group, &idx); ++ unsigned long bytes; ++ ++ /* Data must not cross a page boundary. */ ++ BUG_ON(size + offset > PAGE_SIZE); ++ ++ meta = npo->meta + npo->meta_prod - 1; ++ ++ while (size > 0) { ++ BUG_ON(npo->copy_off > MAX_BUFFER_OFFSET); ++ ++ /* ++ * Move to a new receive buffer if: ++ * ++ * simple case: we have completely filled the current buffer. ++ * ++ * complex case: the current frag would overflow ++ * the current buffer but only if: ++ * (i) this frag would fit completely in the next buffer ++ * and (ii) there is already some data in the current buffer ++ * and (iii) this is not the head buffer. ++ * ++ * Where: ++ * - (i) stops us splitting a frag into two copies ++ * unless the frag is too large for a single buffer. ++ * - (ii) stops us from leaving a buffer pointlessly empty. ++ * - (iii) stops us leaving the first buffer ++ * empty. Strictly speaking this is already covered ++ * by (ii) but is explicitly checked because ++ * netfront relies on the first buffer being ++ * non-empty and can crash otherwise. ++ * ++ * This means we will effectively linearise small ++ * frags but do not needlessly split large buffers ++ * into multiple copies tend to give large frags their ++ * own buffers as before. ++ */ ++ if (npo->copy_off == MAX_BUFFER_OFFSET ++ || ((npo->copy_off + size > MAX_BUFFER_OFFSET) && (size <= MAX_BUFFER_OFFSET) && npo->copy_off && !head)) { ++ struct xen_netif_rx_request *req; ++ ++ BUG_ON(head); /* Netfront requires there to be some data in the head buffer. 
*/ ++ /* Overflowed this request, go to the next one */ ++ req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons++); ++ meta = npo->meta + npo->meta_prod++; ++ meta->gso_size = 0; ++ meta->size = 0; ++ meta->id = req->id; ++ npo->copy_off = 0; ++ npo->copy_gref = req->gref; ++ } ++ ++ bytes = size; ++ if (npo->copy_off + bytes > MAX_BUFFER_OFFSET) ++ bytes = MAX_BUFFER_OFFSET - npo->copy_off; ++ ++ copy_gop = npo->copy + npo->copy_prod++; ++ copy_gop->flags = GNTCOPY_dest_gref; ++ if (foreign) { ++ struct xen_netbk *netbk = &xen_netbk[group]; ++ struct pending_tx_info *src_pend; ++ ++ src_pend = &netbk->pending_tx_info[idx]; ++ ++ copy_gop->source.domid = src_pend->netif->domid; ++ copy_gop->source.u.ref = src_pend->req.gref; ++ copy_gop->flags |= GNTCOPY_source_gref; ++ } else { ++ copy_gop->source.domid = DOMID_SELF; ++ copy_gop->source.u.gmfn = virt_to_mfn(page_address(page)); ++ } ++ copy_gop->source.offset = offset; ++ copy_gop->dest.domid = netif->domid; ++ ++ copy_gop->dest.offset = npo->copy_off; ++ copy_gop->dest.u.ref = npo->copy_gref; ++ copy_gop->len = bytes; ++ ++ npo->copy_off += bytes; ++ meta->size += bytes; ++ ++ offset += bytes; ++ size -= bytes; ++ head = 0; /* Must be something in this buffer now */ ++ } ++} ++ ++/* Prepare an SKB to be transmitted to the frontend. This is ++ responsible for allocating grant operations, meta structures, etc. ++ It returns the number of meta structures consumed. The number of ++ ring slots used is always equal to the number of meta slots used ++ plus the number of GSO descriptors used. Currently, we use either ++ zero GSO descriptors (for non-GSO packets) or one descriptor (for ++ frontend-side LRO). */ ++static int netbk_gop_skb(struct sk_buff *skb, ++ struct netrx_pending_operations *npo) ++{ ++ struct xen_netif *netif = netdev_priv(skb->dev); ++ int nr_frags = skb_shinfo(skb)->nr_frags; ++ int i; ++ struct xen_netif_rx_request *req; ++ struct netbk_rx_meta *meta; ++ int old_meta_prod; ++ ++ old_meta_prod = npo->meta_prod; ++ ++ /* Set up a GSO prefix descriptor, if necessary */ ++ if (skb_shinfo(skb)->gso_size && netif->gso_prefix) { ++ req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons++); ++ meta = npo->meta + npo->meta_prod++; ++ meta->gso_size = skb_shinfo(skb)->gso_size; ++ meta->size = 0; ++ meta->id = req->id; ++ } ++ ++ req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons++); ++ meta = npo->meta + npo->meta_prod++; ++ ++ if (!netif->gso_prefix) ++ meta->gso_size = skb_shinfo(skb)->gso_size; ++ else ++ meta->gso_size = 0; ++ ++ meta->size = 0; ++ meta->id = req->id; ++ npo->copy_off = 0; ++ npo->copy_gref = req->gref; ++ ++ netbk_gop_frag_copy(netif, ++ npo, virt_to_page(skb->data), ++ skb_headlen(skb), ++ offset_in_page(skb->data), 1); ++ ++ /* Leave a gap for the GSO descriptor. */ ++ if (skb_shinfo(skb)->gso_size && !netif->gso_prefix) ++ netif->rx.req_cons++; ++ ++ for (i = 0; i < nr_frags; i++) { ++ netbk_gop_frag_copy(netif, npo, ++ skb_shinfo(skb)->frags[i].page, ++ skb_shinfo(skb)->frags[i].size, ++ skb_shinfo(skb)->frags[i].page_offset, ++ 0); ++ } ++ ++ return npo->meta_prod - old_meta_prod; ++} ++ ++/* This is a twin to netbk_gop_skb. Assume that netbk_gop_skb was ++ used to set up the operations on the top of ++ netrx_pending_operations, which have since been done. Check that ++ they didn't give any errors and advance over them. 
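The slot-accounting invariant stated above netbk_gop_skb() (ring slots used equals meta slots plus GSO descriptors) is easiest to check against concrete packet shapes; these figures are illustrative, not from the patch:

/*
 * Ring slots consumed == meta slots + GSO descriptors:
 *   non-GSO skb spread over 3 buffers:  3 meta + 0 GSO = 3 ring slots
 *   classic GSO (extra-info gap left):  3 meta + 1 GSO = 4 ring slots
 *   gso_prefix mode (the prefix is a
 *   meta slot itself, no extra info):   4 meta + 0 GSO = 4 ring slots
 */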
*/ ++static int netbk_check_gop(int nr_meta_slots, domid_t domid, ++ struct netrx_pending_operations *npo) ++{ ++ struct gnttab_copy *copy_op; ++ int status = NETIF_RSP_OKAY; ++ int i; ++ ++ for (i = 0; i < nr_meta_slots; i++) { ++ copy_op = npo->copy + npo->copy_cons++; ++ if (copy_op->status != GNTST_okay) { ++ DPRINTK("Bad status %d from copy to DOM%d.\n", ++ copy_op->status, domid); ++ status = NETIF_RSP_ERROR; ++ } ++ } ++ ++ return status; ++} ++ ++static void netbk_add_frag_responses(struct xen_netif *netif, int status, ++ struct netbk_rx_meta *meta, ++ int nr_meta_slots) ++{ ++ int i; ++ unsigned long offset; ++ ++ for (i = 0; i < nr_meta_slots; i++) { ++ int flags; ++ if (i == nr_meta_slots - 1) ++ flags = 0; ++ else ++ flags = NETRXF_more_data; ++ ++ offset = 0; ++ make_rx_response(netif, meta[i].id, status, offset, ++ meta[i].size, flags); ++ } ++} ++ ++struct skb_cb_overlay { ++ int meta_slots_used; ++}; ++ ++static void net_rx_action(unsigned long data) ++{ ++ struct xen_netif *netif = NULL; ++ struct xen_netbk *netbk = (struct xen_netbk *)data; ++ s8 status; ++ u16 irq, flags; ++ struct xen_netif_rx_response *resp; ++ struct sk_buff_head rxq; ++ struct sk_buff *skb; ++ int notify_nr = 0; ++ int ret; ++ int nr_frags; ++ int count; ++ unsigned long offset; ++ struct skb_cb_overlay *sco; ++ ++ struct netrx_pending_operations npo = { ++ .copy = netbk->grant_copy_op, ++ .meta = netbk->meta, ++ }; ++ ++ skb_queue_head_init(&rxq); ++ ++ count = 0; ++ ++ while ((skb = skb_dequeue(&netbk->rx_queue)) != NULL) { ++ netif = netdev_priv(skb->dev); ++ nr_frags = skb_shinfo(skb)->nr_frags; ++ ++ sco = (struct skb_cb_overlay *)skb->cb; ++ sco->meta_slots_used = netbk_gop_skb(skb, &npo); ++ ++ count += nr_frags + 1; ++ ++ __skb_queue_tail(&rxq, skb); ++ ++ /* Filled the batch queue? */ ++ if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE) ++ break; ++ } ++ ++ BUG_ON(npo.meta_prod > ARRAY_SIZE(netbk->meta)); ++ ++ if (!npo.copy_prod) ++ return; ++ ++ BUG_ON(npo.copy_prod > ARRAY_SIZE(netbk->grant_copy_op)); ++ ret = HYPERVISOR_grant_table_op(GNTTABOP_copy, &netbk->grant_copy_op, ++ npo.copy_prod); ++ BUG_ON(ret != 0); ++ ++ while ((skb = __skb_dequeue(&rxq)) != NULL) { ++ sco = (struct skb_cb_overlay *)skb->cb; ++ ++ netif = netdev_priv(skb->dev); ++ ++ if (netbk->meta[npo.meta_cons].gso_size && netif->gso_prefix) { ++ resp = RING_GET_RESPONSE(&netif->rx, ++ netif->rx.rsp_prod_pvt++); ++ ++ resp->flags = NETRXF_gso_prefix | NETRXF_more_data; ++ ++ resp->offset = netbk->meta[npo.meta_cons].gso_size; ++ resp->id = netbk->meta[npo.meta_cons].id; ++ resp->status = sco->meta_slots_used; ++ ++ npo.meta_cons++; ++ sco->meta_slots_used--; ++ } ++ ++ ++ netif->stats.tx_bytes += skb->len; ++ netif->stats.tx_packets++; ++ ++ status = netbk_check_gop(sco->meta_slots_used, ++ netif->domid, &npo); ++ ++ if (sco->meta_slots_used == 1) ++ flags = 0; ++ else ++ flags = NETRXF_more_data; ++ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */ ++ flags |= NETRXF_csum_blank | NETRXF_data_validated; ++ else if (skb->ip_summed == CHECKSUM_UNNECESSARY) ++ /* remote but checksummed. 
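The two ip_summed tests here carry the whole checksum contract between backend and frontend. A minimal sketch of the mapping as a hypothetical helper (the flag names come from the surrounding code; the helper itself is not in the patch):

static u16 rx_csum_flags(const struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL)
                /* Locally generated: checksum not yet computed. */
                return NETRXF_csum_blank | NETRXF_data_validated;
        if (skb->ip_summed == CHECKSUM_UNNECESSARY)
                /* Arrived from outside and already verified. */
                return NETRXF_data_validated;
        return 0; /* checksum already present in the data */
}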
*/ ++ flags |= NETRXF_data_validated; ++ ++ offset = 0; ++ resp = make_rx_response(netif, netbk->meta[npo.meta_cons].id, ++ status, offset, ++ netbk->meta[npo.meta_cons].size, ++ flags); ++ ++ if (netbk->meta[npo.meta_cons].gso_size && !netif->gso_prefix) { ++ struct xen_netif_extra_info *gso = ++ (struct xen_netif_extra_info *) ++ RING_GET_RESPONSE(&netif->rx, ++ netif->rx.rsp_prod_pvt++); ++ ++ resp->flags |= NETRXF_extra_info; ++ ++ gso->u.gso.size = netbk->meta[npo.meta_cons].gso_size; ++ gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; ++ gso->u.gso.pad = 0; ++ gso->u.gso.features = 0; ++ ++ gso->type = XEN_NETIF_EXTRA_TYPE_GSO; ++ gso->flags = 0; ++ } ++ ++ if (sco->meta_slots_used > 1) { ++ netbk_add_frag_responses(netif, status, ++ netbk->meta + npo.meta_cons + 1, ++ sco->meta_slots_used - 1); ++ } ++ ++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret); ++ irq = netif->irq; ++ if (ret && !netbk->rx_notify[irq] && ++ (netif->smart_poll != 1)) { ++ netbk->rx_notify[irq] = 1; ++ netbk->notify_list[notify_nr++] = irq; ++ } ++ ++ if (netif_queue_stopped(netif->dev) && ++ netif_schedulable(netif) && ++ !netbk_queue_full(netif)) ++ netif_wake_queue(netif->dev); ++ ++ /* ++ * netfront_smartpoll_active indicates whether ++ * netfront timer is active. ++ */ ++ if ((netif->smart_poll == 1) && ++ !(netif->rx.sring->private.netif.smartpoll_active)) { ++ notify_remote_via_irq(irq); ++ netif->rx.sring->private.netif.smartpoll_active = 1; ++ } ++ ++ netif_put(netif); ++ npo.meta_cons += sco->meta_slots_used; ++ dev_kfree_skb(skb); ++ } ++ ++ while (notify_nr != 0) { ++ irq = netbk->notify_list[--notify_nr]; ++ netbk->rx_notify[irq] = 0; ++ notify_remote_via_irq(irq); ++ } ++ ++ /* More work to do? */ ++ if (!skb_queue_empty(&netbk->rx_queue) && ++ !timer_pending(&netbk->net_timer)) ++ xen_netbk_bh_handler(netbk, 1); ++} ++ ++static void net_alarm(unsigned long data) ++{ ++ struct xen_netbk *netbk = (struct xen_netbk *)data; ++ xen_netbk_bh_handler(netbk, 1); ++} ++ ++static void netbk_tx_pending_timeout(unsigned long data) ++{ ++ struct xen_netbk *netbk = (struct xen_netbk *)data; ++ xen_netbk_bh_handler(netbk, 0); ++} ++ ++struct net_device_stats *netif_be_get_stats(struct net_device *dev) ++{ ++ struct xen_netif *netif = netdev_priv(dev); ++ return &netif->stats; ++} ++ ++static int __on_net_schedule_list(struct xen_netif *netif) ++{ ++ return !list_empty(&netif->list); ++} ++ ++/* Must be called with net_schedule_list_lock held */ ++static void remove_from_net_schedule_list(struct xen_netif *netif) ++{ ++ if (likely(__on_net_schedule_list(netif))) { ++ list_del_init(&netif->list); ++ netif_put(netif); ++ } ++} ++ ++static struct xen_netif *poll_net_schedule_list(struct xen_netbk *netbk) ++{ ++ struct xen_netif *netif = NULL; ++ ++ spin_lock_irq(&netbk->net_schedule_list_lock); ++ if (list_empty(&netbk->net_schedule_list)) ++ goto out; ++ ++ netif = list_first_entry(&netbk->net_schedule_list, ++ struct xen_netif, list); ++ if (!netif) ++ goto out; ++ ++ netif_get(netif); ++ ++ remove_from_net_schedule_list(netif); ++out: ++ spin_unlock_irq(&netbk->net_schedule_list_lock); ++ return netif; ++} ++ ++static void add_to_net_schedule_list_tail(struct xen_netif *netif) ++{ ++ unsigned long flags; ++ ++ struct xen_netbk *netbk = &xen_netbk[netif->group]; ++ if (__on_net_schedule_list(netif)) ++ return; ++ ++ spin_lock_irqsave(&netbk->net_schedule_list_lock, flags); ++ if (!__on_net_schedule_list(netif) && ++ likely(netif_schedulable(netif))) { ++ list_add_tail(&netif->list, &netbk->net_schedule_list); 
++ netif_get(netif); ++ } ++ spin_unlock_irqrestore(&netbk->net_schedule_list_lock, flags); ++} ++ ++void netif_schedule_work(struct xen_netif *netif) ++{ ++ struct xen_netbk *netbk = &xen_netbk[netif->group]; ++ int more_to_do; ++ ++ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do); ++ ++ if (more_to_do) { ++ add_to_net_schedule_list_tail(netif); ++ maybe_schedule_tx_action(netbk); ++ } ++} ++ ++void netif_deschedule_work(struct xen_netif *netif) ++{ ++ struct xen_netbk *netbk = &xen_netbk[netif->group]; ++ spin_lock_irq(&netbk->net_schedule_list_lock); ++ remove_from_net_schedule_list(netif); ++ spin_unlock_irq(&netbk->net_schedule_list_lock); ++} ++ ++ ++static void tx_add_credit(struct xen_netif *netif) ++{ ++ unsigned long max_burst, max_credit; ++ ++ /* ++ * Allow a burst big enough to transmit a jumbo packet of up to 128kB. ++ * Otherwise the interface can seize up due to insufficient credit. ++ */ ++ max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size; ++ max_burst = min(max_burst, 131072UL); ++ max_burst = max(max_burst, netif->credit_bytes); ++ ++ /* Take care that adding a new chunk of credit doesn't wrap to zero. */ ++ max_credit = netif->remaining_credit + netif->credit_bytes; ++ if (max_credit < netif->remaining_credit) ++ max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */ ++ ++ netif->remaining_credit = min(max_credit, max_burst); ++} ++ ++static void tx_credit_callback(unsigned long data) ++{ ++ struct xen_netif *netif = (struct xen_netif *)data; ++ tx_add_credit(netif); ++ netif_schedule_work(netif); ++} ++ ++static inline int copy_pending_req(struct xen_netbk *netbk, ++ pending_ring_idx_t pending_idx) ++{ ++ return gnttab_copy_grant_page( ++ netbk->grant_tx_handle[pending_idx], ++ &netbk->mmap_pages[pending_idx]); ++} ++ ++static inline void net_tx_action_dealloc(struct xen_netbk *netbk) ++{ ++ struct netbk_tx_pending_inuse *inuse, *n; ++ struct gnttab_unmap_grant_ref *gop; ++ u16 pending_idx; ++ pending_ring_idx_t dc, dp; ++ struct xen_netif *netif; ++ int ret; ++ LIST_HEAD(list); ++ ++ dc = netbk->dealloc_cons; ++ gop = netbk->tx_unmap_ops; ++ ++ /* ++ * Free up any grants we have finished using ++ */ ++ do { ++ dp = netbk->dealloc_prod; ++ ++ /* Ensure we see all indices enqueued by netif_idx_release(). */ ++ smp_rmb(); ++ ++ while (dc != dp) { ++ unsigned long pfn; ++ struct netbk_tx_pending_inuse *pending_inuse = ++ netbk->pending_inuse; ++ ++ pending_idx = netbk->dealloc_ring[pending_index(dc++)]; ++ list_move_tail(&pending_inuse[pending_idx].list, &list); ++ ++ pfn = idx_to_pfn(netbk, pending_idx); ++ /* Already unmapped? */ ++ if (!phys_to_machine_mapping_valid(pfn)) ++ continue; ++ ++ gnttab_set_unmap_op(gop, ++ idx_to_kaddr(netbk, pending_idx), ++ GNTMAP_host_map, ++ netbk->grant_tx_handle[pending_idx]); ++ gop++; ++ } ++ ++ if (netbk_copy_skb_mode != NETBK_DELAYED_COPY_SKB || ++ list_empty(&netbk->pending_inuse_head)) ++ break; ++ ++ /* Copy any entries that have been pending for too long. 
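The aging test in the loop below is easy to misread: time_after(inuse->alloc_time + HZ / 2, jiffies) is true while an entry is still younger than 500 ms, and because the in-use list is kept in allocation order the first young entry terminates the scan. A sketch restating the loop with the intent spelled out (same calls as below, comments added):

/* oldest entries first: stop at the first one younger than 500 ms */
if (time_after(inuse->alloc_time + HZ / 2, jiffies))
        break;
/* older than 500 ms: copy the data into a dom0-owned page so the
 * guest grant can be unmapped even while the skb sits in a qdisc */
copy_pending_req(netbk, inuse - netbk->pending_inuse);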
*/ ++ list_for_each_entry_safe(inuse, n, ++ &netbk->pending_inuse_head, list) { ++ struct pending_tx_info *pending_tx_info; ++ pending_tx_info = netbk->pending_tx_info; ++ ++ if (time_after(inuse->alloc_time + HZ / 2, jiffies)) ++ break; ++ ++ pending_idx = inuse - netbk->pending_inuse; ++ ++ pending_tx_info[pending_idx].netif->nr_copied_skbs++; ++ ++ switch (copy_pending_req(netbk, pending_idx)) { ++ case 0: ++ list_move_tail(&inuse->list, &list); ++ continue; ++ case -EBUSY: ++ list_del_init(&inuse->list); ++ continue; ++ case -ENOENT: ++ continue; ++ } ++ ++ break; ++ } ++ } while (dp != netbk->dealloc_prod); ++ ++ netbk->dealloc_cons = dc; ++ ++ ret = HYPERVISOR_grant_table_op( ++ GNTTABOP_unmap_grant_ref, netbk->tx_unmap_ops, ++ gop - netbk->tx_unmap_ops); ++ BUG_ON(ret); ++ ++ list_for_each_entry_safe(inuse, n, &list, list) { ++ struct pending_tx_info *pending_tx_info; ++ pending_ring_idx_t index; ++ ++ pending_tx_info = netbk->pending_tx_info; ++ pending_idx = inuse - netbk->pending_inuse; ++ ++ netif = pending_tx_info[pending_idx].netif; ++ ++ make_tx_response(netif, &pending_tx_info[pending_idx].req, ++ NETIF_RSP_OKAY); ++ ++ /* Ready for next use. */ ++ gnttab_reset_grant_page(netbk->mmap_pages[pending_idx]); ++ ++ index = pending_index(netbk->pending_prod++); ++ netbk->pending_ring[index] = pending_idx; ++ ++ netif_put(netif); ++ ++ list_del_init(&inuse->list); ++ } ++} ++ ++static void netbk_tx_err(struct xen_netif *netif, ++ struct xen_netif_tx_request *txp, RING_IDX end) ++{ ++ RING_IDX cons = netif->tx.req_cons; ++ ++ do { ++ make_tx_response(netif, txp, NETIF_RSP_ERROR); ++ if (cons >= end) ++ break; ++ txp = RING_GET_REQUEST(&netif->tx, cons++); ++ } while (1); ++ netif->tx.req_cons = cons; ++ netif_schedule_work(netif); ++ netif_put(netif); ++} ++ ++static int netbk_count_requests(struct xen_netif *netif, ++ struct xen_netif_tx_request *first, ++ struct xen_netif_tx_request *txp, int work_to_do) ++{ ++ RING_IDX cons = netif->tx.req_cons; ++ int frags = 0; ++ ++ if (!(first->flags & NETTXF_more_data)) ++ return 0; ++ ++ do { ++ if (frags >= work_to_do) { ++ DPRINTK("Need more frags\n"); ++ return -frags; ++ } ++ ++ if (unlikely(frags >= MAX_SKB_FRAGS)) { ++ DPRINTK("Too many frags\n"); ++ return -frags; ++ } ++ ++ memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags), ++ sizeof(*txp)); ++ if (txp->size > first->size) { ++ DPRINTK("Frags galore\n"); ++ return -frags; ++ } ++ ++ first->size -= txp->size; ++ frags++; ++ ++ if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) { ++ DPRINTK("txp->offset: %x, size: %u\n", ++ txp->offset, txp->size); ++ return -frags; ++ } ++ } while ((txp++)->flags & NETTXF_more_data); ++ ++ return frags; ++} ++ ++static struct gnttab_map_grant_ref *netbk_get_requests(struct xen_netbk *netbk, ++ struct xen_netif *netif, ++ struct sk_buff *skb, ++ struct xen_netif_tx_request *txp, ++ struct gnttab_map_grant_ref *mop) ++{ ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ skb_frag_t *frags = shinfo->frags; ++ unsigned long pending_idx = *((u16 *)skb->data); ++ int i, start; ++ ++ /* Skip first skb fragment if it is on same page as header fragment. 
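From netbk_get_requests() below onwards, netback relies on a trick worth spelling out: while a guest tx request is in flight, the skb carries pending-ring indices in place of real data. A sketch of the round trip (illustrative, assembled from the surrounding code):

/* Stash: the header's index lives in the linear area, each frag's
 * index is smuggled through the frag page pointer. */
*(u16 *)skb->data = pending_idx;
shinfo->frags[i].page = (void *)(unsigned long)pending_idx;

/* Recover: netbk_fill_frags() later turns each index back into a
 * real page. */
pending_idx = (unsigned long)shinfo->frags[i].page;
frag->page = virt_to_page(idx_to_kaddr(netbk, pending_idx));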
*/ ++ start = ((unsigned long)shinfo->frags[0].page == pending_idx); ++ ++ for (i = start; i < shinfo->nr_frags; i++, txp++) { ++ pending_ring_idx_t index; ++ struct pending_tx_info *pending_tx_info = ++ netbk->pending_tx_info; ++ ++ index = pending_index(netbk->pending_cons++); ++ pending_idx = netbk->pending_ring[index]; ++ ++ gnttab_set_map_op(mop++, idx_to_kaddr(netbk, pending_idx), ++ GNTMAP_host_map | GNTMAP_readonly, ++ txp->gref, netif->domid); ++ ++ memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp)); ++ netif_get(netif); ++ pending_tx_info[pending_idx].netif = netif; ++ frags[i].page = (void *)pending_idx; ++ } ++ ++ return mop; ++} ++ ++static int netbk_tx_check_mop(struct xen_netbk *netbk, ++ struct sk_buff *skb, ++ struct gnttab_map_grant_ref **mopp) ++{ ++ struct gnttab_map_grant_ref *mop = *mopp; ++ int pending_idx = *((u16 *)skb->data); ++ struct pending_tx_info *pending_tx_info = netbk->pending_tx_info; ++ struct xen_netif *netif = pending_tx_info[pending_idx].netif; ++ struct xen_netif_tx_request *txp; ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ int nr_frags = shinfo->nr_frags; ++ int i, err, start; ++ ++ /* Check status of header. */ ++ err = mop->status; ++ if (unlikely(err)) { ++ pending_ring_idx_t index; ++ index = pending_index(netbk->pending_prod++); ++ txp = &pending_tx_info[pending_idx].req; ++ make_tx_response(netif, txp, NETIF_RSP_ERROR); ++ netbk->pending_ring[index] = pending_idx; ++ netif_put(netif); ++ } else { ++ set_phys_to_machine( ++ __pa(idx_to_kaddr(netbk, pending_idx)) >> PAGE_SHIFT, ++ FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT)); ++ netbk->grant_tx_handle[pending_idx] = mop->handle; ++ } ++ ++ /* Skip first skb fragment if it is on same page as header fragment. */ ++ start = ((unsigned long)shinfo->frags[0].page == pending_idx); ++ ++ for (i = start; i < nr_frags; i++) { ++ int j, newerr; ++ pending_ring_idx_t index; ++ ++ pending_idx = (unsigned long)shinfo->frags[i].page; ++ ++ /* Check error status: if okay then remember grant handle. */ ++ newerr = (++mop)->status; ++ if (likely(!newerr)) { ++ unsigned long addr; ++ addr = idx_to_kaddr(netbk, pending_idx); ++ set_phys_to_machine( ++ __pa(addr)>>PAGE_SHIFT, ++ FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT)); ++ netbk->grant_tx_handle[pending_idx] = mop->handle; ++ /* Had a previous error? Invalidate this fragment. */ ++ if (unlikely(err)) ++ netif_idx_release(netbk, pending_idx); ++ continue; ++ } ++ ++ /* Error on this fragment: respond to client with an error. */ ++ txp = &netbk->pending_tx_info[pending_idx].req; ++ make_tx_response(netif, txp, NETIF_RSP_ERROR); ++ index = pending_index(netbk->pending_prod++); ++ netbk->pending_ring[index] = pending_idx; ++ netif_put(netif); ++ ++ /* Not the first error? Preceding frags already invalidated. */ ++ if (err) ++ continue; ++ ++ /* First error: invalidate header and preceding fragments. */ ++ pending_idx = *((u16 *)skb->data); ++ netif_idx_release(netbk, pending_idx); ++ for (j = start; j < i; j++) { ++ pending_idx = (unsigned long)shinfo->frags[j].page; ++ netif_idx_release(netbk, pending_idx); ++ } ++ ++ /* Remember the error: invalidate all subsequent fragments. 
*/ ++ err = newerr; ++ } ++ ++ *mopp = mop + 1; ++ return err; ++} ++ ++static void netbk_fill_frags(struct xen_netbk *netbk, struct sk_buff *skb) ++{ ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ int nr_frags = shinfo->nr_frags; ++ int i; ++ ++ for (i = 0; i < nr_frags; i++) { ++ skb_frag_t *frag = shinfo->frags + i; ++ struct xen_netif_tx_request *txp; ++ unsigned long pending_idx; ++ ++ pending_idx = (unsigned long)frag->page; ++ ++ netbk->pending_inuse[pending_idx].alloc_time = jiffies; ++ list_add_tail(&netbk->pending_inuse[pending_idx].list, ++ &netbk->pending_inuse_head); ++ ++ txp = &netbk->pending_tx_info[pending_idx].req; ++ frag->page = virt_to_page(idx_to_kaddr(netbk, pending_idx)); ++ frag->size = txp->size; ++ frag->page_offset = txp->offset; ++ ++ skb->len += txp->size; ++ skb->data_len += txp->size; ++ skb->truesize += txp->size; ++ } ++} ++ ++int netbk_get_extras(struct xen_netif *netif, struct xen_netif_extra_info *extras, ++ int work_to_do) ++{ ++ struct xen_netif_extra_info extra; ++ RING_IDX cons = netif->tx.req_cons; ++ ++ do { ++ if (unlikely(work_to_do-- <= 0)) { ++ DPRINTK("Missing extra info\n"); ++ return -EBADR; ++ } ++ ++ memcpy(&extra, RING_GET_REQUEST(&netif->tx, cons), ++ sizeof(extra)); ++ if (unlikely(!extra.type || ++ extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) { ++ netif->tx.req_cons = ++cons; ++ DPRINTK("Invalid extra type: %d\n", extra.type); ++ return -EINVAL; ++ } ++ ++ memcpy(&extras[extra.type - 1], &extra, sizeof(extra)); ++ netif->tx.req_cons = ++cons; ++ } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE); ++ ++ return work_to_do; ++} ++ ++static int netbk_set_skb_gso(struct sk_buff *skb, struct xen_netif_extra_info *gso) ++{ ++ if (!gso->u.gso.size) { ++ DPRINTK("GSO size must not be zero.\n"); ++ return -EINVAL; ++ } ++ ++ /* Currently only TCPv4 S.O. is supported. */ ++ if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) { ++ DPRINTK("Bad GSO type %d.\n", gso->u.gso.type); ++ return -EINVAL; ++ } ++ ++ skb_shinfo(skb)->gso_size = gso->u.gso.size; ++ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; ++ ++ /* Header must be checked, and gso_segs computed. */ ++ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; ++ skb_shinfo(skb)->gso_segs = 0; ++ ++ return 0; ++} ++ ++static int skb_checksum_setup(struct sk_buff *skb) ++{ ++ struct iphdr *iph; ++ unsigned char *th; ++ int err = -EPROTO; ++ ++ if (skb->protocol != htons(ETH_P_IP)) ++ goto out; ++ ++ iph = (void *)skb->data; ++ th = skb->data + 4 * iph->ihl; ++ if (th >= skb_tail_pointer(skb)) ++ goto out; ++ ++ skb->csum_start = th - skb->head; ++ switch (iph->protocol) { ++ case IPPROTO_TCP: ++ skb->csum_offset = offsetof(struct tcphdr, check); ++ break; ++ case IPPROTO_UDP: ++ skb->csum_offset = offsetof(struct udphdr, check); ++ break; ++ default: ++ if (net_ratelimit()) ++ printk(KERN_ERR "Attempting to checksum a non-" ++ "TCP/UDP packet, dropping a protocol" ++ " %d packet", iph->protocol); ++ goto out; ++ } ++ ++ if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb)) ++ goto out; ++ ++ err = 0; ++ ++out: ++ return err; ++} ++ ++static bool tx_credit_exceeded(struct xen_netif *netif, unsigned size) ++{ ++ unsigned long now = jiffies; ++ unsigned long next_credit = ++ netif->credit_timeout.expires + ++ msecs_to_jiffies(netif->credit_usec / 1000); ++ ++ /* Timer could already be pending in rare cases. */ ++ if (timer_pending(&netif->credit_timeout)) ++ return true; ++ ++ /* Passed the point where we can replenish credit? 
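tx_credit_exceeded() below, together with tx_add_credit() above, implements a token bucket fed from the per-vif "rate" setting (bytes per microsecond window, parsed later in xenbus.c). A minimal sketch of the replenish step (hypothetical helper; the clamping mirrors tx_add_credit()):

static unsigned long replenish_credit(unsigned long remaining,
                                      unsigned long credit_bytes,
                                      unsigned long max_burst)
{
        unsigned long credit = remaining + credit_bytes;

        if (credit < remaining)         /* addition wrapped */
                credit = ULONG_MAX;
        return min(credit, max_burst);  /* burst cap, never below credit_bytes */
}

With rate = "1000000,50000" a guest earns 1 MB of credit every 50 ms, roughly 20 MB/s sustained.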
*/ ++ if (time_after_eq(now, next_credit)) { ++ netif->credit_timeout.expires = now; ++ tx_add_credit(netif); ++ } ++ ++ /* Still too big to send right now? Set a callback. */ ++ if (size > netif->remaining_credit) { ++ netif->credit_timeout.data = ++ (unsigned long)netif; ++ netif->credit_timeout.function = ++ tx_credit_callback; ++ mod_timer(&netif->credit_timeout, ++ next_credit); ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++static unsigned net_tx_build_mops(struct xen_netbk *netbk) ++{ ++ struct gnttab_map_grant_ref *mop; ++ struct sk_buff *skb; ++ int ret; ++ ++ mop = netbk->tx_map_ops; ++ while (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) && ++ !list_empty(&netbk->net_schedule_list)) { ++ struct xen_netif *netif; ++ struct xen_netif_tx_request txreq; ++ struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS]; ++ struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1]; ++ u16 pending_idx; ++ RING_IDX idx; ++ int work_to_do; ++ unsigned int data_len; ++ pending_ring_idx_t index; ++ ++ /* Get a netif from the list with work to do. */ ++ netif = poll_net_schedule_list(netbk); ++ if (!netif) ++ continue; ++ ++ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do); ++ if (!work_to_do) { ++ netif_put(netif); ++ continue; ++ } ++ ++ idx = netif->tx.req_cons; ++ rmb(); /* Ensure that we see the request before we copy it. */ ++ memcpy(&txreq, RING_GET_REQUEST(&netif->tx, idx), sizeof(txreq)); ++ ++ /* Credit-based scheduling. */ ++ if (txreq.size > netif->remaining_credit && ++ tx_credit_exceeded(netif, txreq.size)) { ++ netif_put(netif); ++ continue; ++ } ++ ++ netif->remaining_credit -= txreq.size; ++ ++ work_to_do--; ++ netif->tx.req_cons = ++idx; ++ ++ memset(extras, 0, sizeof(extras)); ++ if (txreq.flags & NETTXF_extra_info) { ++ work_to_do = netbk_get_extras(netif, extras, ++ work_to_do); ++ idx = netif->tx.req_cons; ++ if (unlikely(work_to_do < 0)) { ++ netbk_tx_err(netif, &txreq, idx); ++ continue; ++ } ++ } ++ ++ ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do); ++ if (unlikely(ret < 0)) { ++ netbk_tx_err(netif, &txreq, idx - ret); ++ continue; ++ } ++ idx += ret; ++ ++ if (unlikely(txreq.size < ETH_HLEN)) { ++ DPRINTK("Bad packet size: %d\n", txreq.size); ++ netbk_tx_err(netif, &txreq, idx); ++ continue; ++ } ++ ++ /* No crossing a page as the payload mustn't fragment. */ ++ if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) { ++ DPRINTK("txreq.offset: %x, size: %u, end: %lu\n", ++ txreq.offset, txreq.size, ++ (txreq.offset &~PAGE_MASK) + txreq.size); ++ netbk_tx_err(netif, &txreq, idx); ++ continue; ++ } ++ ++ index = pending_index(netbk->pending_cons); ++ pending_idx = netbk->pending_ring[index]; ++ ++ data_len = (txreq.size > PKT_PROT_LEN && ++ ret < MAX_SKB_FRAGS) ? ++ PKT_PROT_LEN : txreq.size; ++ ++ skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN, ++ GFP_ATOMIC | __GFP_NOWARN); ++ if (unlikely(skb == NULL)) { ++ DPRINTK("Can't allocate a skb in start_xmit.\n"); ++ netbk_tx_err(netif, &txreq, idx); ++ break; ++ } ++ ++ /* Packets passed to netif_rx() must have some headroom. 
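The data_len computation above fixes the copy-versus-map split for each guest tx request; stated explicitly (PKT_PROT_LEN is a header budget defined earlier in this patch):

/*
 * bytes [0, data_len)     copied into the skb linear area so headers
 *                         can be inspected without the granted page;
 * bytes [data_len, size)  left in the guest page and attached later
 *                         as frags[0] via the pending_idx trick.
 * data_len is capped at PKT_PROT_LEN only when the packet is larger
 * than that and a frag slot is free (ret < MAX_SKB_FRAGS); otherwise
 * the whole request is copied.
 */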
*/ ++ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); ++ ++ if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) { ++ struct xen_netif_extra_info *gso; ++ gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1]; ++ ++ if (netbk_set_skb_gso(skb, gso)) { ++ kfree_skb(skb); ++ netbk_tx_err(netif, &txreq, idx); ++ continue; ++ } ++ } ++ ++ gnttab_set_map_op(mop, idx_to_kaddr(netbk, pending_idx), ++ GNTMAP_host_map | GNTMAP_readonly, ++ txreq.gref, netif->domid); ++ mop++; ++ ++ memcpy(&netbk->pending_tx_info[pending_idx].req, ++ &txreq, sizeof(txreq)); ++ netbk->pending_tx_info[pending_idx].netif = netif; ++ *((u16 *)skb->data) = pending_idx; ++ ++ __skb_put(skb, data_len); ++ ++ skb_shinfo(skb)->nr_frags = ret; ++ if (data_len < txreq.size) { ++ skb_shinfo(skb)->nr_frags++; ++ skb_shinfo(skb)->frags[0].page = ++ (void *)(unsigned long)pending_idx; ++ } else { ++ /* Discriminate from any valid pending_idx value. */ ++ skb_shinfo(skb)->frags[0].page = (void *)~0UL; ++ } ++ ++ __skb_queue_tail(&netbk->tx_queue, skb); ++ ++ netbk->pending_cons++; ++ ++ mop = netbk_get_requests(netbk, netif, skb, txfrags, mop); ++ ++ netif->tx.req_cons = idx; ++ netif_schedule_work(netif); ++ ++ if ((mop - netbk->tx_map_ops) >= ARRAY_SIZE(netbk->tx_map_ops)) ++ break; ++ } ++ ++ return mop - netbk->tx_map_ops; ++} ++ ++static void net_tx_submit(struct xen_netbk *netbk) ++{ ++ struct gnttab_map_grant_ref *mop; ++ struct sk_buff *skb; ++ ++ mop = netbk->tx_map_ops; ++ while ((skb = __skb_dequeue(&netbk->tx_queue)) != NULL) { ++ struct xen_netif_tx_request *txp; ++ struct xen_netif *netif; ++ u16 pending_idx; ++ unsigned data_len; ++ ++ pending_idx = *((u16 *)skb->data); ++ netif = netbk->pending_tx_info[pending_idx].netif; ++ txp = &netbk->pending_tx_info[pending_idx].req; ++ ++ /* Check the remap error code. */ ++ if (unlikely(netbk_tx_check_mop(netbk, skb, &mop))) { ++ DPRINTK("netback grant failed.\n"); ++ skb_shinfo(skb)->nr_frags = 0; ++ kfree_skb(skb); ++ continue; ++ } ++ ++ data_len = skb->len; ++ memcpy(skb->data, ++ (void *)(idx_to_kaddr(netbk, pending_idx)|txp->offset), ++ data_len); ++ if (data_len < txp->size) { ++ /* Append the packet payload as a fragment. */ ++ txp->offset += data_len; ++ txp->size -= data_len; ++ } else { ++ /* Schedule a response immediately. */ ++ netif_idx_release(netbk, pending_idx); ++ } ++ ++ if (txp->flags & NETTXF_csum_blank) ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ else if (txp->flags & NETTXF_data_validated) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ ++ netbk_fill_frags(netbk, skb); ++ ++ /* ++ * If the initial fragment was < PKT_PROT_LEN then ++ * pull through some bytes from the other fragments to ++ * increase the linear region to PKT_PROT_LEN bytes. 
++ */ ++ if (skb_headlen(skb) < PKT_PROT_LEN && skb_is_nonlinear(skb)) { ++ int target = min_t(int, skb->len, PKT_PROT_LEN); ++ __pskb_pull_tail(skb, target - skb_headlen(skb)); ++ } ++ ++ skb->dev = netif->dev; ++ skb->protocol = eth_type_trans(skb, skb->dev); ++ ++ netif->stats.rx_bytes += skb->len; ++ netif->stats.rx_packets++; ++ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ if (skb_checksum_setup(skb)) { ++ DPRINTK("Can't setup checksum in net_tx_action\n"); ++ kfree_skb(skb); ++ continue; ++ } ++ } ++ ++ if (unlikely(netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) && ++ unlikely(skb_linearize(skb))) { ++ DPRINTK("Can't linearize skb in net_tx_action.\n"); ++ kfree_skb(skb); ++ continue; ++ } ++ ++ netif_rx_ni(skb); ++ netif->dev->last_rx = jiffies; ++ } ++} ++ ++/* Called after netfront has transmitted */ ++static void net_tx_action(unsigned long data) ++{ ++ struct xen_netbk *netbk = (struct xen_netbk *)data; ++ unsigned nr_mops; ++ int ret; ++ ++ net_tx_action_dealloc(netbk); ++ ++ nr_mops = net_tx_build_mops(netbk); ++ ++ if (nr_mops == 0) ++ goto out; ++ ++ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, ++ netbk->tx_map_ops, nr_mops); ++ BUG_ON(ret); ++ ++ net_tx_submit(netbk); ++out: ++ if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB && ++ !list_empty(&netbk->pending_inuse_head)) { ++ struct netbk_tx_pending_inuse *oldest; ++ ++ oldest = list_entry(netbk->pending_inuse_head.next, ++ struct netbk_tx_pending_inuse, list); ++ mod_timer(&netbk->netbk_tx_pending_timer, ++ oldest->alloc_time + HZ); ++ } ++} ++ ++static void netif_idx_release(struct xen_netbk *netbk, u16 pending_idx) ++{ ++ static DEFINE_SPINLOCK(_lock); ++ unsigned long flags; ++ pending_ring_idx_t index; ++ ++ spin_lock_irqsave(&_lock, flags); ++ index = pending_index(netbk->dealloc_prod); ++ netbk->dealloc_ring[index] = pending_idx; ++ /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */ ++ smp_wmb(); ++ netbk->dealloc_prod++; ++ spin_unlock_irqrestore(&_lock, flags); ++ ++ xen_netbk_bh_handler(netbk, 0); ++} ++ ++static void netif_page_release(struct page *page, unsigned int order) ++{ ++ unsigned int group, idx; ++ int foreign = netif_get_page_ext(page, &group, &idx); ++ ++ BUG_ON(!foreign); ++ BUG_ON(order); ++ ++ netif_idx_release(&xen_netbk[group], idx); ++} ++ ++irqreturn_t netif_be_int(int irq, void *dev_id) ++{ ++ struct xen_netif *netif = dev_id; ++ struct xen_netbk *netbk; ++ ++ if (netif->group == -1) ++ return IRQ_NONE; ++ ++ netbk = &xen_netbk[netif->group]; ++ ++ add_to_net_schedule_list_tail(netif); ++ maybe_schedule_tx_action(netbk); ++ ++ if (netif_schedulable(netif) && !netbk_queue_full(netif)) ++ netif_wake_queue(netif->dev); ++ ++ return IRQ_HANDLED; ++} ++ ++static void make_tx_response(struct xen_netif *netif, ++ struct xen_netif_tx_request *txp, ++ s8 st) ++{ ++ RING_IDX i = netif->tx.rsp_prod_pvt; ++ struct xen_netif_tx_response *resp; ++ int notify; ++ ++ resp = RING_GET_RESPONSE(&netif->tx, i); ++ resp->id = txp->id; ++ resp->status = st; ++ ++ if (txp->flags & NETTXF_extra_info) ++ RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL; ++ ++ netif->tx.rsp_prod_pvt = ++i; ++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify); ++ ++ /* ++ * netfront_smartpoll_active indicates whether netfront timer ++ * is active. 
++ */ ++ if ((netif->smart_poll == 1)) { ++ if (!(netif->rx.sring->private.netif.smartpoll_active)) { ++ notify_remote_via_irq(netif->irq); ++ netif->rx.sring->private.netif.smartpoll_active = 1; ++ } ++ } else if (notify) ++ notify_remote_via_irq(netif->irq); ++} ++ ++static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif, ++ u16 id, ++ s8 st, ++ u16 offset, ++ u16 size, ++ u16 flags) ++{ ++ RING_IDX i = netif->rx.rsp_prod_pvt; ++ struct xen_netif_rx_response *resp; ++ ++ resp = RING_GET_RESPONSE(&netif->rx, i); ++ resp->offset = offset; ++ resp->flags = flags; ++ resp->id = id; ++ resp->status = (s16)size; ++ if (st < 0) ++ resp->status = (s16)st; ++ ++ netif->rx.rsp_prod_pvt = ++i; ++ ++ return resp; ++} ++ ++#ifdef NETBE_DEBUG_INTERRUPT ++static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs) ++{ ++ struct list_head *ent; ++ struct xen_netif *netif; ++ int i = 0; ++ int group = 0; ++ ++ printk(KERN_ALERT "netif_schedule_list:\n"); ++ ++ for (group = 0; group < xen_netbk_group_nr; group++) { ++ struct xen_netbk *netbk = &xen_netbk[group]; ++ spin_lock_irq(&netbk->net_schedule_list_lock); ++ printk(KERN_ALERT "xen_netback group number: %d\n", group); ++ list_for_each(ent, &netbk->net_schedule_list) { ++ netif = list_entry(ent, struct xen_netif, list); ++ printk(KERN_ALERT " %d: private(rx_req_cons=%08x " ++ "rx_resp_prod=%08x\n", ++ i, netif->rx.req_cons, netif->rx.rsp_prod_pvt); ++ printk(KERN_ALERT ++ " tx_req_cons=%08x, tx_resp_prod=%08x)\n", ++ netif->tx.req_cons, netif->tx.rsp_prod_pvt); ++ printk(KERN_ALERT ++ " shared(rx_req_prod=%08x " ++ "rx_resp_prod=%08x\n", ++ netif->rx.sring->req_prod, ++ netif->rx.sring->rsp_prod); ++ printk(KERN_ALERT ++ " rx_event=%08x, tx_req_prod=%08x\n", ++ netif->rx.sring->rsp_event, ++ netif->tx.sring->req_prod); ++ printk(KERN_ALERT ++ " tx_resp_prod=%08x, tx_event=%08x)\n", ++ netif->tx.sring->rsp_prod, ++ netif->tx.sring->rsp_event); ++ i++; ++ } ++ spin_unlock_irq(&netbk->net_schedule_list_lock); ++ } ++ ++ printk(KERN_ALERT " ** End of netif_schedule_list **\n"); ++ ++ return IRQ_HANDLED; ++} ++#endif ++ ++static inline int rx_work_todo(struct xen_netbk *netbk) ++{ ++ return !skb_queue_empty(&netbk->rx_queue); ++} ++ ++static inline int tx_work_todo(struct xen_netbk *netbk) ++{ ++ if (netbk->dealloc_cons != netbk->dealloc_prod) ++ return 1; ++ ++ if (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) && ++ !list_empty(&netbk->net_schedule_list)) ++ return 1; ++ ++ return 0; ++} ++ ++static int netbk_action_thread(void *data) ++{ ++ struct xen_netbk *netbk = (struct xen_netbk *)data; ++ while (!kthread_should_stop()) { ++ wait_event_interruptible(netbk->kthread.netbk_action_wq, ++ rx_work_todo(netbk) ++ || tx_work_todo(netbk) ++ || kthread_should_stop()); ++ cond_resched(); ++ ++ if (kthread_should_stop()) ++ break; ++ ++ if (rx_work_todo(netbk)) ++ net_rx_action((unsigned long)netbk); ++ ++ if (tx_work_todo(netbk)) ++ net_tx_action((unsigned long)netbk); ++ } ++ ++ return 0; ++} ++ ++static int __init netback_init(void) ++{ ++ int i; ++ struct page *page; ++ int rc = 0; ++ int group; ++ ++ if (!xen_pv_domain()) ++ return -ENODEV; ++ ++ xen_netbk_group_nr = num_online_cpus(); ++ xen_netbk = vmalloc(sizeof(struct xen_netbk) * xen_netbk_group_nr); ++ if (!xen_netbk) { ++ printk(KERN_ALERT "%s: out of memory\n", __func__); ++ return -ENOMEM; ++ } ++ memset(xen_netbk, 0, sizeof(struct xen_netbk) * xen_netbk_group_nr); ++ ++ /* We can increase reservation by this much in net_rx_action(). 
*/ ++// balloon_update_driver_allowance(NET_RX_RING_SIZE); ++ ++ for (group = 0; group < xen_netbk_group_nr; group++) { ++ struct xen_netbk *netbk = &xen_netbk[group]; ++ skb_queue_head_init(&netbk->rx_queue); ++ skb_queue_head_init(&netbk->tx_queue); ++ ++ init_timer(&netbk->net_timer); ++ netbk->net_timer.data = (unsigned long)netbk; ++ netbk->net_timer.function = net_alarm; ++ ++ init_timer(&netbk->netbk_tx_pending_timer); ++ netbk->netbk_tx_pending_timer.data = (unsigned long)netbk; ++ netbk->netbk_tx_pending_timer.function = ++ netbk_tx_pending_timeout; ++ ++ netbk->mmap_pages = ++ alloc_empty_pages_and_pagevec(MAX_PENDING_REQS); ++ if (!netbk->mmap_pages) { ++ printk(KERN_ALERT "%s: out of memory\n", __func__); ++ del_timer(&netbk->netbk_tx_pending_timer); ++ del_timer(&netbk->net_timer); ++ rc = -ENOMEM; ++ goto failed_init; ++ } ++ ++ for (i = 0; i < MAX_PENDING_REQS; i++) { ++ page = netbk->mmap_pages[i]; ++ SetPageForeign(page, netif_page_release); ++ netif_set_page_ext(page, group, i); ++ INIT_LIST_HEAD(&netbk->pending_inuse[i].list); ++ } ++ ++ netbk->pending_cons = 0; ++ netbk->pending_prod = MAX_PENDING_REQS; ++ for (i = 0; i < MAX_PENDING_REQS; i++) ++ netbk->pending_ring[i] = i; ++ ++ if (MODPARM_netback_kthread) { ++ init_waitqueue_head(&netbk->kthread.netbk_action_wq); ++ netbk->kthread.task = ++ kthread_create(netbk_action_thread, ++ (void *)netbk, ++ "netback/%u", group); ++ ++ if (!IS_ERR(netbk->kthread.task)) { ++ kthread_bind(netbk->kthread.task, group); ++ } else { ++ printk(KERN_ALERT ++ "kthread_run() fails at netback\n"); ++ free_empty_pages_and_pagevec(netbk->mmap_pages, ++ MAX_PENDING_REQS); ++ del_timer(&netbk->netbk_tx_pending_timer); ++ del_timer(&netbk->net_timer); ++ rc = PTR_ERR(netbk->kthread.task); ++ goto failed_init; ++ } ++ } else { ++ tasklet_init(&netbk->tasklet.net_tx_tasklet, ++ net_tx_action, ++ (unsigned long)netbk); ++ tasklet_init(&netbk->tasklet.net_rx_tasklet, ++ net_rx_action, ++ (unsigned long)netbk); ++ } ++ ++ INIT_LIST_HEAD(&netbk->pending_inuse_head); ++ INIT_LIST_HEAD(&netbk->net_schedule_list); ++ ++ spin_lock_init(&netbk->net_schedule_list_lock); ++ ++ atomic_set(&netbk->netfront_count, 0); ++ ++ if (MODPARM_netback_kthread) ++ wake_up_process(netbk->kthread.task); ++ } ++ ++ netbk_copy_skb_mode = NETBK_DONT_COPY_SKB; ++ if (MODPARM_copy_skb) { ++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace, ++ NULL, 0)) ++ netbk_copy_skb_mode = NETBK_ALWAYS_COPY_SKB; ++ else ++ netbk_copy_skb_mode = NETBK_DELAYED_COPY_SKB; ++ } ++ ++ //netif_accel_init(); ++ ++ rc = netif_xenbus_init(); ++ if (rc) ++ goto failed_init; ++ ++#ifdef NETBE_DEBUG_INTERRUPT ++ (void)bind_virq_to_irqhandler(VIRQ_DEBUG, ++ 0, ++ netif_be_dbg, ++ IRQF_SHARED, ++ "net-be-dbg", ++ &netif_be_dbg); ++#endif ++ ++ return 0; ++ ++failed_init: ++ for (i = 0; i < group; i++) { ++ struct xen_netbk *netbk = &xen_netbk[i]; ++ free_empty_pages_and_pagevec(netbk->mmap_pages, ++ MAX_PENDING_REQS); ++ del_timer(&netbk->netbk_tx_pending_timer); ++ del_timer(&netbk->net_timer); ++ if (MODPARM_netback_kthread) ++ kthread_stop(netbk->kthread.task); ++ } ++ vfree(xen_netbk); ++ return rc; ++ ++} ++ ++module_init(netback_init); ++ ++MODULE_LICENSE("Dual BSD/GPL"); +diff --git a/drivers/xen/netback/xenbus.c b/drivers/xen/netback/xenbus.c +new file mode 100644 +index 0000000..1930f64 +--- /dev/null ++++ b/drivers/xen/netback/xenbus.c +@@ -0,0 +1,518 @@ ++/* Xenbus code for netif backend ++ Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au> ++ Copyright (C) 2005 XenSource Ltd 
++ ++ This program is free software; you can redistribute it and/or modify ++ it under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 2 of the License, or ++ (at your option) any later version. ++ ++ This program is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with this program; if not, write to the Free Software ++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++*/ ++ ++#include <stdarg.h> ++#include <linux/module.h> ++#include <xen/xenbus.h> ++#include "common.h" ++ ++#if 0 ++#undef DPRINTK ++#define DPRINTK(fmt, args...) \ ++ printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args) ++#endif ++ ++ ++static int connect_rings(struct backend_info *); ++static void connect(struct backend_info *); ++static void backend_create_netif(struct backend_info *be); ++static void unregister_hotplug_status_watch(struct backend_info *be); ++ ++static int netback_remove(struct xenbus_device *dev) ++{ ++ struct backend_info *be = dev_get_drvdata(&dev->dev); ++ ++ //netback_remove_accelerators(be, dev); ++ ++ unregister_hotplug_status_watch(be); ++ if (be->netif) { ++ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE); ++ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status"); ++ netif_disconnect(be->netif); ++ be->netif = NULL; ++ } ++ kfree(be); ++ dev_set_drvdata(&dev->dev, NULL); ++ return 0; ++} ++ ++ ++/** ++ * Entry point to this code when a new device is created. Allocate the basic ++ * structures and switch to InitWait. ++ */ ++static int netback_probe(struct xenbus_device *dev, ++ const struct xenbus_device_id *id) ++{ ++ const char *message; ++ struct xenbus_transaction xbt; ++ int err; ++ int sg; ++ struct backend_info *be = kzalloc(sizeof(struct backend_info), ++ GFP_KERNEL); ++ if (!be) { ++ xenbus_dev_fatal(dev, -ENOMEM, ++ "allocating backend structure"); ++ return -ENOMEM; ++ } ++ ++ be->dev = dev; ++ dev_set_drvdata(&dev->dev, be); ++ ++ sg = 1; ++ if (netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) ++ sg = 0; ++ ++ do { ++ err = xenbus_transaction_start(&xbt); ++ if (err) { ++ xenbus_dev_fatal(dev, err, "starting transaction"); ++ goto fail; ++ } ++ ++ err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", sg); ++ if (err) { ++ message = "writing feature-sg"; ++ goto abort_transaction; ++ } ++ ++ err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", ++ "%d", sg); ++ if (err) { ++ message = "writing feature-gso-tcpv4"; ++ goto abort_transaction; ++ } ++ ++ /* We support rx-copy path. */ ++ err = xenbus_printf(xbt, dev->nodename, ++ "feature-rx-copy", "%d", 1); ++ if (err) { ++ message = "writing feature-rx-copy"; ++ goto abort_transaction; ++ } ++ ++ /* ++ * We don't support rx-flip path (except old guests who don't ++ * grok this feature flag). 
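The transaction below advertises each backend capability as its own xenstore key under dev->nodename. What it leaves in the store looks roughly like this (path shape assumed; values follow the xenbus_printf() calls):

backend/vif/<frontend-domid>/<handle>/feature-sg          = "1"  ("0" in always-copy mode)
backend/vif/<frontend-domid>/<handle>/feature-gso-tcpv4   = "1"  (tracks feature-sg)
backend/vif/<frontend-domid>/<handle>/feature-rx-copy     = "1"
backend/vif/<frontend-domid>/<handle>/feature-rx-flip     = "0"
backend/vif/<frontend-domid>/<handle>/feature-smart-poll  = "1"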
++ */ ++ err = xenbus_printf(xbt, dev->nodename, ++ "feature-rx-flip", "%d", 0); ++ if (err) { ++ message = "writing feature-rx-flip"; ++ goto abort_transaction; ++ } ++ ++ /* We support data smart poll mechanism */ ++ err = xenbus_printf(xbt, dev->nodename, ++ "feature-smart-poll", "%d", 1); ++ if (err) { ++ message = "writing feature-smart-poll"; ++ goto abort_transaction; ++ } ++ ++ err = xenbus_transaction_end(xbt, 0); ++ } while (err == -EAGAIN); ++ ++ if (err) { ++ xenbus_dev_fatal(dev, err, "completing transaction"); ++ goto fail; ++ } ++ ++ //netback_probe_accelerators(be, dev); ++ ++ err = xenbus_switch_state(dev, XenbusStateInitWait); ++ if (err) ++ goto fail; ++ ++ /* This kicks hotplug scripts, so do it immediately. */ ++ backend_create_netif(be); ++ ++ return 0; ++ ++abort_transaction: ++ xenbus_transaction_end(xbt, 1); ++ xenbus_dev_fatal(dev, err, "%s", message); ++fail: ++ DPRINTK("failed"); ++ netback_remove(dev); ++ return err; ++} ++ ++ ++/** ++ * Handle the creation of the hotplug script environment. We add the script ++ * and vif variables to the environment, for the benefit of the vif-* hotplug ++ * scripts. ++ */ ++static int netback_uevent(struct xenbus_device *xdev, struct kobj_uevent_env *env) ++{ ++ struct backend_info *be = dev_get_drvdata(&xdev->dev); ++ char *val; ++ ++ DPRINTK("netback_uevent"); ++ ++ val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL); ++ if (IS_ERR(val)) { ++ int err = PTR_ERR(val); ++ xenbus_dev_fatal(xdev, err, "reading script"); ++ return err; ++ } ++ else { ++ if (add_uevent_var(env, "script=%s", val)) { ++ kfree(val); ++ return -ENOMEM; ++ } ++ kfree(val); ++ } ++ ++ if (be && be->netif && add_uevent_var(env, "vif=%s", be->netif->dev->name)) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++ ++static void backend_create_netif(struct backend_info *be) ++{ ++ int err; ++ long handle; ++ struct xenbus_device *dev = be->dev; ++ ++ if (be->netif != NULL) ++ return; ++ ++ err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle); ++ if (err != 1) { ++ xenbus_dev_fatal(dev, err, "reading handle"); ++ return; ++ } ++ ++ be->netif = netif_alloc(&dev->dev, dev->otherend_id, handle); ++ if (IS_ERR(be->netif)) { ++ err = PTR_ERR(be->netif); ++ be->netif = NULL; ++ xenbus_dev_fatal(dev, err, "creating interface"); ++ return; ++ } ++ ++ kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE); ++} ++ ++ ++static void disconnect_backend(struct xenbus_device *dev) ++{ ++ struct backend_info *be = dev_get_drvdata(&dev->dev); ++ ++ if (be->netif) { ++ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status"); ++ netif_disconnect(be->netif); ++ be->netif = NULL; ++ } ++} ++ ++/** ++ * Callback received when the frontend's state changes. 
++ */ ++static void frontend_changed(struct xenbus_device *dev, ++ enum xenbus_state frontend_state) ++{ ++ struct backend_info *be = dev_get_drvdata(&dev->dev); ++ ++ DPRINTK("%s", xenbus_strstate(frontend_state)); ++ ++ be->frontend_state = frontend_state; ++ ++ switch (frontend_state) { ++ case XenbusStateInitialising: ++ if (dev->state == XenbusStateClosed) { ++ printk(KERN_INFO "%s: %s: prepare for reconnect\n", ++ __FUNCTION__, dev->nodename); ++ xenbus_switch_state(dev, XenbusStateInitWait); ++ } ++ break; ++ ++ case XenbusStateInitialised: ++ break; ++ ++ case XenbusStateConnected: ++ if (dev->state == XenbusStateConnected) ++ break; ++ backend_create_netif(be); ++ if (be->netif) ++ connect(be); ++ break; ++ ++ case XenbusStateClosing: ++ if (be->netif) ++ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE); ++ disconnect_backend(dev); ++ xenbus_switch_state(dev, XenbusStateClosing); ++ break; ++ ++ case XenbusStateClosed: ++ xenbus_switch_state(dev, XenbusStateClosed); ++ if (xenbus_dev_is_online(dev)) ++ break; ++ /* fall through if not online */ ++ case XenbusStateUnknown: ++ device_unregister(&dev->dev); ++ break; ++ ++ default: ++ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend", ++ frontend_state); ++ break; ++ } ++} ++ ++ ++static void xen_net_read_rate(struct xenbus_device *dev, ++ unsigned long *bytes, unsigned long *usec) ++{ ++ char *s, *e; ++ unsigned long b, u; ++ char *ratestr; ++ ++ /* Default to unlimited bandwidth. */ ++ *bytes = ~0UL; ++ *usec = 0; ++ ++ ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL); ++ if (IS_ERR(ratestr)) ++ return; ++ ++ s = ratestr; ++ b = simple_strtoul(s, &e, 10); ++ if ((s == e) || (*e != ',')) ++ goto fail; ++ ++ s = e + 1; ++ u = simple_strtoul(s, &e, 10); ++ if ((s == e) || (*e != '\0')) ++ goto fail; ++ ++ *bytes = b; ++ *usec = u; ++ ++ kfree(ratestr); ++ return; ++ ++ fail: ++ WPRINTK("Failed to parse network rate limit. Traffic unlimited.\n"); ++ kfree(ratestr); ++} ++ ++static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[]) ++{ ++ char *s, *e, *macstr; ++ int i; ++ ++ macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL); ++ if (IS_ERR(macstr)) ++ return PTR_ERR(macstr); ++ ++ for (i = 0; i < ETH_ALEN; i++) { ++ mac[i] = simple_strtoul(s, &e, 16); ++ if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) { ++ kfree(macstr); ++ return -ENOENT; ++ } ++ s = e+1; ++ } ++ ++ kfree(macstr); ++ return 0; ++} ++ ++static void unregister_hotplug_status_watch(struct backend_info *be) ++{ ++ if (be->have_hotplug_status_watch) { ++ unregister_xenbus_watch(&be->hotplug_status_watch); ++ kfree(be->hotplug_status_watch.node); ++ } ++ be->have_hotplug_status_watch = 0; ++} ++ ++static void hotplug_status_changed(struct xenbus_watch *watch, ++ const char **vec, ++ unsigned int vec_size) ++{ ++ struct backend_info *be = container_of(watch, ++ struct backend_info, ++ hotplug_status_watch); ++ char *str; ++ unsigned int len; ++ ++ str = xenbus_read(XBT_NIL, be->dev->nodename, "hotplug-status", &len); ++ if (IS_ERR(str)) ++ return; ++ if (len == sizeof("connected")-1 && !memcmp(str, "connected", len)) { ++ xenbus_switch_state(be->dev, XenbusStateConnected); ++ /* Not interested in this watch anymore. 
*/ ++ unregister_hotplug_status_watch(be); ++ } ++ kfree(str); ++} ++ ++static void connect(struct backend_info *be) ++{ ++ int err; ++ struct xenbus_device *dev = be->dev; ++ ++ err = connect_rings(be); ++ if (err) ++ return; ++ ++ err = xen_net_read_mac(dev, be->netif->fe_dev_addr); ++ if (err) { ++ xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename); ++ return; ++ } ++ ++ xen_net_read_rate(dev, &be->netif->credit_bytes, ++ &be->netif->credit_usec); ++ be->netif->remaining_credit = be->netif->credit_bytes; ++ ++ unregister_hotplug_status_watch(be); ++ err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch, ++ hotplug_status_changed, ++ "%s/%s", dev->nodename, "hotplug-status"); ++ if (err) { ++ /* Switch now, since we can't do a watch. */ ++ xenbus_switch_state(dev, XenbusStateConnected); ++ } else { ++ be->have_hotplug_status_watch = 1; ++ } ++ ++ netif_wake_queue(be->netif->dev); ++} ++ ++ ++static int connect_rings(struct backend_info *be) ++{ ++ struct xen_netif *netif = be->netif; ++ struct xenbus_device *dev = be->dev; ++ unsigned long tx_ring_ref, rx_ring_ref; ++ unsigned int evtchn, rx_copy; ++ int err; ++ int val; ++ ++ DPRINTK(""); ++ ++ err = xenbus_gather(XBT_NIL, dev->otherend, ++ "tx-ring-ref", "%lu", &tx_ring_ref, ++ "rx-ring-ref", "%lu", &rx_ring_ref, ++ "event-channel", "%u", &evtchn, NULL); ++ if (err) { ++ xenbus_dev_fatal(dev, err, ++ "reading %s/ring-ref and event-channel", ++ dev->otherend); ++ return err; ++ } ++ ++ err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u", ++ &rx_copy); ++ if (err == -ENOENT) { ++ err = 0; ++ rx_copy = 0; ++ } ++ if (err < 0) { ++ xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy", ++ dev->otherend); ++ return err; ++ } ++ if (!rx_copy) ++ return -EOPNOTSUPP; ++ ++ if (netif->dev->tx_queue_len != 0) { ++ if (xenbus_scanf(XBT_NIL, dev->otherend, ++ "feature-rx-notify", "%d", &val) < 0) ++ val = 0; ++ if (val) ++ netif->can_queue = 1; ++ else ++ /* Must be non-zero for pfifo_fast to work. */ ++ netif->dev->tx_queue_len = 1; ++ } ++ ++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", ++ "%d", &val) < 0) ++ val = 0; ++ netif->can_sg = !!val; ++ ++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", ++ "%d", &val) < 0) ++ val = 0; ++ netif->gso = !!val; ++ ++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4-prefix", ++ "%d", &val) < 0) ++ val = 0; ++ netif->gso_prefix = !!val; ++ ++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload", ++ "%d", &val) < 0) ++ val = 0; ++ netif->csum = !val; ++ ++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-smart-poll", ++ "%d", &val) < 0) ++ val = 0; ++ netif->smart_poll = !!val; ++ ++ /* Set dev->features */ ++ netif_set_features(netif); ++ ++ /* Map the shared frame, irq etc. 
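connect_rings() above treats every optional frontend key the same way: a failed xenbus_scanf() is read as the feature being absent. A hypothetical helper makes the pattern explicit (not in the patch):

static int xenbus_read_feature(struct xenbus_device *dev,
                               const char *node, int dflt)
{
        int val;

        if (xenbus_scanf(XBT_NIL, dev->otherend, node, "%d", &val) < 0)
                val = dflt;     /* absent or unparsable: use the default */
        return val;
}

e.g. netif->can_sg = !!xenbus_read_feature(dev, "feature-sg", 0);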
*/ ++ err = netif_map(netif, tx_ring_ref, rx_ring_ref, evtchn); ++ if (err) { ++ xenbus_dev_fatal(dev, err, ++ "mapping shared-frames %lu/%lu port %u", ++ tx_ring_ref, rx_ring_ref, evtchn); ++ return err; ++ } ++ return 0; ++} ++ ++ ++/* ** Driver Registration ** */ ++ ++ ++static const struct xenbus_device_id netback_ids[] = { ++ { "vif" }, ++ { "" } ++}; ++ ++ ++static struct xenbus_driver netback = { ++ .name = "vif", ++ .owner = THIS_MODULE, ++ .ids = netback_ids, ++ .probe = netback_probe, ++ .remove = netback_remove, ++ .uevent = netback_uevent, ++ .otherend_changed = frontend_changed, ++}; ++ ++ ++int netif_xenbus_init(void) ++{ ++ printk(KERN_CRIT "registering netback\n"); ++ return xenbus_register_backend(&netback); ++} +diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c +new file mode 100644 +index 0000000..ae693e7 +--- /dev/null ++++ b/drivers/xen/pci.c +@@ -0,0 +1,124 @@ ++/* ++ * Copyright (c) 2009, Intel Corporation. ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms and conditions of the GNU General Public License, ++ * version 2, as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ * more details. ++ * ++ * You should have received a copy of the GNU General Public License along with ++ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++ * Place - Suite 330, Boston, MA 02111-1307 USA. ++ * ++ * Author: Weidong Han <weidong.han@intel.com> ++ */ ++ ++#include <linux/pci.h> ++ ++#include <xen/interface/xen.h> ++#include <xen/interface/physdev.h> ++ ++#include <asm/xen/hypervisor.h> ++#include <asm/xen/hypercall.h> ++ ++#include "../pci/pci.h" ++ ++ ++#ifdef CONFIG_PCI_IOV ++#define HANDLE_PCI_IOV 1 ++#else ++#define HANDLE_PCI_IOV 0 ++#endif ++ ++static int xen_add_device(struct device *dev) ++{ ++ int r; ++ struct pci_dev *pci_dev = to_pci_dev(dev); ++ ++ if (HANDLE_PCI_IOV && pci_dev->is_virtfn) { ++ struct physdev_manage_pci_ext manage_pci_ext = { ++ .bus = pci_dev->bus->number, ++ .devfn = pci_dev->devfn, ++ .is_virtfn = 1, ++#ifdef CONFIG_PCI_IOV ++ .physfn.bus = pci_dev->physfn->bus->number, ++ .physfn.devfn = pci_dev->physfn->devfn, ++#endif ++ }; ++ ++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext, ++ &manage_pci_ext); ++ } else if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) { ++ struct physdev_manage_pci_ext manage_pci_ext = { ++ .bus = pci_dev->bus->number, ++ .devfn = pci_dev->devfn, ++ .is_extfn = 1, ++ }; ++ ++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext, ++ &manage_pci_ext); ++ } else { ++ struct physdev_manage_pci manage_pci = { ++ .bus = pci_dev->bus->number, ++ .devfn = pci_dev->devfn, ++ }; ++ ++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add, ++ &manage_pci); ++ } ++ ++ return r; ++} ++ ++static int xen_remove_device(struct device *dev) ++{ ++ int r; ++ struct pci_dev *pci_dev = to_pci_dev(dev); ++ struct physdev_manage_pci manage_pci; ++ ++ manage_pci.bus = pci_dev->bus->number; ++ manage_pci.devfn = pci_dev->devfn; ++ ++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove, ++ &manage_pci); ++ ++ return r; ++} ++ ++static int xen_pci_notifier(struct notifier_block *nb, ++ unsigned long action, void *data) ++{ ++ struct device *dev = data; ++ int r = 0; ++ ++ switch (action) { ++ case BUS_NOTIFY_ADD_DEVICE: ++ r 
= xen_add_device(dev); ++ break; ++ case BUS_NOTIFY_DEL_DEVICE: ++ r = xen_remove_device(dev); ++ break; ++ default: ++ break; ++ } ++ ++ return r; ++} ++ ++struct notifier_block device_nb = { ++ .notifier_call = xen_pci_notifier, ++}; ++ ++static int __init register_xen_pci_notifier(void) ++{ ++ if (!xen_pv_domain()) ++ return 0; ++ ++ return bus_register_notifier(&pci_bus_type, &device_nb); ++} ++ ++arch_initcall(register_xen_pci_notifier); +diff --git a/drivers/xen/pciback/Makefile b/drivers/xen/pciback/Makefile +new file mode 100644 +index 0000000..38bc123 +--- /dev/null ++++ b/drivers/xen/pciback/Makefile +@@ -0,0 +1,17 @@ ++obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback.o ++ ++xen-pciback-y := pci_stub.o pciback_ops.o xenbus.o ++xen-pciback-y += conf_space.o conf_space_header.o \ ++ conf_space_capability.o \ ++ conf_space_capability_vpd.o \ ++ conf_space_capability_pm.o \ ++ conf_space_quirks.o ++xen-pciback-$(CONFIG_PCI_MSI) += conf_space_capability_msi.o ++xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_VPCI) += vpci.o ++xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_SLOT) += slot.o ++xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_PASS) += passthrough.o ++xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_CONTROLLER) += controller.o ++ ++ifeq ($(CONFIG_XEN_PCIDEV_BE_DEBUG),y) ++EXTRA_CFLAGS += -DDEBUG ++endif +diff --git a/drivers/xen/pciback/conf_space.c b/drivers/xen/pciback/conf_space.c +new file mode 100644 +index 0000000..370c18e +--- /dev/null ++++ b/drivers/xen/pciback/conf_space.c +@@ -0,0 +1,435 @@ ++/* ++ * PCI Backend - Functions for creating a virtual configuration space for ++ * exported PCI Devices. ++ * It's dangerous to allow PCI Driver Domains to change their ++ * device's resources (memory, i/o ports, interrupts). We need to ++ * restrict changes to certain PCI Configuration registers: ++ * BARs, INTERRUPT_PIN, most registers in the header... 
++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/pci.h> ++#include "pciback.h" ++#include "conf_space.h" ++#include "conf_space_quirks.h" ++ ++static int permissive; ++module_param(permissive, bool, 0644); ++ ++#define DEFINE_PCI_CONFIG(op, size, type) \ ++int pciback_##op##_config_##size \ ++(struct pci_dev *dev, int offset, type value, void *data) \ ++{ \ ++ return pci_##op##_config_##size(dev, offset, value); \ ++} ++ ++DEFINE_PCI_CONFIG(read, byte, u8 *) ++DEFINE_PCI_CONFIG(read, word, u16 *) ++DEFINE_PCI_CONFIG(read, dword, u32 *) ++ ++DEFINE_PCI_CONFIG(write, byte, u8) ++DEFINE_PCI_CONFIG(write, word, u16) ++DEFINE_PCI_CONFIG(write, dword, u32) ++ ++static int conf_space_read(struct pci_dev *dev, ++ const struct config_field_entry *entry, ++ int offset, u32 *value) ++{ ++ int ret = 0; ++ const struct config_field *field = entry->field; ++ ++ *value = 0; ++ ++ switch (field->size) { ++ case 1: ++ if (field->u.b.read) ++ ret = field->u.b.read(dev, offset, (u8 *) value, ++ entry->data); ++ break; ++ case 2: ++ if (field->u.w.read) ++ ret = field->u.w.read(dev, offset, (u16 *) value, ++ entry->data); ++ break; ++ case 4: ++ if (field->u.dw.read) ++ ret = field->u.dw.read(dev, offset, value, entry->data); ++ break; ++ } ++ return ret; ++} ++ ++static int conf_space_write(struct pci_dev *dev, ++ const struct config_field_entry *entry, ++ int offset, u32 value) ++{ ++ int ret = 0; ++ const struct config_field *field = entry->field; ++ ++ switch (field->size) { ++ case 1: ++ if (field->u.b.write) ++ ret = field->u.b.write(dev, offset, (u8) value, ++ entry->data); ++ break; ++ case 2: ++ if (field->u.w.write) ++ ret = field->u.w.write(dev, offset, (u16) value, ++ entry->data); ++ break; ++ case 4: ++ if (field->u.dw.write) ++ ret = field->u.dw.write(dev, offset, value, ++ entry->data); ++ break; ++ } ++ return ret; ++} ++ ++static inline u32 get_mask(int size) ++{ ++ if (size == 1) ++ return 0xff; ++ else if (size == 2) ++ return 0xffff; ++ else ++ return 0xffffffff; ++} ++ ++static inline int valid_request(int offset, int size) ++{ ++ /* Validate request (no un-aligned requests) */ ++ if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0) ++ return 1; ++ return 0; ++} ++ ++static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask, ++ int offset) ++{ ++ if (offset >= 0) { ++ new_val_mask <<= (offset * 8); ++ new_val <<= (offset * 8); ++ } else { ++ new_val_mask >>= (offset * -8); ++ new_val >>= (offset * -8); ++ } ++ val = (val & ~new_val_mask) | (new_val & new_val_mask); ++ ++ return val; ++} ++ ++static int pcibios_err_to_errno(int err) ++{ ++ switch (err) { ++ case PCIBIOS_SUCCESSFUL: ++ return XEN_PCI_ERR_success; ++ case PCIBIOS_DEVICE_NOT_FOUND: ++ return XEN_PCI_ERR_dev_not_found; ++ case PCIBIOS_BAD_REGISTER_NUMBER: ++ return XEN_PCI_ERR_invalid_offset; ++ case PCIBIOS_FUNC_NOT_SUPPORTED: ++ return XEN_PCI_ERR_not_implemented; ++ case PCIBIOS_SET_FAILED: ++ return XEN_PCI_ERR_access_denied; ++ } ++ return err; ++} ++ ++int pciback_config_read(struct pci_dev *dev, int offset, int size, ++ u32 *ret_val) ++{ ++ int err = 0; ++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev); ++ const struct config_field_entry *cfg_entry; ++ const struct config_field *field; ++ int req_start, req_end, field_start, field_end; ++ /* if read fails for any reason, return 0 ++ * (as if device didn't respond) */ ++ u32 value = 0, tmp_val; ++ ++ if (unlikely(verbose_request)) ++ printk(KERN_DEBUG "pciback: %s: read %d bytes 
at 0x%x\n",
++	       pci_name(dev), size, offset);
++
++	if (!valid_request(offset, size)) {
++		err = XEN_PCI_ERR_invalid_offset;
++		goto out;
++	}
++
++	/* Get the real value first, then modify as appropriate */
++	switch (size) {
++	case 1:
++		err = pci_read_config_byte(dev, offset, (u8 *) &value);
++		break;
++	case 2:
++		err = pci_read_config_word(dev, offset, (u16 *) &value);
++		break;
++	case 4:
++		err = pci_read_config_dword(dev, offset, &value);
++		break;
++	}
++
++	list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++		field = cfg_entry->field;
++
++		req_start = offset;
++		req_end = offset + size;
++		field_start = OFFSET(cfg_entry);
++		field_end = OFFSET(cfg_entry) + field->size;
++
++		if ((req_start >= field_start && req_start < field_end)
++		    || (req_end > field_start && req_end <= field_end)) {
++			err = conf_space_read(dev, cfg_entry, field_start,
++					      &tmp_val);
++			if (err)
++				goto out;
++
++			value = merge_value(value, tmp_val,
++					    get_mask(field->size),
++					    field_start - req_start);
++		}
++	}
++
++out:
++	if (unlikely(verbose_request))
++		printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x = %x\n",
++		       pci_name(dev), size, offset, value);
++
++	*ret_val = value;
++	return pcibios_err_to_errno(err);
++}
++
++int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value)
++{
++	int err = 0, handled = 0;
++	struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++	const struct config_field_entry *cfg_entry;
++	const struct config_field *field;
++	u32 tmp_val;
++	int req_start, req_end, field_start, field_end;
++
++	if (unlikely(verbose_request))
++		printk(KERN_DEBUG
++		       "pciback: %s: write request %d bytes at 0x%x = %x\n",
++		       pci_name(dev), size, offset, value);
++
++	if (!valid_request(offset, size))
++		return XEN_PCI_ERR_invalid_offset;
++
++	list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++		field = cfg_entry->field;
++
++		req_start = offset;
++		req_end = offset + size;
++		field_start = OFFSET(cfg_entry);
++		field_end = OFFSET(cfg_entry) + field->size;
++
++		if ((req_start >= field_start && req_start < field_end)
++		    || (req_end > field_start && req_end <= field_end)) {
++			tmp_val = 0;
++
++			err = pciback_config_read(dev, field_start,
++						  field->size, &tmp_val);
++			if (err)
++				break;
++
++			tmp_val = merge_value(tmp_val, value, get_mask(size),
++					      req_start - field_start);
++
++			err = conf_space_write(dev, cfg_entry, field_start,
++					       tmp_val);
++
++			/* handled is set true here, but not every byte
++			 * may have been written! Properly detecting if
++			 * every byte is handled is unnecessary as the
++			 * flag is used to detect devices that need
++			 * special helpers to work correctly.
++			 */
++			handled = 1;
++		}
++	}
++
++	if (!handled && !err) {
++		/* By default, anything not specifically handled above is
++		 * read-only. The permissive flag changes this behavior so
++		 * that anything not specifically handled above is writable.
++		 * This means that some fields may still be read-only because
++		 * they have entries in the config_field list that intercept
++		 * the write and do nothing.
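++		 *
++		 * For example: with the permissive flag clear, a guest
++		 * write to an unintercepted vendor-specific register
++		 * (say at offset 0x40 - an illustrative offset) is simply
++		 * dropped and only triggers the one-time dev_warn() below.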
*/ ++ if (dev_data->permissive || permissive) { ++ switch (size) { ++ case 1: ++ err = pci_write_config_byte(dev, offset, ++ (u8) value); ++ break; ++ case 2: ++ err = pci_write_config_word(dev, offset, ++ (u16) value); ++ break; ++ case 4: ++ err = pci_write_config_dword(dev, offset, ++ (u32) value); ++ break; ++ } ++ } else if (!dev_data->warned_on_write) { ++ dev_data->warned_on_write = 1; ++ dev_warn(&dev->dev, "Driver tried to write to a " ++ "read-only configuration space field at offset" ++ " 0x%x, size %d. This may be harmless, but if " ++ "you have problems with your device:\n" ++ "1) see permissive attribute in sysfs\n" ++ "2) report problems to the xen-devel " ++ "mailing list along with details of your " ++ "device obtained from lspci.\n", offset, size); ++ } ++ } ++ ++ return pcibios_err_to_errno(err); ++} ++ ++void pciback_config_free_dyn_fields(struct pci_dev *dev) ++{ ++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev); ++ struct config_field_entry *cfg_entry, *t; ++ const struct config_field *field; ++ ++ dev_dbg(&dev->dev, "free-ing dynamically allocated virtual " ++ "configuration space fields\n"); ++ if (!dev_data) ++ return; ++ ++ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) { ++ field = cfg_entry->field; ++ ++ if (field->clean) { ++ field->clean((struct config_field *)field); ++ ++ kfree(cfg_entry->data); ++ ++ list_del(&cfg_entry->list); ++ kfree(cfg_entry); ++ } ++ ++ } ++} ++ ++void pciback_config_reset_dev(struct pci_dev *dev) ++{ ++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev); ++ const struct config_field_entry *cfg_entry; ++ const struct config_field *field; ++ ++ dev_dbg(&dev->dev, "resetting virtual configuration space\n"); ++ if (!dev_data) ++ return; ++ ++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { ++ field = cfg_entry->field; ++ ++ if (field->reset) ++ field->reset(dev, OFFSET(cfg_entry), cfg_entry->data); ++ } ++} ++ ++void pciback_config_free_dev(struct pci_dev *dev) ++{ ++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev); ++ struct config_field_entry *cfg_entry, *t; ++ const struct config_field *field; ++ ++ dev_dbg(&dev->dev, "free-ing virtual configuration space fields\n"); ++ if (!dev_data) ++ return; ++ ++ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) { ++ list_del(&cfg_entry->list); ++ ++ field = cfg_entry->field; ++ ++ if (field->release) ++ field->release(dev, OFFSET(cfg_entry), cfg_entry->data); ++ ++ kfree(cfg_entry); ++ } ++} ++ ++int pciback_config_add_field_offset(struct pci_dev *dev, ++ const struct config_field *field, ++ unsigned int base_offset) ++{ ++ int err = 0; ++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev); ++ struct config_field_entry *cfg_entry; ++ void *tmp; ++ ++ cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL); ++ if (!cfg_entry) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ cfg_entry->data = NULL; ++ cfg_entry->field = field; ++ cfg_entry->base_offset = base_offset; ++ ++ /* silently ignore duplicate fields */ ++ err = pciback_field_is_dup(dev, OFFSET(cfg_entry)); ++ if (err) ++ goto out; ++ ++ if (field->init) { ++ tmp = field->init(dev, OFFSET(cfg_entry)); ++ ++ if (IS_ERR(tmp)) { ++ err = PTR_ERR(tmp); ++ goto out; ++ } ++ ++ cfg_entry->data = tmp; ++ } ++ ++ dev_dbg(&dev->dev, "added config field at offset 0x%02x\n", ++ OFFSET(cfg_entry)); ++ list_add_tail(&cfg_entry->list, &dev_data->config_fields); ++ ++out: ++ if (err) ++ kfree(cfg_entry); ++ ++ return err; ++} ++ ++/* This sets up the device's virtual 
configuration space to keep track of
++ * certain registers (like the base address registers (BARs)) so that we can
++ * keep the client from manipulating them directly.
++ */
++int pciback_config_init_dev(struct pci_dev *dev)
++{
++	int err = 0;
++	struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++
++	dev_dbg(&dev->dev, "initializing virtual configuration space\n");
++
++	INIT_LIST_HEAD(&dev_data->config_fields);
++
++	err = pciback_config_header_add_fields(dev);
++	if (err)
++		goto out;
++
++	err = pciback_config_capability_add_fields(dev);
++	if (err)
++		goto out;
++
++	err = pciback_config_quirks_init(dev);
++
++out:
++	return err;
++}
++
++int pciback_config_init(void)
++{
++	return pciback_config_capability_init();
++}
+diff --git a/drivers/xen/pciback/conf_space.h b/drivers/xen/pciback/conf_space.h
+new file mode 100644
+index 0000000..50ebef2
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space.h
+@@ -0,0 +1,126 @@
++/*
++ * PCI Backend - Common data structures for overriding the configuration space
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#ifndef __XEN_PCIBACK_CONF_SPACE_H__
++#define __XEN_PCIBACK_CONF_SPACE_H__
++
++#include <linux/list.h>
++#include <linux/err.h>
++
++/* conf_field_init can return an errno in a ptr with ERR_PTR() */
++typedef void *(*conf_field_init) (struct pci_dev *dev, int offset);
++typedef void (*conf_field_reset) (struct pci_dev *dev, int offset, void *data);
++typedef void (*conf_field_free) (struct pci_dev *dev, int offset, void *data);
++
++typedef int (*conf_dword_write) (struct pci_dev *dev, int offset, u32 value,
++				 void *data);
++typedef int (*conf_word_write) (struct pci_dev *dev, int offset, u16 value,
++				void *data);
++typedef int (*conf_byte_write) (struct pci_dev *dev, int offset, u8 value,
++				void *data);
++typedef int (*conf_dword_read) (struct pci_dev *dev, int offset, u32 *value,
++				void *data);
++typedef int (*conf_word_read) (struct pci_dev *dev, int offset, u16 *value,
++			       void *data);
++typedef int (*conf_byte_read) (struct pci_dev *dev, int offset, u8 *value,
++			       void *data);
++
++/* These are the fields within the configuration space which we
++ * are interested in intercepting reads/writes to and changing their
++ * values.
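++ *
++ * (For instance, conf_space_header.c below describes PCI_COMMAND as a
++ * 2-byte field whose u.w.read/u.w.write handlers are command_read() and
++ * command_write().)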
++ */ ++struct config_field { ++ unsigned int offset; ++ unsigned int size; ++ unsigned int mask; ++ conf_field_init init; ++ conf_field_reset reset; ++ conf_field_free release; ++ void (*clean) (struct config_field *field); ++ union { ++ struct { ++ conf_dword_write write; ++ conf_dword_read read; ++ } dw; ++ struct { ++ conf_word_write write; ++ conf_word_read read; ++ } w; ++ struct { ++ conf_byte_write write; ++ conf_byte_read read; ++ } b; ++ } u; ++ struct list_head list; ++}; ++ ++struct config_field_entry { ++ struct list_head list; ++ const struct config_field *field; ++ unsigned int base_offset; ++ void *data; ++}; ++ ++#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset) ++ ++/* Add fields to a device - the add_fields macro expects to get a pointer to ++ * the first entry in an array (of which the ending is marked by size==0) ++ */ ++int pciback_config_add_field_offset(struct pci_dev *dev, ++ const struct config_field *field, ++ unsigned int offset); ++ ++static inline int pciback_config_add_field(struct pci_dev *dev, ++ const struct config_field *field) ++{ ++ return pciback_config_add_field_offset(dev, field, 0); ++} ++ ++static inline int pciback_config_add_fields(struct pci_dev *dev, ++ const struct config_field *field) ++{ ++ int i, err = 0; ++ for (i = 0; field[i].size != 0; i++) { ++ err = pciback_config_add_field(dev, &field[i]); ++ if (err) ++ break; ++ } ++ return err; ++} ++ ++static inline int pciback_config_add_fields_offset(struct pci_dev *dev, ++ const struct config_field *field, ++ unsigned int offset) ++{ ++ int i, err = 0; ++ for (i = 0; field[i].size != 0; i++) { ++ err = pciback_config_add_field_offset(dev, &field[i], offset); ++ if (err) ++ break; ++ } ++ return err; ++} ++ ++/* Read/Write the real configuration space */ ++int pciback_read_config_byte(struct pci_dev *dev, int offset, u8 *value, ++ void *data); ++int pciback_read_config_word(struct pci_dev *dev, int offset, u16 *value, ++ void *data); ++int pciback_read_config_dword(struct pci_dev *dev, int offset, u32 *value, ++ void *data); ++int pciback_write_config_byte(struct pci_dev *dev, int offset, u8 value, ++ void *data); ++int pciback_write_config_word(struct pci_dev *dev, int offset, u16 value, ++ void *data); ++int pciback_write_config_dword(struct pci_dev *dev, int offset, u32 value, ++ void *data); ++ ++int pciback_config_capability_init(void); ++ ++int pciback_config_header_add_fields(struct pci_dev *dev); ++int pciback_config_capability_add_fields(struct pci_dev *dev); ++ ++#endif /* __XEN_PCIBACK_CONF_SPACE_H__ */ +diff --git a/drivers/xen/pciback/conf_space_capability.c b/drivers/xen/pciback/conf_space_capability.c +new file mode 100644 +index 0000000..0ea84d6 +--- /dev/null ++++ b/drivers/xen/pciback/conf_space_capability.c +@@ -0,0 +1,66 @@ ++/* ++ * PCI Backend - Handles the virtual fields found on the capability lists ++ * in the configuration space. 
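++ * Each capability pciback knows about (currently power management and
++ * VPD, registered from pciback_config_capability_init() below) has its
++ * fields overlaid at whatever offset pci_find_capability() reports for
++ * the device.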
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#include <linux/kernel.h>
++#include <linux/pci.h>
++#include "pciback.h"
++#include "conf_space.h"
++#include "conf_space_capability.h"
++
++static LIST_HEAD(capabilities);
++
++static const struct config_field caplist_header[] = {
++	{
++		.offset    = PCI_CAP_LIST_ID,
++		.size      = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */
++		.u.w.read  = pciback_read_config_word,
++		.u.w.write = NULL,
++	},
++	{}
++};
++
++static inline void register_capability(struct pciback_config_capability *cap)
++{
++	list_add_tail(&cap->cap_list, &capabilities);
++}
++
++int pciback_config_capability_add_fields(struct pci_dev *dev)
++{
++	int err = 0;
++	struct pciback_config_capability *cap;
++	int cap_offset;
++
++	list_for_each_entry(cap, &capabilities, cap_list) {
++		cap_offset = pci_find_capability(dev, cap->capability);
++		if (cap_offset) {
++			dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n",
++				cap->capability, cap_offset);
++
++			err = pciback_config_add_fields_offset(dev,
++							       caplist_header,
++							       cap_offset);
++			if (err)
++				goto out;
++			err = pciback_config_add_fields_offset(dev,
++							       cap->fields,
++							       cap_offset);
++			if (err)
++				goto out;
++		}
++	}
++
++out:
++	return err;
++}
++
++int pciback_config_capability_init(void)
++{
++	register_capability(&pciback_config_capability_vpd);
++	register_capability(&pciback_config_capability_pm);
++
++	return 0;
++}
+diff --git a/drivers/xen/pciback/conf_space_capability.h b/drivers/xen/pciback/conf_space_capability.h
+new file mode 100644
+index 0000000..8da3ac4
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_capability.h
+@@ -0,0 +1,26 @@
++/*
++ * PCI Backend - Data structures for special overlays for structures on
++ * the capability list.
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#ifndef __PCIBACK_CONFIG_CAPABILITY_H__
++#define __PCIBACK_CONFIG_CAPABILITY_H__
++
++#include <linux/pci.h>
++#include <linux/list.h>
++
++struct pciback_config_capability {
++	struct list_head cap_list;
++
++	int capability;
++
++	/* If the device has the capability found above, add these fields */
++	const struct config_field *fields;
++};
++
++extern struct pciback_config_capability pciback_config_capability_vpd;
++extern struct pciback_config_capability pciback_config_capability_pm;
++
++#endif
+diff --git a/drivers/xen/pciback/conf_space_capability_msi.c b/drivers/xen/pciback/conf_space_capability_msi.c
+new file mode 100644
+index 0000000..b15131e
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_capability_msi.c
+@@ -0,0 +1,110 @@
++/*
++ * PCI Backend -- Configuration overlay for MSI capability
++ */
++#include <linux/pci.h>
++#include <linux/slab.h>
++#include "conf_space.h"
++#include "conf_space_capability.h"
++#include <xen/interface/io/pciif.h>
++#include <xen/events.h>
++#include "pciback.h"
++
++int pciback_enable_msi(struct pciback_device *pdev,
++		struct pci_dev *dev, struct xen_pci_op *op)
++{
++	struct pciback_dev_data *dev_data;
++	int otherend = pdev->xdev->otherend_id;
++	int status;
++
++	if (unlikely(verbose_request))
++		printk(KERN_DEBUG "pciback: %s: enable MSI\n", pci_name(dev));
++
++	status = pci_enable_msi(dev);
++
++	if (status) {
++		printk(KERN_ERR "error enable msi for guest %x status %x\n",
++			otherend, status);
++		op->value = 0;
++		return XEN_PCI_ERR_op_failed;
++	}
++
++	/* The value the guest needs is actually the IDT vector, not
++	 * the local domain's IRQ number.
*/ ++ op->value = xen_gsi_from_irq(dev->irq); ++ dev_data = pci_get_drvdata(dev); ++ if (dev_data) ++ dev_data->ack_intr = 0; ++ ++ return 0; ++} ++ ++int pciback_disable_msi(struct pciback_device *pdev, ++ struct pci_dev *dev, struct xen_pci_op *op) ++{ ++ struct pciback_dev_data *dev_data; ++ ++ if (unlikely(verbose_request)) ++ printk(KERN_DEBUG "pciback: %s: disable MSI\n", pci_name(dev)); ++ pci_disable_msi(dev); ++ ++ op->value = xen_gsi_from_irq(dev->irq); ++ dev_data = pci_get_drvdata(dev); ++ if (dev_data) ++ dev_data->ack_intr = 1; ++ return 0; ++} ++ ++int pciback_enable_msix(struct pciback_device *pdev, ++ struct pci_dev *dev, struct xen_pci_op *op) ++{ ++ struct pciback_dev_data *dev_data; ++ int i, result; ++ struct msix_entry *entries; ++ ++ if (unlikely(verbose_request)) ++ printk(KERN_DEBUG "pciback: %s: enable MSI-X\n", pci_name(dev)); ++ if (op->value > SH_INFO_MAX_VEC) ++ return -EINVAL; ++ ++ entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL); ++ if (entries == NULL) ++ return -ENOMEM; ++ ++ for (i = 0; i < op->value; i++) { ++ entries[i].entry = op->msix_entries[i].entry; ++ entries[i].vector = op->msix_entries[i].vector; ++ } ++ ++ result = pci_enable_msix(dev, entries, op->value); ++ ++ for (i = 0; i < op->value; i++) { ++ op->msix_entries[i].entry = entries[i].entry; ++ op->msix_entries[i].vector = ++ xen_gsi_from_irq(entries[i].vector); ++ } ++ ++ kfree(entries); ++ ++ op->value = result; ++ dev_data = pci_get_drvdata(dev); ++ if (dev_data) ++ dev_data->ack_intr = 0; ++ ++ return result; ++} ++ ++int pciback_disable_msix(struct pciback_device *pdev, ++ struct pci_dev *dev, struct xen_pci_op *op) ++{ ++ struct pciback_dev_data *dev_data; ++ if (unlikely(verbose_request)) ++ printk(KERN_DEBUG "pciback: %s: disable MSI-X\n", pci_name(dev)); ++ pci_disable_msix(dev); ++ ++ op->value = xen_gsi_from_irq(dev->irq); ++ dev_data = pci_get_drvdata(dev); ++ if (dev_data) ++ dev_data->ack_intr = 1; ++ return 0; ++} ++ +diff --git a/drivers/xen/pciback/conf_space_capability_pm.c b/drivers/xen/pciback/conf_space_capability_pm.c +new file mode 100644 +index 0000000..0442616 +--- /dev/null ++++ b/drivers/xen/pciback/conf_space_capability_pm.c +@@ -0,0 +1,113 @@ ++/* ++ * PCI Backend - Configuration space overlay for power management ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++ ++#include <linux/pci.h> ++#include "conf_space.h" ++#include "conf_space_capability.h" ++ ++static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value, ++ void *data) ++{ ++ int err; ++ u16 real_value; ++ ++ err = pci_read_config_word(dev, offset, &real_value); ++ if (err) ++ goto out; ++ ++ *value = real_value & ~PCI_PM_CAP_PME_MASK; ++ ++out: ++ return err; ++} ++ ++/* PM_OK_BITS specifies the bits that the driver domain is allowed to change. 
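++ * That is PME status (which is write-one-to-clear) plus the data select
++ * bits; pm_ctrl_write() below folds every other bit back to the value
++ * currently in hardware.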
++ * Can't allow driver domain to enable PMEs - they're shared */ ++#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK) ++ ++static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value, ++ void *data) ++{ ++ int err; ++ u16 old_value; ++ pci_power_t new_state, old_state; ++ ++ err = pci_read_config_word(dev, offset, &old_value); ++ if (err) ++ goto out; ++ ++ old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK); ++ new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK); ++ ++ new_value &= PM_OK_BITS; ++ if ((old_value & PM_OK_BITS) != new_value) { ++ new_value = (old_value & ~PM_OK_BITS) | new_value; ++ err = pci_write_config_word(dev, offset, new_value); ++ if (err) ++ goto out; ++ } ++ ++ /* Let pci core handle the power management change */ ++ dev_dbg(&dev->dev, "set power state to %x\n", new_state); ++ err = pci_set_power_state(dev, new_state); ++ if (err) { ++ err = PCIBIOS_SET_FAILED; ++ goto out; ++ } ++ ++ out: ++ return err; ++} ++ ++/* Ensure PMEs are disabled */ ++static void *pm_ctrl_init(struct pci_dev *dev, int offset) ++{ ++ int err; ++ u16 value; ++ ++ err = pci_read_config_word(dev, offset, &value); ++ if (err) ++ goto out; ++ ++ if (value & PCI_PM_CTRL_PME_ENABLE) { ++ value &= ~PCI_PM_CTRL_PME_ENABLE; ++ err = pci_write_config_word(dev, offset, value); ++ } ++ ++out: ++ return ERR_PTR(err); ++} ++ ++static const struct config_field caplist_pm[] = { ++ { ++ .offset = PCI_PM_PMC, ++ .size = 2, ++ .u.w.read = pm_caps_read, ++ }, ++ { ++ .offset = PCI_PM_CTRL, ++ .size = 2, ++ .init = pm_ctrl_init, ++ .u.w.read = pciback_read_config_word, ++ .u.w.write = pm_ctrl_write, ++ }, ++ { ++ .offset = PCI_PM_PPB_EXTENSIONS, ++ .size = 1, ++ .u.b.read = pciback_read_config_byte, ++ }, ++ { ++ .offset = PCI_PM_DATA_REGISTER, ++ .size = 1, ++ .u.b.read = pciback_read_config_byte, ++ }, ++ {} ++}; ++ ++struct pciback_config_capability pciback_config_capability_pm = { ++ .capability = PCI_CAP_ID_PM, ++ .fields = caplist_pm, ++}; +diff --git a/drivers/xen/pciback/conf_space_capability_vpd.c b/drivers/xen/pciback/conf_space_capability_vpd.c +new file mode 100644 +index 0000000..e7b4d66 +--- /dev/null ++++ b/drivers/xen/pciback/conf_space_capability_vpd.c +@@ -0,0 +1,40 @@ ++/* ++ * PCI Backend - Configuration space overlay for Vital Product Data ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++ ++#include <linux/pci.h> ++#include "conf_space.h" ++#include "conf_space_capability.h" ++ ++static int vpd_address_write(struct pci_dev *dev, int offset, u16 value, ++ void *data) ++{ ++ /* Disallow writes to the vital product data */ ++ if (value & PCI_VPD_ADDR_F) ++ return PCIBIOS_SET_FAILED; ++ else ++ return pci_write_config_word(dev, offset, value); ++} ++ ++static const struct config_field caplist_vpd[] = { ++ { ++ .offset = PCI_VPD_ADDR, ++ .size = 2, ++ .u.w.read = pciback_read_config_word, ++ .u.w.write = vpd_address_write, ++ }, ++ { ++ .offset = PCI_VPD_DATA, ++ .size = 4, ++ .u.dw.read = pciback_read_config_dword, ++ .u.dw.write = NULL, ++ }, ++ {} ++}; ++ ++struct pciback_config_capability pciback_config_capability_vpd = { ++ .capability = PCI_CAP_ID_VPD, ++ .fields = caplist_vpd, ++}; +diff --git a/drivers/xen/pciback/conf_space_header.c b/drivers/xen/pciback/conf_space_header.c +new file mode 100644 +index 0000000..cb450f4 +--- /dev/null ++++ b/drivers/xen/pciback/conf_space_header.c +@@ -0,0 +1,385 @@ ++/* ++ * PCI Backend - Handles the virtual fields in the configuration space headers. 
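++ * That covers the type 0/1 header registers: vendor/device ID, the
++ * command register (command_write() below gates enable and bus-master
++ * transitions), the BARs and the expansion ROM BAR.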
++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/pci.h> ++#include "pciback.h" ++#include "conf_space.h" ++ ++struct pci_bar_info { ++ u32 val; ++ u32 len_val; ++ int which; ++}; ++ ++#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO)) ++#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER) ++ ++static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data) ++{ ++ int i; ++ int ret; ++ ++ ret = pciback_read_config_word(dev, offset, value, data); ++ if (!atomic_read(&dev->enable_cnt)) ++ return ret; ++ ++ for (i = 0; i < PCI_ROM_RESOURCE; i++) { ++ if (dev->resource[i].flags & IORESOURCE_IO) ++ *value |= PCI_COMMAND_IO; ++ if (dev->resource[i].flags & IORESOURCE_MEM) ++ *value |= PCI_COMMAND_MEMORY; ++ } ++ ++ return ret; ++} ++ ++static int command_write(struct pci_dev *dev, int offset, u16 value, void *data) ++{ ++ struct pciback_dev_data *dev_data; ++ int err; ++ ++ dev_data = pci_get_drvdata(dev); ++ if (!pci_is_enabled(dev) && is_enable_cmd(value)) { ++ if (unlikely(verbose_request)) ++ printk(KERN_DEBUG "pciback: %s: enable\n", ++ pci_name(dev)); ++ err = pci_enable_device(dev); ++ if (err) ++ return err; ++ if (dev_data) ++ dev_data->enable_intx = 1; ++ } else if (pci_is_enabled(dev) && !is_enable_cmd(value)) { ++ if (unlikely(verbose_request)) ++ printk(KERN_DEBUG "pciback: %s: disable\n", ++ pci_name(dev)); ++ pci_disable_device(dev); ++ if (dev_data) ++ dev_data->enable_intx = 0; ++ } ++ ++ if (!dev->is_busmaster && is_master_cmd(value)) { ++ if (unlikely(verbose_request)) ++ printk(KERN_DEBUG "pciback: %s: set bus master\n", ++ pci_name(dev)); ++ pci_set_master(dev); ++ } ++ ++ if (value & PCI_COMMAND_INVALIDATE) { ++ if (unlikely(verbose_request)) ++ printk(KERN_DEBUG ++ "pciback: %s: enable memory-write-invalidate\n", ++ pci_name(dev)); ++ err = pci_set_mwi(dev); ++ if (err) { ++ printk(KERN_WARNING ++ "pciback: %s: cannot enable " ++ "memory-write-invalidate (%d)\n", ++ pci_name(dev), err); ++ value &= ~PCI_COMMAND_INVALIDATE; ++ } ++ } ++ ++ return pci_write_config_word(dev, offset, value); ++} ++ ++static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data) ++{ ++ struct pci_bar_info *bar = data; ++ ++ if (unlikely(!bar)) { ++ printk(KERN_WARNING "pciback: driver data not found for %s\n", ++ pci_name(dev)); ++ return XEN_PCI_ERR_op_failed; ++ } ++ ++ /* A write to obtain the length must happen as a 32-bit write. ++ * This does not (yet) support writing individual bytes ++ */ ++ if (value == ~PCI_ROM_ADDRESS_ENABLE) ++ bar->which = 1; ++ else { ++ u32 tmpval; ++ pci_read_config_dword(dev, offset, &tmpval); ++ if (tmpval != bar->val && value == bar->val) { ++ /* Allow restoration of bar value. */ ++ pci_write_config_dword(dev, offset, bar->val); ++ } ++ bar->which = 0; ++ } ++ ++ /* Do we need to support enabling/disabling the rom address here? */ ++ ++ return 0; ++} ++ ++/* For the BARs, only allow writes which write ~0 or ++ * the correct resource information ++ * (Needed for when the driver probes the resource usage) ++ */ ++static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) ++{ ++ struct pci_bar_info *bar = data; ++ ++ if (unlikely(!bar)) { ++ printk(KERN_WARNING "pciback: driver data not found for %s\n", ++ pci_name(dev)); ++ return XEN_PCI_ERR_op_failed; ++ } ++ ++ /* A write to obtain the length must happen as a 32-bit write. 
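++	 * (Drivers size a BAR by writing all ones and reading it back;
++	 * bar_read() then reports len_val instead of val until the BAR
++	 * is written again.)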
++ * This does not (yet) support writing individual bytes ++ */ ++ if (value == ~0) ++ bar->which = 1; ++ else { ++ u32 tmpval; ++ pci_read_config_dword(dev, offset, &tmpval); ++ if (tmpval != bar->val && value == bar->val) { ++ /* Allow restoration of bar value. */ ++ pci_write_config_dword(dev, offset, bar->val); ++ } ++ bar->which = 0; ++ } ++ ++ return 0; ++} ++ ++static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data) ++{ ++ struct pci_bar_info *bar = data; ++ ++ if (unlikely(!bar)) { ++ printk(KERN_WARNING "pciback: driver data not found for %s\n", ++ pci_name(dev)); ++ return XEN_PCI_ERR_op_failed; ++ } ++ ++ *value = bar->which ? bar->len_val : bar->val; ++ ++ return 0; ++} ++ ++static inline void read_dev_bar(struct pci_dev *dev, ++ struct pci_bar_info *bar_info, int offset, ++ u32 len_mask) ++{ ++ int pos; ++ struct resource *res = dev->resource; ++ ++ if (offset == PCI_ROM_ADDRESS || offset == PCI_ROM_ADDRESS1) ++ pos = PCI_ROM_RESOURCE; ++ else { ++ pos = (offset - PCI_BASE_ADDRESS_0) / 4; ++ if (pos && ((res[pos - 1].flags & (PCI_BASE_ADDRESS_SPACE | ++ PCI_BASE_ADDRESS_MEM_TYPE_MASK)) == ++ (PCI_BASE_ADDRESS_SPACE_MEMORY | ++ PCI_BASE_ADDRESS_MEM_TYPE_64))) { ++ bar_info->val = res[pos - 1].start >> 32; ++ bar_info->len_val = res[pos - 1].end >> 32; ++ return; ++ } ++ } ++ ++ bar_info->val = res[pos].start | ++ (res[pos].flags & PCI_REGION_FLAG_MASK); ++ bar_info->len_val = res[pos].end - res[pos].start + 1; ++} ++ ++static void *bar_init(struct pci_dev *dev, int offset) ++{ ++ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL); ++ ++ if (!bar) ++ return ERR_PTR(-ENOMEM); ++ ++ read_dev_bar(dev, bar, offset, ~0); ++ bar->which = 0; ++ ++ return bar; ++} ++ ++static void *rom_init(struct pci_dev *dev, int offset) ++{ ++ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL); ++ ++ if (!bar) ++ return ERR_PTR(-ENOMEM); ++ ++ read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE); ++ bar->which = 0; ++ ++ return bar; ++} ++ ++static void bar_reset(struct pci_dev *dev, int offset, void *data) ++{ ++ struct pci_bar_info *bar = data; ++ ++ bar->which = 0; ++} ++ ++static void bar_release(struct pci_dev *dev, int offset, void *data) ++{ ++ kfree(data); ++} ++ ++static int pciback_read_vendor(struct pci_dev *dev, int offset, ++ u16 *value, void *data) ++{ ++ *value = dev->vendor; ++ ++ return 0; ++} ++ ++static int pciback_read_device(struct pci_dev *dev, int offset, ++ u16 *value, void *data) ++{ ++ *value = dev->device; ++ ++ return 0; ++} ++ ++static int interrupt_read(struct pci_dev *dev, int offset, u8 * value, ++ void *data) ++{ ++ *value = (u8) dev->irq; ++ ++ return 0; ++} ++ ++static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data) ++{ ++ u8 cur_value; ++ int err; ++ ++ err = pci_read_config_byte(dev, offset, &cur_value); ++ if (err) ++ goto out; ++ ++ if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START) ++ || value == PCI_BIST_START) ++ err = pci_write_config_byte(dev, offset, value); ++ ++out: ++ return err; ++} ++ ++static const struct config_field header_common[] = { ++ { ++ .offset = PCI_VENDOR_ID, ++ .size = 2, ++ .u.w.read = pciback_read_vendor, ++ }, ++ { ++ .offset = PCI_DEVICE_ID, ++ .size = 2, ++ .u.w.read = pciback_read_device, ++ }, ++ { ++ .offset = PCI_COMMAND, ++ .size = 2, ++ .u.w.read = command_read, ++ .u.w.write = command_write, ++ }, ++ { ++ .offset = PCI_INTERRUPT_LINE, ++ .size = 1, ++ .u.b.read = interrupt_read, ++ }, ++ { ++ .offset = PCI_INTERRUPT_PIN, ++ .size = 1, ++ .u.b.read = 
pciback_read_config_byte, ++ }, ++ { ++ /* Any side effects of letting driver domain control cache line? */ ++ .offset = PCI_CACHE_LINE_SIZE, ++ .size = 1, ++ .u.b.read = pciback_read_config_byte, ++ .u.b.write = pciback_write_config_byte, ++ }, ++ { ++ .offset = PCI_LATENCY_TIMER, ++ .size = 1, ++ .u.b.read = pciback_read_config_byte, ++ }, ++ { ++ .offset = PCI_BIST, ++ .size = 1, ++ .u.b.read = pciback_read_config_byte, ++ .u.b.write = bist_write, ++ }, ++ {} ++}; ++ ++#define CFG_FIELD_BAR(reg_offset) \ ++ { \ ++ .offset = reg_offset, \ ++ .size = 4, \ ++ .init = bar_init, \ ++ .reset = bar_reset, \ ++ .release = bar_release, \ ++ .u.dw.read = bar_read, \ ++ .u.dw.write = bar_write, \ ++ } ++ ++#define CFG_FIELD_ROM(reg_offset) \ ++ { \ ++ .offset = reg_offset, \ ++ .size = 4, \ ++ .init = rom_init, \ ++ .reset = bar_reset, \ ++ .release = bar_release, \ ++ .u.dw.read = bar_read, \ ++ .u.dw.write = rom_write, \ ++ } ++ ++static const struct config_field header_0[] = { ++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0), ++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1), ++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_2), ++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_3), ++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_4), ++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_5), ++ CFG_FIELD_ROM(PCI_ROM_ADDRESS), ++ {} ++}; ++ ++static const struct config_field header_1[] = { ++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0), ++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1), ++ CFG_FIELD_ROM(PCI_ROM_ADDRESS1), ++ {} ++}; ++ ++int pciback_config_header_add_fields(struct pci_dev *dev) ++{ ++ int err; ++ ++ err = pciback_config_add_fields(dev, header_common); ++ if (err) ++ goto out; ++ ++ switch (dev->hdr_type) { ++ case PCI_HEADER_TYPE_NORMAL: ++ err = pciback_config_add_fields(dev, header_0); ++ break; ++ ++ case PCI_HEADER_TYPE_BRIDGE: ++ err = pciback_config_add_fields(dev, header_1); ++ break; ++ ++ default: ++ err = -EINVAL; ++ printk(KERN_ERR "pciback: %s: Unsupported header type %d!\n", ++ pci_name(dev), dev->hdr_type); ++ break; ++ } ++ ++out: ++ return err; ++} +diff --git a/drivers/xen/pciback/conf_space_quirks.c b/drivers/xen/pciback/conf_space_quirks.c +new file mode 100644 +index 0000000..45c31fb +--- /dev/null ++++ b/drivers/xen/pciback/conf_space_quirks.c +@@ -0,0 +1,140 @@ ++/* ++ * PCI Backend - Handle special overlays for broken devices. 
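++ * A quirk is keyed on a pci_device_id match; fields added for such a
++ * device through pciback_config_quirks_add_field() get the plain
++ * pciback_{read,write}_config_* handlers for their size, so the field
++ * is passed through rather than emulated.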
++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ * Author: Chris Bookholt <hap10@epoch.ncsc.mil> ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/pci.h> ++#include "pciback.h" ++#include "conf_space.h" ++#include "conf_space_quirks.h" ++ ++LIST_HEAD(pciback_quirks); ++ ++static inline const struct pci_device_id * ++match_one_device(const struct pci_device_id *id, const struct pci_dev *dev) ++{ ++ if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) && ++ (id->device == PCI_ANY_ID || id->device == dev->device) && ++ (id->subvendor == PCI_ANY_ID || ++ id->subvendor == dev->subsystem_vendor) && ++ (id->subdevice == PCI_ANY_ID || ++ id->subdevice == dev->subsystem_device) && ++ !((id->class ^ dev->class) & id->class_mask)) ++ return id; ++ return NULL; ++} ++ ++struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev) ++{ ++ struct pciback_config_quirk *tmp_quirk; ++ ++ list_for_each_entry(tmp_quirk, &pciback_quirks, quirks_list) ++ if (match_one_device(&tmp_quirk->devid, dev) != NULL) ++ goto out; ++ tmp_quirk = NULL; ++ printk(KERN_DEBUG ++ "quirk didn't match any device pciback knows about\n"); ++out: ++ return tmp_quirk; ++} ++ ++static inline void register_quirk(struct pciback_config_quirk *quirk) ++{ ++ list_add_tail(&quirk->quirks_list, &pciback_quirks); ++} ++ ++int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg) ++{ ++ int ret = 0; ++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev); ++ struct config_field_entry *cfg_entry; ++ ++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { ++ if (OFFSET(cfg_entry) == reg) { ++ ret = 1; ++ break; ++ } ++ } ++ return ret; ++} ++ ++int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field ++ *field) ++{ ++ int err = 0; ++ ++ switch (field->size) { ++ case 1: ++ field->u.b.read = pciback_read_config_byte; ++ field->u.b.write = pciback_write_config_byte; ++ break; ++ case 2: ++ field->u.w.read = pciback_read_config_word; ++ field->u.w.write = pciback_write_config_word; ++ break; ++ case 4: ++ field->u.dw.read = pciback_read_config_dword; ++ field->u.dw.write = pciback_write_config_dword; ++ break; ++ default: ++ err = -EINVAL; ++ goto out; ++ } ++ ++ pciback_config_add_field(dev, field); ++ ++out: ++ return err; ++} ++ ++int pciback_config_quirks_init(struct pci_dev *dev) ++{ ++ struct pciback_config_quirk *quirk; ++ int ret = 0; ++ ++ quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC); ++ if (!quirk) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ quirk->devid.vendor = dev->vendor; ++ quirk->devid.device = dev->device; ++ quirk->devid.subvendor = dev->subsystem_vendor; ++ quirk->devid.subdevice = dev->subsystem_device; ++ quirk->devid.class = 0; ++ quirk->devid.class_mask = 0; ++ quirk->devid.driver_data = 0UL; ++ ++ quirk->pdev = dev; ++ ++ register_quirk(quirk); ++out: ++ return ret; ++} ++ ++void pciback_config_field_free(struct config_field *field) ++{ ++ kfree(field); ++} ++ ++int pciback_config_quirk_release(struct pci_dev *dev) ++{ ++ struct pciback_config_quirk *quirk; ++ int ret = 0; ++ ++ quirk = pciback_find_quirk(dev); ++ if (!quirk) { ++ ret = -ENXIO; ++ goto out; ++ } ++ ++ list_del(&quirk->quirks_list); ++ kfree(quirk); ++ ++out: ++ return ret; ++} +diff --git a/drivers/xen/pciback/conf_space_quirks.h b/drivers/xen/pciback/conf_space_quirks.h +new file mode 100644 +index 0000000..acd0e1a +--- /dev/null ++++ b/drivers/xen/pciback/conf_space_quirks.h +@@ -0,0 +1,35 @@ ++/* ++ * PCI Backend - Data structures for special overlays for broken devices. 
++ * ++ * Ryan Wilson <hap9@epoch.ncsc.mil> ++ * Chris Bookholt <hap10@epoch.ncsc.mil> ++ */ ++ ++#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__ ++#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__ ++ ++#include <linux/pci.h> ++#include <linux/list.h> ++ ++struct pciback_config_quirk { ++ struct list_head quirks_list; ++ struct pci_device_id devid; ++ struct pci_dev *pdev; ++}; ++ ++struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev); ++ ++int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field ++ *field); ++ ++int pciback_config_quirks_remove_field(struct pci_dev *dev, int reg); ++ ++int pciback_config_quirks_init(struct pci_dev *dev); ++ ++void pciback_config_field_free(struct config_field *field); ++ ++int pciback_config_quirk_release(struct pci_dev *dev); ++ ++int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg); ++ ++#endif +diff --git a/drivers/xen/pciback/controller.c b/drivers/xen/pciback/controller.c +new file mode 100644 +index 0000000..7f04f11 +--- /dev/null ++++ b/drivers/xen/pciback/controller.c +@@ -0,0 +1,442 @@ ++/* ++ * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. ++ * Alex Williamson <alex.williamson@hp.com> ++ * ++ * PCI "Controller" Backend - virtualize PCI bus topology based on PCI ++ * controllers. Devices under the same PCI controller are exposed on the ++ * same virtual domain:bus. Within a bus, device slots are virtualized ++ * to compact the bus. ++ * ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ */ ++ ++#include <linux/acpi.h> ++#include <linux/list.h> ++#include <linux/pci.h> ++#include <linux/spinlock.h> ++#include "pciback.h" ++ ++#define PCI_MAX_BUSSES 255 ++#define PCI_MAX_SLOTS 32 ++ ++struct controller_dev_entry { ++ struct list_head list; ++ struct pci_dev *dev; ++ unsigned int devfn; ++}; ++ ++struct controller_list_entry { ++ struct list_head list; ++ struct pci_controller *controller; ++ unsigned int domain; ++ unsigned int bus; ++ unsigned int next_devfn; ++ struct list_head dev_list; ++}; ++ ++struct controller_dev_data { ++ struct list_head list; ++ unsigned int next_domain; ++ unsigned int next_bus; ++ spinlock_t lock; ++}; ++ ++struct walk_info { ++ struct pciback_device *pdev; ++ int resource_count; ++ int root_num; ++}; ++ ++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev, ++ unsigned int domain, unsigned int bus, ++ unsigned int devfn) ++{ ++ struct controller_dev_data *dev_data = pdev->pci_dev_data; ++ struct controller_dev_entry *dev_entry; ++ struct controller_list_entry *cntrl_entry; ++ struct pci_dev *dev = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&dev_data->lock, flags); ++ ++ list_for_each_entry(cntrl_entry, &dev_data->list, list) { ++ if (cntrl_entry->domain != domain || ++ cntrl_entry->bus != bus) ++ continue; ++ ++ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) { ++ if (devfn == dev_entry->devfn) { ++ dev = dev_entry->dev; ++ goto found; ++ } ++ } ++ } ++found: ++ spin_unlock_irqrestore(&dev_data->lock, flags); ++ ++ return dev; ++} ++ ++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev, ++ int devid, publish_pci_dev_cb publish_cb) ++{ ++ struct controller_dev_data *dev_data = pdev->pci_dev_data; ++ struct controller_dev_entry *dev_entry; ++ struct controller_list_entry *cntrl_entry; ++ struct pci_controller *dev_controller = PCI_CONTROLLER(dev); ++ unsigned long flags; ++ int ret = 0, found = 0; ++ ++ spin_lock_irqsave(&dev_data->lock, flags); ++ ++ /* Look to see if we already have a domain:bus for this controller */ ++ list_for_each_entry(cntrl_entry, &dev_data->list, list) { ++ if (cntrl_entry->controller == dev_controller) { ++ found = 1; ++ break; ++ } ++ } ++ ++ if (!found) { ++ cntrl_entry = kmalloc(sizeof(*cntrl_entry), GFP_ATOMIC); ++ if (!cntrl_entry) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ cntrl_entry->controller = dev_controller; ++ cntrl_entry->next_devfn = PCI_DEVFN(0, 0); ++ ++ cntrl_entry->domain = dev_data->next_domain; ++ cntrl_entry->bus = dev_data->next_bus++; ++ if (dev_data->next_bus > PCI_MAX_BUSSES) { ++ dev_data->next_domain++; ++ dev_data->next_bus = 0; ++ } ++ ++ INIT_LIST_HEAD(&cntrl_entry->dev_list); ++ ++ list_add_tail(&cntrl_entry->list, &dev_data->list); ++ } ++ ++ if (PCI_SLOT(cntrl_entry->next_devfn) > PCI_MAX_SLOTS) { ++ /* ++ * While it seems unlikely, this can actually happen if ++ * a controller has P2P bridges under it. 
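++		 * Every function behind one controller shares a single
++		 * virtual bus, so with enough devices next_devfn can run
++		 * past PCI_MAX_SLOTS.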
++ */ ++ xenbus_dev_fatal(pdev->xdev, -ENOSPC, "Virtual bus %04x:%02x " ++ "is full, no room to export %04x:%02x:%02x.%x", ++ cntrl_entry->domain, cntrl_entry->bus, ++ pci_domain_nr(dev->bus), dev->bus->number, ++ PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn)); ++ ret = -ENOSPC; ++ goto out; ++ } ++ ++ dev_entry = kmalloc(sizeof(*dev_entry), GFP_ATOMIC); ++ if (!dev_entry) { ++ if (list_empty(&cntrl_entry->dev_list)) { ++ list_del(&cntrl_entry->list); ++ kfree(cntrl_entry); ++ } ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ dev_entry->dev = dev; ++ dev_entry->devfn = cntrl_entry->next_devfn; ++ ++ list_add_tail(&dev_entry->list, &cntrl_entry->dev_list); ++ ++ cntrl_entry->next_devfn += PCI_DEVFN(1, 0); ++ ++out: ++ spin_unlock_irqrestore(&dev_data->lock, flags); ++ ++ /* TODO: Publish virtual domain:bus:slot.func here. */ ++ ++ return ret; ++} ++ ++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev) ++{ ++ struct controller_dev_data *dev_data = pdev->pci_dev_data; ++ struct controller_list_entry *cntrl_entry; ++ struct controller_dev_entry *dev_entry = NULL; ++ struct pci_dev *found_dev = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&dev_data->lock, flags); ++ ++ list_for_each_entry(cntrl_entry, &dev_data->list, list) { ++ if (cntrl_entry->controller != PCI_CONTROLLER(dev)) ++ continue; ++ ++ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) { ++ if (dev_entry->dev == dev) { ++ found_dev = dev_entry->dev; ++ break; ++ } ++ } ++ } ++ ++ if (!found_dev) { ++ spin_unlock_irqrestore(&dev_data->lock, flags); ++ return; ++ } ++ ++ list_del(&dev_entry->list); ++ kfree(dev_entry); ++ ++ if (list_empty(&cntrl_entry->dev_list)) { ++ list_del(&cntrl_entry->list); ++ kfree(cntrl_entry); ++ } ++ ++ spin_unlock_irqrestore(&dev_data->lock, flags); ++ pcistub_put_pci_dev(found_dev); ++} ++ ++int pciback_init_devices(struct pciback_device *pdev) ++{ ++ struct controller_dev_data *dev_data; ++ ++ dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL); ++ if (!dev_data) ++ return -ENOMEM; ++ ++ spin_lock_init(&dev_data->lock); ++ ++ INIT_LIST_HEAD(&dev_data->list); ++ ++ /* Starting domain:bus numbers */ ++ dev_data->next_domain = 0; ++ dev_data->next_bus = 0; ++ ++ pdev->pci_dev_data = dev_data; ++ ++ return 0; ++} ++ ++static acpi_status write_xenbus_resource(struct acpi_resource *res, void *data) ++{ ++ struct walk_info *info = data; ++ struct acpi_resource_address64 addr; ++ acpi_status status; ++ int i, len, err; ++ char str[32], tmp[3]; ++ unsigned char *ptr, *buf; ++ ++ status = acpi_resource_to_address64(res, &addr); ++ ++ /* Do we care about this range? Let's check. */ ++ if (!ACPI_SUCCESS(status) || ++ !(addr.resource_type == ACPI_MEMORY_RANGE || ++ addr.resource_type == ACPI_IO_RANGE) || ++ !addr.address_length || addr.producer_consumer != ACPI_PRODUCER) ++ return AE_OK; ++ ++ /* ++ * Furthermore, we really only care to tell the guest about ++ * address ranges that require address translation of some sort. 
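++	 * (i.e. windows where the CPU-visible address differs from the
++	 * bus address, as indicated by the ACPI resource's translation
++	 * fields)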
++	 */
++	if (!(addr.resource_type == ACPI_MEMORY_RANGE &&
++	      addr.info.mem.translation) &&
++	    !(addr.resource_type == ACPI_IO_RANGE &&
++	      addr.info.io.translation))
++		return AE_OK;
++
++	/* Store the resource in xenbus for the guest */
++	len = snprintf(str, sizeof(str), "root-%d-resource-%d",
++		       info->root_num, info->resource_count);
++	if (unlikely(len >= (sizeof(str) - 1)))
++		return AE_OK;
++
++	buf = kzalloc((sizeof(*res) * 2) + 1, GFP_KERNEL);
++	if (!buf)
++		return AE_OK;
++
++	/* Clean out resource_source */
++	res->data.address64.resource_source.index = 0xFF;
++	res->data.address64.resource_source.string_length = 0;
++	res->data.address64.resource_source.string_ptr = NULL;
++
++	ptr = (unsigned char *)res;
++
++	/* Turn the acpi_resource into an ASCII byte stream */
++	for (i = 0; i < sizeof(*res); i++) {
++		snprintf(tmp, sizeof(tmp), "%02x", ptr[i]);
++		strncat(buf, tmp, 2);
++	}
++
++	err = xenbus_printf(XBT_NIL, info->pdev->xdev->nodename,
++			    str, "%s", buf);
++
++	if (!err)
++		info->resource_count++;
++
++	kfree(buf);
++
++	return AE_OK;
++}
++
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++			      publish_pci_root_cb publish_root_cb)
++{
++	struct controller_dev_data *dev_data = pdev->pci_dev_data;
++	struct controller_list_entry *cntrl_entry;
++	int i, root_num, len, err = 0;
++	unsigned int domain, bus;
++	char str[64];
++	struct walk_info info;
++
++	spin_lock(&dev_data->lock);
++
++	list_for_each_entry(cntrl_entry, &dev_data->list, list) {
++		/* First publish all the domain:bus info */
++		err = publish_root_cb(pdev, cntrl_entry->domain,
++				      cntrl_entry->bus);
++		if (err)
++			goto out;
++
++		/*
++		 * Now figure out which root-%d this belongs to
++		 * so we can associate resources with it.
++		 */
++		err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++				   "root_num", "%d", &root_num);
++
++		if (err != 1)
++			goto out;
++
++		for (i = 0; i < root_num; i++) {
++			len = snprintf(str, sizeof(str), "root-%d", i);
++			if (unlikely(len >= (sizeof(str) - 1))) {
++				err = -ENOMEM;
++				goto out;
++			}
++
++			err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++					   str, "%x:%x", &domain, &bus);
++			if (err != 2)
++				goto out;
++
++			/* Is this the one we just published? */
++			if (domain == cntrl_entry->domain &&
++			    bus == cntrl_entry->bus)
++				break;
++		}
++
++		if (i == root_num)
++			goto out;
++
++		info.pdev = pdev;
++		info.resource_count = 0;
++		info.root_num = i;
++
++		/* Let ACPI do the heavy lifting on decoding resources */
++		acpi_walk_resources(cntrl_entry->controller->acpi_handle,
++				    METHOD_NAME__CRS, write_xenbus_resource,
++				    &info);
++
++		/* No resources. OK. On to the next one */
++		if (!info.resource_count)
++			continue;
++
++		/* Store the number of resources we wrote for this root-%d */
++		len = snprintf(str, sizeof(str), "root-%d-resources", i);
++		if (unlikely(len >= (sizeof(str) - 1))) {
++			err = -ENOMEM;
++			goto out;
++		}
++
++		err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
++				    "%d", info.resource_count);
++		if (err)
++			goto out;
++	}
++
++	/* Finally, write some magic to synchronize with the guest.
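++	 * The "magic" is just the length of one hex-encoded acpi_resource
++	 * string written above, so both ends can verify they agree on
++	 * sizeof(struct acpi_resource).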
*/ ++ len = snprintf(str, sizeof(str), "root-resource-magic"); ++ if (unlikely(len >= (sizeof(str) - 1))) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str, ++ "%lx", (sizeof(struct acpi_resource) * 2) + 1); ++ ++out: ++ spin_unlock(&dev_data->lock); ++ ++ return err; ++} ++ ++void pciback_release_devices(struct pciback_device *pdev) ++{ ++ struct controller_dev_data *dev_data = pdev->pci_dev_data; ++ struct controller_list_entry *cntrl_entry, *c; ++ struct controller_dev_entry *dev_entry, *d; ++ ++ list_for_each_entry_safe(cntrl_entry, c, &dev_data->list, list) { ++ list_for_each_entry_safe(dev_entry, d, ++ &cntrl_entry->dev_list, list) { ++ list_del(&dev_entry->list); ++ pcistub_put_pci_dev(dev_entry->dev); ++ kfree(dev_entry); ++ } ++ list_del(&cntrl_entry->list); ++ kfree(cntrl_entry); ++ } ++ ++ kfree(dev_data); ++ pdev->pci_dev_data = NULL; ++} ++ ++int pciback_get_pcifront_dev(struct pci_dev *pcidev, ++ struct pciback_device *pdev, ++ unsigned int *domain, unsigned int *bus, unsigned int *devfn) ++{ ++ struct controller_dev_data *dev_data = pdev->pci_dev_data; ++ struct controller_dev_entry *dev_entry; ++ struct controller_list_entry *cntrl_entry; ++ unsigned long flags; ++ int found = 0; ++ spin_lock_irqsave(&dev_data->lock, flags); ++ ++ list_for_each_entry(cntrl_entry, &dev_data->list, list) { ++ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) { ++ if ((dev_entry->dev->bus->number == ++ pcidev->bus->number) && ++ (dev_entry->dev->devfn == ++ pcidev->devfn) && ++ (pci_domain_nr(dev_entry->dev->bus) == ++ pci_domain_nr(pcidev->bus))) { ++ found = 1; ++ *domain = cntrl_entry->domain; ++ *bus = cntrl_entry->bus; ++ *devfn = dev_entry->devfn; ++ goto out; ++ } ++ } ++ } ++out: ++ spin_unlock_irqrestore(&dev_data->lock, flags); ++ return found; ++ ++} ++ +diff --git a/drivers/xen/pciback/passthrough.c b/drivers/xen/pciback/passthrough.c +new file mode 100644 +index 0000000..5386bebf +--- /dev/null ++++ b/drivers/xen/pciback/passthrough.c +@@ -0,0 +1,178 @@ ++/* ++ * PCI Backend - Provides restricted access to the real PCI bus topology ++ * to the frontend ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++ ++#include <linux/list.h> ++#include <linux/pci.h> ++#include <linux/spinlock.h> ++#include "pciback.h" ++ ++struct passthrough_dev_data { ++ /* Access to dev_list must be protected by lock */ ++ struct list_head dev_list; ++ spinlock_t lock; ++}; ++ ++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev, ++ unsigned int domain, unsigned int bus, ++ unsigned int devfn) ++{ ++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data; ++ struct pci_dev_entry *dev_entry; ++ struct pci_dev *dev = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&dev_data->lock, flags); ++ ++ list_for_each_entry(dev_entry, &dev_data->dev_list, list) { ++ if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus) ++ && bus == (unsigned int)dev_entry->dev->bus->number ++ && devfn == dev_entry->dev->devfn) { ++ dev = dev_entry->dev; ++ break; ++ } ++ } ++ ++ spin_unlock_irqrestore(&dev_data->lock, flags); ++ ++ return dev; ++} ++ ++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev, ++ int devid, publish_pci_dev_cb publish_cb) ++{ ++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data; ++ struct pci_dev_entry *dev_entry; ++ unsigned long flags; ++ unsigned int domain, bus, devfn; ++ int err; ++ ++ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL); ++ if (!dev_entry) ++ return -ENOMEM; 
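++
++	/* Passthrough keeps the real topology: the entry is published
++	 * below with the device's real domain, bus and devfn. */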
++ dev_entry->dev = dev; ++ ++ spin_lock_irqsave(&dev_data->lock, flags); ++ list_add_tail(&dev_entry->list, &dev_data->dev_list); ++ spin_unlock_irqrestore(&dev_data->lock, flags); ++ ++ /* Publish this device. */ ++ domain = (unsigned int)pci_domain_nr(dev->bus); ++ bus = (unsigned int)dev->bus->number; ++ devfn = dev->devfn; ++ err = publish_cb(pdev, domain, bus, devfn, devid); ++ ++ return err; ++} ++ ++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev) ++{ ++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data; ++ struct pci_dev_entry *dev_entry, *t; ++ struct pci_dev *found_dev = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&dev_data->lock, flags); ++ ++ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) { ++ if (dev_entry->dev == dev) { ++ list_del(&dev_entry->list); ++ found_dev = dev_entry->dev; ++ kfree(dev_entry); ++ } ++ } ++ ++ spin_unlock_irqrestore(&dev_data->lock, flags); ++ ++ if (found_dev) ++ pcistub_put_pci_dev(found_dev); ++} ++ ++int pciback_init_devices(struct pciback_device *pdev) ++{ ++ struct passthrough_dev_data *dev_data; ++ ++ dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL); ++ if (!dev_data) ++ return -ENOMEM; ++ ++ spin_lock_init(&dev_data->lock); ++ ++ INIT_LIST_HEAD(&dev_data->dev_list); ++ ++ pdev->pci_dev_data = dev_data; ++ ++ return 0; ++} ++ ++int pciback_publish_pci_roots(struct pciback_device *pdev, ++ publish_pci_root_cb publish_root_cb) ++{ ++ int err = 0; ++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data; ++ struct pci_dev_entry *dev_entry, *e; ++ struct pci_dev *dev; ++ int found; ++ unsigned int domain, bus; ++ ++ spin_lock(&dev_data->lock); ++ ++ list_for_each_entry(dev_entry, &dev_data->dev_list, list) { ++ /* Only publish this device as a root if none of its ++ * parent bridges are exported ++ */ ++ found = 0; ++ dev = dev_entry->dev->bus->self; ++ for (; !found && dev != NULL; dev = dev->bus->self) { ++ list_for_each_entry(e, &dev_data->dev_list, list) { ++ if (dev == e->dev) { ++ found = 1; ++ break; ++ } ++ } ++ } ++ ++ domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus); ++ bus = (unsigned int)dev_entry->dev->bus->number; ++ ++ if (!found) { ++ err = publish_root_cb(pdev, domain, bus); ++ if (err) ++ break; ++ } ++ } ++ ++ spin_unlock(&dev_data->lock); ++ ++ return err; ++} ++ ++void pciback_release_devices(struct pciback_device *pdev) ++{ ++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data; ++ struct pci_dev_entry *dev_entry, *t; ++ ++ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) { ++ list_del(&dev_entry->list); ++ pcistub_put_pci_dev(dev_entry->dev); ++ kfree(dev_entry); ++ } ++ ++ kfree(dev_data); ++ pdev->pci_dev_data = NULL; ++} ++ ++int pciback_get_pcifront_dev(struct pci_dev *pcidev, ++ struct pciback_device *pdev, ++ unsigned int *domain, unsigned int *bus, ++ unsigned int *devfn) ++ ++{ ++ *domain = pci_domain_nr(pcidev->bus); ++ *bus = pcidev->bus->number; ++ *devfn = pcidev->devfn; ++ return 1; ++} +diff --git a/drivers/xen/pciback/pci_stub.c b/drivers/xen/pciback/pci_stub.c +new file mode 100644 +index 0000000..88c7ca1 +--- /dev/null ++++ b/drivers/xen/pciback/pci_stub.c +@@ -0,0 +1,1370 @@ ++/* ++ * PCI Stub Driver - Grabs devices in backend to be exported later ++ * ++ * Ryan Wilson <hap9@epoch.ncsc.mil> ++ * Chris Bookholt <hap10@epoch.ncsc.mil> ++ */ ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/rwsem.h> ++#include <linux/list.h> ++#include <linux/spinlock.h> ++#include <linux/kref.h> 
++#include <linux/pci.h>
++#include <linux/wait.h>
++#include <linux/sched.h>
++#include <asm/atomic.h>
++#include <xen/events.h>
++#include <asm/xen/pci.h>
++#include <asm/xen/hypervisor.h>
++#include "pciback.h"
++#include "conf_space.h"
++#include "conf_space_quirks.h"
++
++#define DRV_NAME "pciback"
++
++static char *pci_devs_to_hide;
++wait_queue_head_t aer_wait_queue;
++/* A semaphore to synchronize AER handling with pciback remove/reconfigure
++ * ops; we want to avoid pciback devices being removed in the middle of
++ * AER ops.
++ */
++static DECLARE_RWSEM(pcistub_sem);
++module_param_named(hide, pci_devs_to_hide, charp, 0444);
++
++struct pcistub_device_id {
++	struct list_head slot_list;
++	int domain;
++	unsigned char bus;
++	unsigned int devfn;
++};
++static LIST_HEAD(pcistub_device_ids);
++static DEFINE_SPINLOCK(device_ids_lock);
++
++struct pcistub_device {
++	struct kref kref;
++	struct list_head dev_list;
++	spinlock_t lock;
++
++	struct pci_dev *dev;
++	struct pciback_device *pdev;/* non-NULL if struct pci_dev is in use */
++};
++
++/* Access to pcistub_devices & seized_devices lists and the initialize_devices
++ * flag must be locked with pcistub_devices_lock
++ */
++static DEFINE_SPINLOCK(pcistub_devices_lock);
++static LIST_HEAD(pcistub_devices);
++
++/* wait for device_initcall before initializing our devices
++ * (see pcistub_init_devices_late)
++ */
++static int initialize_devices;
++static LIST_HEAD(seized_devices);
++
++static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
++{
++	struct pcistub_device *psdev;
++
++	dev_dbg(&dev->dev, "pcistub_device_alloc\n");
++
++	psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC);
++	if (!psdev)
++		return NULL;
++
++	psdev->dev = pci_dev_get(dev);
++	if (!psdev->dev) {
++		kfree(psdev);
++		return NULL;
++	}
++
++	kref_init(&psdev->kref);
++	spin_lock_init(&psdev->lock);
++
++	return psdev;
++}
++
++/* Don't call this directly as it's called by pcistub_device_put */
++static void pcistub_device_release(struct kref *kref)
++{
++	struct pcistub_device *psdev;
++
++	psdev = container_of(kref, struct pcistub_device, kref);
++
++	dev_dbg(&psdev->dev->dev, "pcistub_device_release\n");
++
++	xen_unregister_device_domain_owner(psdev->dev);
++
++	/* Clean-up the device */
++	pciback_reset_device(psdev->dev);
++	pciback_config_free_dyn_fields(psdev->dev);
++	pciback_config_free_dev(psdev->dev);
++	kfree(pci_get_drvdata(psdev->dev));
++	pci_set_drvdata(psdev->dev, NULL);
++
++	pci_dev_put(psdev->dev);
++
++	kfree(psdev);
++}
++
++static inline void pcistub_device_get(struct pcistub_device *psdev)
++{
++	kref_get(&psdev->kref);
++}
++
++static inline void pcistub_device_put(struct pcistub_device *psdev)
++{
++	kref_put(&psdev->kref, pcistub_device_release);
++}
++
++static struct pcistub_device *pcistub_device_find(int domain, int bus,
++						  int slot, int func)
++{
++	struct pcistub_device *psdev = NULL;
++	unsigned long flags;
++
++	spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++	list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++		if (psdev->dev != NULL
++		    && domain == pci_domain_nr(psdev->dev->bus)
++		    && bus == psdev->dev->bus->number
++		    && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
++			pcistub_device_get(psdev);
++			goto out;
++		}
++	}
++
++	/* didn't find it */
++	psdev = NULL;
++
++out:
++	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++	return psdev;
++}
++
++static struct pci_dev *pcistub_device_get_pci_dev(struct pciback_device *pdev,
++						  struct pcistub_device *psdev)
++{
++	struct pci_dev *pci_dev = NULL;
++	unsigned long
flags; ++ ++ pcistub_device_get(psdev); ++ ++ spin_lock_irqsave(&psdev->lock, flags); ++ if (!psdev->pdev) { ++ psdev->pdev = pdev; ++ pci_dev = psdev->dev; ++ } ++ spin_unlock_irqrestore(&psdev->lock, flags); ++ ++ if (!pci_dev) ++ pcistub_device_put(psdev); ++ ++ return pci_dev; ++} ++ ++struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev, ++ int domain, int bus, ++ int slot, int func) ++{ ++ struct pcistub_device *psdev; ++ struct pci_dev *found_dev = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ ++ list_for_each_entry(psdev, &pcistub_devices, dev_list) { ++ if (psdev->dev != NULL ++ && domain == pci_domain_nr(psdev->dev->bus) ++ && bus == psdev->dev->bus->number ++ && PCI_DEVFN(slot, func) == psdev->dev->devfn) { ++ found_dev = pcistub_device_get_pci_dev(pdev, psdev); ++ break; ++ } ++ } ++ ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ return found_dev; ++} ++ ++struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev, ++ struct pci_dev *dev) ++{ ++ struct pcistub_device *psdev; ++ struct pci_dev *found_dev = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ ++ list_for_each_entry(psdev, &pcistub_devices, dev_list) { ++ if (psdev->dev == dev) { ++ found_dev = pcistub_device_get_pci_dev(pdev, psdev); ++ break; ++ } ++ } ++ ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ return found_dev; ++} ++ ++void pcistub_put_pci_dev(struct pci_dev *dev) ++{ ++ struct pcistub_device *psdev, *found_psdev = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ ++ list_for_each_entry(psdev, &pcistub_devices, dev_list) { ++ if (psdev->dev == dev) { ++ found_psdev = psdev; ++ break; ++ } ++ } ++ ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ ++ /*hold this lock for avoiding breaking link between ++ * pcistub and pciback when AER is in processing ++ */ ++ down_write(&pcistub_sem); ++ /* Cleanup our device ++ * (so it's ready for the next domain) ++ */ ++ pciback_reset_device(found_psdev->dev); ++ pciback_config_free_dyn_fields(found_psdev->dev); ++ pciback_config_reset_dev(found_psdev->dev); ++ ++ spin_lock_irqsave(&found_psdev->lock, flags); ++ found_psdev->pdev = NULL; ++ spin_unlock_irqrestore(&found_psdev->lock, flags); ++ ++ pcistub_device_put(found_psdev); ++ up_write(&pcistub_sem); ++} ++ ++static int __devinit pcistub_match_one(struct pci_dev *dev, ++ struct pcistub_device_id *pdev_id) ++{ ++ /* Match the specified device by domain, bus, slot, func and also if ++ * any of the device's parent bridges match. ++ */ ++ for (; dev != NULL; dev = dev->bus->self) { ++ if (pci_domain_nr(dev->bus) == pdev_id->domain ++ && dev->bus->number == pdev_id->bus ++ && dev->devfn == pdev_id->devfn) ++ return 1; ++ ++ /* Sometimes topmost bridge links to itself. 
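++		 * Stop there, or this loop would never terminate.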
*/ ++ if (dev == dev->bus->self) ++ break; ++ } ++ ++ return 0; ++} ++ ++static int __devinit pcistub_match(struct pci_dev *dev) ++{ ++ struct pcistub_device_id *pdev_id; ++ unsigned long flags; ++ int found = 0; ++ ++ spin_lock_irqsave(&device_ids_lock, flags); ++ list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) { ++ if (pcistub_match_one(dev, pdev_id)) { ++ found = 1; ++ break; ++ } ++ } ++ spin_unlock_irqrestore(&device_ids_lock, flags); ++ ++ return found; ++} ++ ++static int __devinit pcistub_init_device(struct pci_dev *dev) ++{ ++ struct pciback_dev_data *dev_data; ++ int err = 0; ++ ++ dev_dbg(&dev->dev, "initializing...\n"); ++ ++ /* The PCI backend is not intended to be a module (or to work with ++ * removable PCI devices (yet). If it were, pciback_config_free() ++ * would need to be called somewhere to free the memory allocated ++ * here and then to call kfree(pci_get_drvdata(psdev->dev)). ++ */ ++ dev_data = kzalloc(sizeof(*dev_data) + strlen(DRV_NAME "[]") ++ + strlen(pci_name(dev)) + 1, GFP_ATOMIC); ++ if (!dev_data) { ++ err = -ENOMEM; ++ goto out; ++ } ++ pci_set_drvdata(dev, dev_data); ++ ++ /* ++ * Setup name for fake IRQ handler. It will only be enabled ++ * once the device is turned on by the guest. ++ */ ++ sprintf(dev_data->irq_name, DRV_NAME "[%s]", pci_name(dev)); ++ ++ dev_dbg(&dev->dev, "initializing config\n"); ++ ++ init_waitqueue_head(&aer_wait_queue); ++ err = pciback_config_init_dev(dev); ++ if (err) ++ goto out; ++ ++ /* HACK: Force device (& ACPI) to determine what IRQ it's on - we ++ * must do this here because pcibios_enable_device may specify ++ * the pci device's true irq (and possibly its other resources) ++ * if they differ from what's in the configuration space. ++ * This makes the assumption that the device's resources won't ++ * change after this point (otherwise this code may break!) ++ */ ++ dev_dbg(&dev->dev, "enabling device\n"); ++ err = pci_enable_device(dev); ++ if (err) ++ goto config_release; ++ ++ /* Now disable the device (this also ensures some private device ++ * data is setup before we export) ++ */ ++ dev_dbg(&dev->dev, "reset device\n"); ++ pciback_reset_device(dev); ++ ++ return 0; ++ ++config_release: ++ pciback_config_free_dev(dev); ++ ++out: ++ pci_set_drvdata(dev, NULL); ++ kfree(dev_data); ++ return err; ++} ++ ++/* ++ * Because some initialization still happens on ++ * devices during fs_initcall, we need to defer ++ * full initialization of our devices until ++ * device_initcall. 
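The comment above relies on kernel initcall ordering: fs_initcall() routines run before device_initcall() routines (device_initcall is what plain module_init() expands to for built-in code). A compile-time sketch of that ordering, assuming the code is built in rather than modular:

#include <linux/init.h>

static int __init demo_grab_early(void)
{
	/* runs in the fs_initcall pass: seize devices, defer real setup */
	return 0;
}
fs_initcall(demo_grab_early);

static int __init demo_finish_late(void)
{
	/* runs in the later device_initcall pass: complete deferred init */
	return 0;
}
device_initcall(demo_finish_late);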
++ */ ++static int __init pcistub_init_devices_late(void) ++{ ++ struct pcistub_device *psdev; ++ unsigned long flags; ++ int err = 0; ++ ++ pr_debug("pciback: pcistub_init_devices_late\n"); ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ ++ while (!list_empty(&seized_devices)) { ++ psdev = container_of(seized_devices.next, ++ struct pcistub_device, dev_list); ++ list_del(&psdev->dev_list); ++ ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ ++ err = pcistub_init_device(psdev->dev); ++ if (err) { ++ dev_err(&psdev->dev->dev, ++ "error %d initializing device\n", err); ++ kfree(psdev); ++ psdev = NULL; ++ } ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ ++ if (psdev) ++ list_add_tail(&psdev->dev_list, &pcistub_devices); ++ } ++ ++ initialize_devices = 1; ++ ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ ++ return 0; ++} ++ ++static int __devinit pcistub_seize(struct pci_dev *dev) ++{ ++ struct pcistub_device *psdev; ++ unsigned long flags; ++ int err = 0; ++ ++ psdev = pcistub_device_alloc(dev); ++ if (!psdev) ++ return -ENOMEM; ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ ++ if (initialize_devices) { ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ ++ /* don't want irqs disabled when calling pcistub_init_device */ ++ err = pcistub_init_device(psdev->dev); ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ ++ if (!err) ++ list_add(&psdev->dev_list, &pcistub_devices); ++ } else { ++ dev_dbg(&dev->dev, "deferring initialization\n"); ++ list_add(&psdev->dev_list, &seized_devices); ++ } ++ ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ ++ if (err) ++ pcistub_device_put(psdev); ++ ++ return err; ++} ++ ++static int __devinit pcistub_probe(struct pci_dev *dev, ++ const struct pci_device_id *id) ++{ ++ int err = 0; ++ ++ dev_dbg(&dev->dev, "probing...\n"); ++ ++ if (pcistub_match(dev)) { ++ ++ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL ++ && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) { ++ dev_err(&dev->dev, "can't export pci devices that " ++ "don't have a normal (0) or bridge (1) " ++ "header type!\n"); ++ err = -ENODEV; ++ goto out; ++ } ++ ++ dev_info(&dev->dev, "seizing device\n"); ++ err = pcistub_seize(dev); ++ } else ++ /* Didn't find the device */ ++ err = -ENODEV; ++ ++out: ++ return err; ++} ++ ++static void pcistub_remove(struct pci_dev *dev) ++{ ++ struct pcistub_device *psdev, *found_psdev = NULL; ++ unsigned long flags; ++ ++ dev_dbg(&dev->dev, "removing\n"); ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ ++ pciback_config_quirk_release(dev); ++ ++ list_for_each_entry(psdev, &pcistub_devices, dev_list) { ++ if (psdev->dev == dev) { ++ found_psdev = psdev; ++ break; ++ } ++ } ++ ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ ++ if (found_psdev) { ++ dev_dbg(&dev->dev, "found device to remove - in use? %p\n", ++ found_psdev->pdev); ++ ++ if (found_psdev->pdev) { ++ printk(KERN_WARNING "pciback: ****** removing device " ++ "%s while still in-use! 
******\n", ++ pci_name(found_psdev->dev)); ++ printk(KERN_WARNING "pciback: ****** driver domain may " ++ "still access this device's i/o resources!\n"); ++ printk(KERN_WARNING "pciback: ****** shutdown driver " ++ "domain before binding device\n"); ++ printk(KERN_WARNING "pciback: ****** to other drivers " ++ "or domains\n"); ++ ++ pciback_release_pci_dev(found_psdev->pdev, ++ found_psdev->dev); ++ } ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ list_del(&found_psdev->dev_list); ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ ++ /* the final put for releasing from the list */ ++ pcistub_device_put(found_psdev); ++ } ++} ++ ++static const struct pci_device_id pcistub_ids[] = { ++ { ++ .vendor = PCI_ANY_ID, ++ .device = PCI_ANY_ID, ++ .subvendor = PCI_ANY_ID, ++ .subdevice = PCI_ANY_ID, ++ }, ++ {0,}, ++}; ++ ++#define PCI_NODENAME_MAX 40 ++static void kill_domain_by_device(struct pcistub_device *psdev) ++{ ++ struct xenbus_transaction xbt; ++ int err; ++ char nodename[PCI_NODENAME_MAX]; ++ ++ if (!psdev) ++ dev_err(&psdev->dev->dev, ++ "device is NULL when do AER recovery/kill_domain\n"); ++ snprintf(nodename, PCI_NODENAME_MAX, "/local/domain/0/backend/pci/%d/0", ++ psdev->pdev->xdev->otherend_id); ++ nodename[strlen(nodename)] = '\0'; ++ ++again: ++ err = xenbus_transaction_start(&xbt); ++ if (err) { ++ dev_err(&psdev->dev->dev, ++ "error %d when start xenbus transaction\n", err); ++ return; ++ } ++ /*PV AER handlers will set this flag*/ ++ xenbus_printf(xbt, nodename, "aerState" , "aerfail"); ++ err = xenbus_transaction_end(xbt, 0); ++ if (err) { ++ if (err == -EAGAIN) ++ goto again; ++ dev_err(&psdev->dev->dev, ++ "error %d when end xenbus transaction\n", err); ++ return; ++ } ++} ++ ++/* For each aer recovery step error_detected, mmio_enabled, etc, front_end and ++ * backend need to have cooperation. In pciback, those steps will do similar ++ * jobs: send service request and waiting for front_end response. ++*/ ++static pci_ers_result_t common_process(struct pcistub_device *psdev, ++ pci_channel_state_t state, int aer_cmd, pci_ers_result_t result) ++{ ++ pci_ers_result_t res = result; ++ struct xen_pcie_aer_op *aer_op; ++ int ret; ++ ++ /*with PV AER drivers*/ ++ aer_op = &(psdev->pdev->sh_info->aer_op); ++ aer_op->cmd = aer_cmd ; ++ /*useful for error_detected callback*/ ++ aer_op->err = state; ++ /*pcifront_end BDF*/ ++ ret = pciback_get_pcifront_dev(psdev->dev, psdev->pdev, ++ &aer_op->domain, &aer_op->bus, &aer_op->devfn); ++ if (!ret) { ++ dev_err(&psdev->dev->dev, ++ "pciback: failed to get pcifront device\n"); ++ return PCI_ERS_RESULT_NONE; ++ } ++ wmb(); ++ ++ dev_dbg(&psdev->dev->dev, ++ "pciback: aer_op %x dom %x bus %x devfn %x\n", ++ aer_cmd, aer_op->domain, aer_op->bus, aer_op->devfn); ++ /*local flag to mark there's aer request, pciback callback will use this ++ * flag to judge whether we need to check pci-front give aer service ++ * ack signal ++ */ ++ set_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags); ++ ++ /*It is possible that a pcifront conf_read_write ops request invokes ++ * the callback which cause the spurious execution of wake_up. 
++ * Yet it is harmless and better than a spinlock here ++ */ ++ set_bit(_XEN_PCIB_active, ++ (unsigned long *)&psdev->pdev->sh_info->flags); ++ wmb(); ++ notify_remote_via_irq(psdev->pdev->evtchn_irq); ++ ++ ret = wait_event_timeout(aer_wait_queue, !(test_bit(_XEN_PCIB_active, ++ (unsigned long *)&psdev->pdev->sh_info->flags)), 300*HZ); ++ ++ if (!ret) { ++ if (test_bit(_XEN_PCIB_active, ++ (unsigned long *)&psdev->pdev->sh_info->flags)) { ++ dev_err(&psdev->dev->dev, ++ "pcifront aer process not responding!\n"); ++ clear_bit(_XEN_PCIB_active, ++ (unsigned long *)&psdev->pdev->sh_info->flags); ++ aer_op->err = PCI_ERS_RESULT_NONE; ++ return res; ++ } ++ } ++ clear_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags); ++ ++ if (test_bit(_XEN_PCIF_active, ++ (unsigned long *)&psdev->pdev->sh_info->flags)) { ++ dev_dbg(&psdev->dev->dev, ++ "schedule pci_conf service in pciback \n"); ++ test_and_schedule_op(psdev->pdev); ++ } ++ ++ res = (pci_ers_result_t)aer_op->err; ++ return res; ++} ++ ++/* ++* pciback_slot_reset: it will send the slot_reset request to pcifront in case ++* of the device driver could provide this service, and then wait for pcifront ++* ack. ++* @dev: pointer to PCI devices ++* return value is used by aer_core do_recovery policy ++*/ ++static pci_ers_result_t pciback_slot_reset(struct pci_dev *dev) ++{ ++ struct pcistub_device *psdev; ++ pci_ers_result_t result; ++ ++ result = PCI_ERS_RESULT_RECOVERED; ++ dev_dbg(&dev->dev, "pciback_slot_reset(bus:%x,devfn:%x)\n", ++ dev->bus->number, dev->devfn); ++ ++ down_write(&pcistub_sem); ++ psdev = pcistub_device_find(pci_domain_nr(dev->bus), ++ dev->bus->number, ++ PCI_SLOT(dev->devfn), ++ PCI_FUNC(dev->devfn)); ++ ++ if (!psdev || !psdev->pdev) { ++ dev_err(&dev->dev, ++ "pciback device is not found/assigned\n"); ++ goto end; ++ } ++ ++ if (!psdev->pdev->sh_info) { ++ dev_err(&dev->dev, "pciback device is not connected or owned" ++ " by HVM, kill it\n"); ++ kill_domain_by_device(psdev); ++ goto release; ++ } ++ ++ if (!test_bit(_XEN_PCIB_AERHANDLER, ++ (unsigned long *)&psdev->pdev->sh_info->flags)) { ++ dev_err(&dev->dev, ++ "guest with no AER driver should have been killed\n"); ++ goto release; ++ } ++ result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result); ++ ++ if (result == PCI_ERS_RESULT_NONE || ++ result == PCI_ERS_RESULT_DISCONNECT) { ++ dev_dbg(&dev->dev, ++ "No AER slot_reset service or disconnected!\n"); ++ kill_domain_by_device(psdev); ++ } ++release: ++ pcistub_device_put(psdev); ++end: ++ up_write(&pcistub_sem); ++ return result; ++ ++} ++ ++ ++/*pciback_mmio_enabled: it will send the mmio_enabled request to pcifront ++* in case of the device driver could provide this service, and then wait ++* for pcifront ack ++* @dev: pointer to PCI devices ++* return value is used by aer_core do_recovery policy ++*/ ++ ++static pci_ers_result_t pciback_mmio_enabled(struct pci_dev *dev) ++{ ++ struct pcistub_device *psdev; ++ pci_ers_result_t result; ++ ++ result = PCI_ERS_RESULT_RECOVERED; ++ dev_dbg(&dev->dev, "pciback_mmio_enabled(bus:%x,devfn:%x)\n", ++ dev->bus->number, dev->devfn); ++ ++ down_write(&pcistub_sem); ++ psdev = pcistub_device_find(pci_domain_nr(dev->bus), ++ dev->bus->number, ++ PCI_SLOT(dev->devfn), ++ PCI_FUNC(dev->devfn)); ++ ++ if (!psdev || !psdev->pdev) { ++ dev_err(&dev->dev, ++ "pciback device is not found/assigned\n"); ++ goto end; ++ } ++ ++ if (!psdev->pdev->sh_info) { ++ dev_err(&dev->dev, "pciback device is not connected or owned" ++ " by HVM, kill it\n"); ++ 
kill_domain_by_device(psdev);
++		goto release;
++	}
++
++	if (!test_bit(_XEN_PCIB_AERHANDLER,
++		(unsigned long *)&psdev->pdev->sh_info->flags)) {
++		dev_err(&dev->dev,
++			"guest with no AER driver should have been killed\n");
++		goto release;
++	}
++	result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result);
++
++	if (result == PCI_ERS_RESULT_NONE ||
++		result == PCI_ERS_RESULT_DISCONNECT) {
++		dev_dbg(&dev->dev,
++			"No AER mmio_enabled service or disconnected!\n");
++		kill_domain_by_device(psdev);
++	}
++release:
++	pcistub_device_put(psdev);
++end:
++	up_write(&pcistub_sem);
++	return result;
++}
++
++/*pciback_error_detected: it will send the error_detected request to pcifront
++* in case the device driver can provide this service, and then wait
++* for pcifront ack.
++* @dev: pointer to PCI devices
++* @error: the current PCI connection state
++* return value is used by aer_core do_recovery policy
++*/
++
++static pci_ers_result_t pciback_error_detected(struct pci_dev *dev,
++	pci_channel_state_t error)
++{
++	struct pcistub_device *psdev;
++	pci_ers_result_t result;
++
++	result = PCI_ERS_RESULT_CAN_RECOVER;
++	dev_dbg(&dev->dev, "pciback_error_detected(bus:%x,devfn:%x)\n",
++		dev->bus->number, dev->devfn);
++
++	down_write(&pcistub_sem);
++	psdev = pcistub_device_find(pci_domain_nr(dev->bus),
++				dev->bus->number,
++				PCI_SLOT(dev->devfn),
++				PCI_FUNC(dev->devfn));
++
++	if (!psdev || !psdev->pdev) {
++		dev_err(&dev->dev,
++			"pciback device is not found/assigned\n");
++		goto end;
++	}
++
++	if (!psdev->pdev->sh_info) {
++		dev_err(&dev->dev, "pciback device is not connected or owned"
++			" by HVM, kill it\n");
++		kill_domain_by_device(psdev);
++		goto release;
++	}
++
++	/*Guest owns the device yet no aer handler registered, kill guest*/
++	if (!test_bit(_XEN_PCIB_AERHANDLER,
++		(unsigned long *)&psdev->pdev->sh_info->flags)) {
++		dev_dbg(&dev->dev, "guest may have no aer driver, kill it\n");
++		kill_domain_by_device(psdev);
++		goto release;
++	}
++	result = common_process(psdev, error, XEN_PCI_OP_aer_detected, result);
++
++	if (result == PCI_ERS_RESULT_NONE ||
++		result == PCI_ERS_RESULT_DISCONNECT) {
++		dev_dbg(&dev->dev,
++			"No AER error_detected service or disconnected!\n");
++		kill_domain_by_device(psdev);
++	}
++release:
++	pcistub_device_put(psdev);
++end:
++	up_write(&pcistub_sem);
++	return result;
++}
++
++/*pciback_error_resume: it will send the error_resume request to pcifront
++* in case the device driver can provide this service, and then wait
++* for pcifront ack.
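Each of the AER callbacks in this file repeats the same guard sequence before delegating to common_process(). Condensed into one illustrative helper (a paraphrase of the code above, not part of the patch):

static pci_ers_result_t demo_aer_skeleton(struct pci_dev *dev,
					  int aer_cmd, pci_ers_result_t ok)
{
	struct pcistub_device *psdev;
	pci_ers_result_t result = ok;

	down_write(&pcistub_sem);
	psdev = pcistub_device_find(pci_domain_nr(dev->bus),
				    dev->bus->number,
				    PCI_SLOT(dev->devfn),
				    PCI_FUNC(dev->devfn));
	if (!psdev || !psdev->pdev)
		goto end;				/* not one of ours */
	if (!psdev->pdev->sh_info ||
	    !test_bit(_XEN_PCIB_AERHANDLER,
		      (unsigned long *)&psdev->pdev->sh_info->flags)) {
		kill_domain_by_device(psdev);		/* guest can't help */
		goto release;
	}
	result = common_process(psdev, 1, aer_cmd, ok);
	if (result == PCI_ERS_RESULT_NONE ||
	    result == PCI_ERS_RESULT_DISCONNECT)
		kill_domain_by_device(psdev);
release:
	pcistub_device_put(psdev);
end:
	up_write(&pcistub_sem);
	return result;
}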
++* @dev: pointer to PCI devices ++*/ ++ ++static void pciback_error_resume(struct pci_dev *dev) ++{ ++ struct pcistub_device *psdev; ++ ++ dev_dbg(&dev->dev, "pciback_error_resume(bus:%x,devfn:%x)\n", ++ dev->bus->number, dev->devfn); ++ ++ down_write(&pcistub_sem); ++ psdev = pcistub_device_find(pci_domain_nr(dev->bus), ++ dev->bus->number, ++ PCI_SLOT(dev->devfn), ++ PCI_FUNC(dev->devfn)); ++ ++ if (!psdev || !psdev->pdev) { ++ dev_err(&dev->dev, ++ "pciback device is not found/assigned\n"); ++ goto end; ++ } ++ ++ if (!psdev->pdev->sh_info) { ++ dev_err(&dev->dev, "pciback device is not connected or owned" ++ " by HVM, kill it\n"); ++ kill_domain_by_device(psdev); ++ goto release; ++ } ++ ++ if (!test_bit(_XEN_PCIB_AERHANDLER, ++ (unsigned long *)&psdev->pdev->sh_info->flags)) { ++ dev_err(&dev->dev, ++ "guest with no AER driver should have been killed\n"); ++ kill_domain_by_device(psdev); ++ goto release; ++ } ++ common_process(psdev, 1, XEN_PCI_OP_aer_resume, ++ PCI_ERS_RESULT_RECOVERED); ++release: ++ pcistub_device_put(psdev); ++end: ++ up_write(&pcistub_sem); ++ return; ++} ++ ++/*add pciback AER handling*/ ++static struct pci_error_handlers pciback_error_handler = { ++ .error_detected = pciback_error_detected, ++ .mmio_enabled = pciback_mmio_enabled, ++ .slot_reset = pciback_slot_reset, ++ .resume = pciback_error_resume, ++}; ++ ++/* ++ * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't ++ * for a normal device. I don't want it to be loaded automatically. ++ */ ++ ++static struct pci_driver pciback_pci_driver = { ++ .name = DRV_NAME, ++ .id_table = pcistub_ids, ++ .probe = pcistub_probe, ++ .remove = pcistub_remove, ++ .err_handler = &pciback_error_handler, ++}; ++ ++static inline int str_to_slot(const char *buf, int *domain, int *bus, ++ int *slot, int *func) ++{ ++ int err; ++ ++ err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func); ++ if (err == 4) ++ return 0; ++ else if (err < 0) ++ return -EINVAL; ++ ++ /* try again without domain */ ++ *domain = 0; ++ err = sscanf(buf, " %x:%x.%x", bus, slot, func); ++ if (err == 3) ++ return 0; ++ ++ return -EINVAL; ++} ++ ++static inline int str_to_quirk(const char *buf, int *domain, int *bus, int ++ *slot, int *func, int *reg, int *size, int *mask) ++{ ++ int err; ++ ++ err = ++ sscanf(buf, " %04x:%02x:%02x.%1x-%08x:%1x:%08x", domain, bus, slot, ++ func, reg, size, mask); ++ if (err == 7) ++ return 0; ++ return -EINVAL; ++} ++ ++static int pcistub_device_id_add(int domain, int bus, int slot, int func) ++{ ++ struct pcistub_device_id *pci_dev_id; ++ unsigned long flags; ++ ++ pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL); ++ if (!pci_dev_id) ++ return -ENOMEM; ++ ++ pci_dev_id->domain = domain; ++ pci_dev_id->bus = bus; ++ pci_dev_id->devfn = PCI_DEVFN(slot, func); ++ ++ pr_debug("pciback: wants to seize %04x:%02x:%02x.%01x\n", ++ domain, bus, slot, func); ++ ++ spin_lock_irqsave(&device_ids_lock, flags); ++ list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids); ++ spin_unlock_irqrestore(&device_ids_lock, flags); ++ ++ return 0; ++} ++ ++static int pcistub_device_id_remove(int domain, int bus, int slot, int func) ++{ ++ struct pcistub_device_id *pci_dev_id, *t; ++ int devfn = PCI_DEVFN(slot, func); ++ int err = -ENOENT; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&device_ids_lock, flags); ++ list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids, ++ slot_list) { ++ if (pci_dev_id->domain == domain ++ && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) { ++ /* Don't break; here because 
it's possible the same ++ * slot could be in the list more than once ++ */ ++ list_del(&pci_dev_id->slot_list); ++ kfree(pci_dev_id); ++ ++ err = 0; ++ ++ pr_debug("pciback: removed %04x:%02x:%02x.%01x from " ++ "seize list\n", domain, bus, slot, func); ++ } ++ } ++ spin_unlock_irqrestore(&device_ids_lock, flags); ++ ++ return err; ++} ++ ++static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg, ++ int size, int mask) ++{ ++ int err = 0; ++ struct pcistub_device *psdev; ++ struct pci_dev *dev; ++ struct config_field *field; ++ ++ psdev = pcistub_device_find(domain, bus, slot, func); ++ if (!psdev || !psdev->dev) { ++ err = -ENODEV; ++ goto out; ++ } ++ dev = psdev->dev; ++ ++ field = kzalloc(sizeof(*field), GFP_ATOMIC); ++ if (!field) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ field->offset = reg; ++ field->size = size; ++ field->mask = mask; ++ field->init = NULL; ++ field->reset = NULL; ++ field->release = NULL; ++ field->clean = pciback_config_field_free; ++ ++ err = pciback_config_quirks_add_field(dev, field); ++ if (err) ++ kfree(field); ++out: ++ return err; ++} ++ ++static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf, ++ size_t count) ++{ ++ int domain, bus, slot, func; ++ int err; ++ ++ err = str_to_slot(buf, &domain, &bus, &slot, &func); ++ if (err) ++ goto out; ++ ++ err = pcistub_device_id_add(domain, bus, slot, func); ++ ++out: ++ if (!err) ++ err = count; ++ return err; ++} ++ ++DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add); ++ ++static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf, ++ size_t count) ++{ ++ int domain, bus, slot, func; ++ int err; ++ ++ err = str_to_slot(buf, &domain, &bus, &slot, &func); ++ if (err) ++ goto out; ++ ++ err = pcistub_device_id_remove(domain, bus, slot, func); ++ ++out: ++ if (!err) ++ err = count; ++ return err; ++} ++ ++DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove); ++ ++static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf) ++{ ++ struct pcistub_device_id *pci_dev_id; ++ size_t count = 0; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&device_ids_lock, flags); ++ list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) { ++ if (count >= PAGE_SIZE) ++ break; ++ ++ count += scnprintf(buf + count, PAGE_SIZE - count, ++ "%04x:%02x:%02x.%01x\n", ++ pci_dev_id->domain, pci_dev_id->bus, ++ PCI_SLOT(pci_dev_id->devfn), ++ PCI_FUNC(pci_dev_id->devfn)); ++ } ++ spin_unlock_irqrestore(&device_ids_lock, flags); ++ ++ return count; ++} ++ ++DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL); ++ ++static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf) ++{ ++ struct pcistub_device *psdev; ++ struct pciback_dev_data *dev_data; ++ size_t count = 0; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ list_for_each_entry(psdev, &pcistub_devices, dev_list) { ++ if (count >= PAGE_SIZE) ++ break; ++ if (!psdev->dev) ++ continue; ++ dev_data = pci_get_drvdata(psdev->dev); ++ if (!dev_data) ++ continue; ++ count += ++ scnprintf(buf + count, PAGE_SIZE - count, "%s:%s:%sing:%ld\n", ++ pci_name(psdev->dev), ++ dev_data->isr_on ? "on" : "off", ++ dev_data->ack_intr ? 
"ack" : "not ack", ++ dev_data->handled); ++ } ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ return count; ++} ++ ++DRIVER_ATTR(irq_handlers, S_IRUSR, pcistub_irq_handler_show, NULL); ++ ++static ssize_t pcistub_irq_handler_switch(struct device_driver *drv, ++ const char *buf, ++ size_t count) ++{ ++ struct pcistub_device *psdev; ++ struct pciback_dev_data *dev_data; ++ int domain, bus, slot, func; ++ int err = -ENOENT; ++ ++ err = str_to_slot(buf, &domain, &bus, &slot, &func); ++ if (err) ++ goto out; ++ ++ psdev = pcistub_device_find(domain, bus, slot, func); ++ ++ if (!psdev) ++ goto out; ++ ++ dev_data = pci_get_drvdata(psdev->dev); ++ if (!dev_data) ++ goto out; ++ ++ dev_dbg(&psdev->dev->dev, "%s fake irq handler: %d->%d\n", ++ dev_data->irq_name, dev_data->isr_on, ++ !dev_data->isr_on); ++ ++ dev_data->isr_on = !(dev_data->isr_on); ++ if (dev_data->isr_on) ++ dev_data->ack_intr = 1; ++out: ++ if (!err) ++ err = count; ++ return err; ++} ++DRIVER_ATTR(irq_handler_state, S_IWUSR, NULL, pcistub_irq_handler_switch); ++ ++static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf, ++ size_t count) ++{ ++ int domain, bus, slot, func, reg, size, mask; ++ int err; ++ ++ err = str_to_quirk(buf, &domain, &bus, &slot, &func, ®, &size, ++ &mask); ++ if (err) ++ goto out; ++ ++ err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask); ++ ++out: ++ if (!err) ++ err = count; ++ return err; ++} ++ ++static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf) ++{ ++ int count = 0; ++ unsigned long flags; ++ struct pciback_config_quirk *quirk; ++ struct pciback_dev_data *dev_data; ++ const struct config_field *field; ++ const struct config_field_entry *cfg_entry; ++ ++ spin_lock_irqsave(&device_ids_lock, flags); ++ list_for_each_entry(quirk, &pciback_quirks, quirks_list) { ++ if (count >= PAGE_SIZE) ++ goto out; ++ ++ count += scnprintf(buf + count, PAGE_SIZE - count, ++ "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n", ++ quirk->pdev->bus->number, ++ PCI_SLOT(quirk->pdev->devfn), ++ PCI_FUNC(quirk->pdev->devfn), ++ quirk->devid.vendor, quirk->devid.device, ++ quirk->devid.subvendor, ++ quirk->devid.subdevice); ++ ++ dev_data = pci_get_drvdata(quirk->pdev); ++ ++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { ++ field = cfg_entry->field; ++ if (count >= PAGE_SIZE) ++ goto out; ++ ++ count += scnprintf(buf + count, PAGE_SIZE - count, ++ "\t\t%08x:%01x:%08x\n", ++ cfg_entry->base_offset + ++ field->offset, field->size, ++ field->mask); ++ } ++ } ++ ++out: ++ spin_unlock_irqrestore(&device_ids_lock, flags); ++ ++ return count; ++} ++ ++DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, pcistub_quirk_add); ++ ++static ssize_t permissive_add(struct device_driver *drv, const char *buf, ++ size_t count) ++{ ++ int domain, bus, slot, func; ++ int err; ++ struct pcistub_device *psdev; ++ struct pciback_dev_data *dev_data; ++ err = str_to_slot(buf, &domain, &bus, &slot, &func); ++ if (err) ++ goto out; ++ psdev = pcistub_device_find(domain, bus, slot, func); ++ if (!psdev) { ++ err = -ENODEV; ++ goto out; ++ } ++ if (!psdev->dev) { ++ err = -ENODEV; ++ goto release; ++ } ++ dev_data = pci_get_drvdata(psdev->dev); ++ /* the driver data for a device should never be null at this point */ ++ if (!dev_data) { ++ err = -ENXIO; ++ goto release; ++ } ++ if (!dev_data->permissive) { ++ dev_data->permissive = 1; ++ /* Let user know that what they're doing could be unsafe */ ++ dev_warn(&psdev->dev->dev, "enabling permissive mode " ++ "configuration 
space accesses!\n"); ++ dev_warn(&psdev->dev->dev, ++ "permissive mode is potentially unsafe!\n"); ++ } ++release: ++ pcistub_device_put(psdev); ++out: ++ if (!err) ++ err = count; ++ return err; ++} ++ ++static ssize_t permissive_show(struct device_driver *drv, char *buf) ++{ ++ struct pcistub_device *psdev; ++ struct pciback_dev_data *dev_data; ++ size_t count = 0; ++ unsigned long flags; ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ list_for_each_entry(psdev, &pcistub_devices, dev_list) { ++ if (count >= PAGE_SIZE) ++ break; ++ if (!psdev->dev) ++ continue; ++ dev_data = pci_get_drvdata(psdev->dev); ++ if (!dev_data || !dev_data->permissive) ++ continue; ++ count += ++ scnprintf(buf + count, PAGE_SIZE - count, "%s\n", ++ pci_name(psdev->dev)); ++ } ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ return count; ++} ++ ++DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, permissive_add); ++ ++static void pcistub_exit(void) ++{ ++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_new_slot); ++ driver_remove_file(&pciback_pci_driver.driver, ++ &driver_attr_remove_slot); ++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_slots); ++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_quirks); ++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_permissive); ++ driver_remove_file(&pciback_pci_driver.driver, ++ &driver_attr_irq_handlers); ++ driver_remove_file(&pciback_pci_driver.driver, ++ &driver_attr_irq_handler_state); ++ pci_unregister_driver(&pciback_pci_driver); ++} ++ ++static int __init pcistub_init(void) ++{ ++ int pos = 0; ++ int err = 0; ++ int domain, bus, slot, func; ++ int parsed; ++ ++ if (pci_devs_to_hide && *pci_devs_to_hide) { ++ do { ++ parsed = 0; ++ ++ err = sscanf(pci_devs_to_hide + pos, ++ " (%x:%x:%x.%x) %n", ++ &domain, &bus, &slot, &func, &parsed); ++ if (err != 4) { ++ domain = 0; ++ err = sscanf(pci_devs_to_hide + pos, ++ " (%x:%x.%x) %n", ++ &bus, &slot, &func, &parsed); ++ if (err != 3) ++ goto parse_error; ++ } ++ ++ err = pcistub_device_id_add(domain, bus, slot, func); ++ if (err) ++ goto out; ++ ++ /* if parsed<=0, we've reached the end of the string */ ++ pos += parsed; ++ } while (parsed > 0 && pci_devs_to_hide[pos]); ++ } ++ ++ /* If we're the first PCI Device Driver to register, we're the ++ * first one to get offered PCI devices as they become ++ * available (and thus we can be the first to grab them) ++ */ ++ err = pci_register_driver(&pciback_pci_driver); ++ if (err < 0) ++ goto out; ++ ++ err = driver_create_file(&pciback_pci_driver.driver, ++ &driver_attr_new_slot); ++ if (!err) ++ err = driver_create_file(&pciback_pci_driver.driver, ++ &driver_attr_remove_slot); ++ if (!err) ++ err = driver_create_file(&pciback_pci_driver.driver, ++ &driver_attr_slots); ++ if (!err) ++ err = driver_create_file(&pciback_pci_driver.driver, ++ &driver_attr_quirks); ++ if (!err) ++ err = driver_create_file(&pciback_pci_driver.driver, ++ &driver_attr_permissive); ++ ++ if (!err) ++ err = driver_create_file(&pciback_pci_driver.driver, ++ &driver_attr_irq_handlers); ++ if (!err) ++ err = driver_create_file(&pciback_pci_driver.driver, ++ &driver_attr_irq_handler_state); ++ if (err) ++ pcistub_exit(); ++ ++out: ++ return err; ++ ++parse_error: ++ printk(KERN_ERR "pciback: Error parsing pci_devs_to_hide at \"%s\"\n", ++ pci_devs_to_hide + pos); ++ return -EINVAL; ++} ++ ++#ifndef MODULE ++/* ++ * fs_initcall happens before device_initcall ++ * so pciback *should* get called first (b/c we ++ * want to suck up 
any device before other drivers ++ * get a chance by being the first pci device ++ * driver to register) ++ */ ++fs_initcall(pcistub_init); ++#endif ++ ++static int __init pciback_init(void) ++{ ++ int err; ++ ++ if (!xen_initial_domain()) ++ return -ENODEV; ++ ++ err = pciback_config_init(); ++ if (err) ++ return err; ++ ++#ifdef MODULE ++ err = pcistub_init(); ++ if (err < 0) ++ return err; ++#endif ++ ++ pcistub_init_devices_late(); ++ err = pciback_xenbus_register(); ++ if (err) ++ pcistub_exit(); ++ ++ return err; ++} ++ ++static void __exit pciback_cleanup(void) ++{ ++ pciback_xenbus_unregister(); ++ pcistub_exit(); ++} ++ ++module_init(pciback_init); ++module_exit(pciback_cleanup); ++ ++MODULE_LICENSE("Dual BSD/GPL"); +diff --git a/drivers/xen/pciback/pciback.h b/drivers/xen/pciback/pciback.h +new file mode 100644 +index 0000000..fc31052 +--- /dev/null ++++ b/drivers/xen/pciback/pciback.h +@@ -0,0 +1,142 @@ ++/* ++ * PCI Backend Common Data Structures & Function Declarations ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++#ifndef __XEN_PCIBACK_H__ ++#define __XEN_PCIBACK_H__ ++ ++#include <linux/pci.h> ++#include <linux/interrupt.h> ++#include <xen/xenbus.h> ++#include <linux/list.h> ++#include <linux/spinlock.h> ++#include <linux/workqueue.h> ++#include <asm/atomic.h> ++#include <xen/interface/io/pciif.h> ++ ++struct pci_dev_entry { ++ struct list_head list; ++ struct pci_dev *dev; ++}; ++ ++#define _PDEVF_op_active (0) ++#define PDEVF_op_active (1<<(_PDEVF_op_active)) ++#define _PCIB_op_pending (1) ++#define PCIB_op_pending (1<<(_PCIB_op_pending)) ++ ++struct pciback_device { ++ void *pci_dev_data; ++ spinlock_t dev_lock; ++ ++ struct xenbus_device *xdev; ++ ++ struct xenbus_watch be_watch; ++ u8 be_watching; ++ ++ int evtchn_irq; ++ ++ struct xen_pci_sharedinfo *sh_info; ++ ++ unsigned long flags; ++ ++ struct work_struct op_work; ++}; ++ ++struct pciback_dev_data { ++ struct list_head config_fields; ++ unsigned int permissive : 1; ++ unsigned int warned_on_write : 1; ++ unsigned int enable_intx : 1; ++ unsigned int isr_on : 1; /* Whether the IRQ handler is installed. */ ++ unsigned int ack_intr : 1; /* .. 
and ACK-ing */
++	unsigned long handled;
++	unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */
++	char irq_name[0]; /* pciback[000:04:00.0] */
++};
++
++/* Used by XenBus and pciback_ops.c */
++extern wait_queue_head_t aer_wait_queue;
++extern struct workqueue_struct *pciback_wq;
++/* Used by pcistub.c and conf_space_quirks.c */
++extern struct list_head pciback_quirks;
++
++/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
++struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
++					    int domain, int bus,
++					    int slot, int func);
++struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
++				    struct pci_dev *dev);
++void pcistub_put_pci_dev(struct pci_dev *dev);
++
++/* Ensure a device is turned off or reset */
++void pciback_reset_device(struct pci_dev *pdev);
++
++/* Access a virtual configuration space for a PCI device */
++int pciback_config_init(void);
++int pciback_config_init_dev(struct pci_dev *dev);
++void pciback_config_free_dyn_fields(struct pci_dev *dev);
++void pciback_config_reset_dev(struct pci_dev *dev);
++void pciback_config_free_dev(struct pci_dev *dev);
++int pciback_config_read(struct pci_dev *dev, int offset, int size,
++			u32 *ret_val);
++int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value);
++
++/* Handle requests for specific devices from the frontend */
++typedef int (*publish_pci_dev_cb) (struct pciback_device *pdev,
++				   unsigned int domain, unsigned int bus,
++				   unsigned int devfn, unsigned int devid);
++typedef int (*publish_pci_root_cb) (struct pciback_device *pdev,
++				    unsigned int domain, unsigned int bus);
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++			int devid, publish_pci_dev_cb publish_cb);
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++				    unsigned int domain, unsigned int bus,
++				    unsigned int devfn);
++
++/**
++* Add for domain0 PCIE-AER handling. Get guest domain/bus/devfn in pciback
++* before sending aer request to pcifront, so that the guest can identify the
++* device and cooperate with pciback to finish the aer recovery job if the
++* device driver has the capability
++*/
++
++int pciback_get_pcifront_dev(struct pci_dev *pcidev,
++			     struct pciback_device *pdev,
++			     unsigned int *domain, unsigned int *bus,
++			     unsigned int *devfn);
++int pciback_init_devices(struct pciback_device *pdev);
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++			      publish_pci_root_cb cb);
++void pciback_release_devices(struct pciback_device *pdev);
++
++/* Handles events from front-end */
++irqreturn_t pciback_handle_event(int irq, void *dev_id);
++void pciback_do_op(struct work_struct *data);
++
++int pciback_xenbus_register(void);
++void pciback_xenbus_unregister(void);
++
++#ifdef CONFIG_PCI_MSI
++int pciback_enable_msi(struct pciback_device *pdev,
++		       struct pci_dev *dev, struct xen_pci_op *op);
++
++int pciback_disable_msi(struct pciback_device *pdev,
++			struct pci_dev *dev, struct xen_pci_op *op);
++
++
++int pciback_enable_msix(struct pciback_device *pdev,
++			struct pci_dev *dev, struct xen_pci_op *op);
++
++int pciback_disable_msix(struct pciback_device *pdev,
++			 struct pci_dev *dev, struct xen_pci_op *op);
++#endif
++extern int verbose_request;
++
++void test_and_schedule_op(struct pciback_device *pdev);
++
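The _PDEVF_*/_PCIB_* bits defined above (and the _XEN_PCIF_*/_XEN_PCIB_* bits from xen/interface/io/pciif.h) are manipulated only with the atomic bitops, so a "claim" needs no extra lock. For instance, a sketch of how the op worker is queued exactly once:

/* Illustrative: only the caller that flips _PDEVF_op_active from 0 to 1
 * queues the worker; concurrent callers see 1 and back off. */
static void demo_kick_op(struct pciback_device *pdev)
{
	if (!test_and_set_bit(_PDEVF_op_active, &pdev->flags))
		queue_work(pciback_wq, &pdev->op_work);
}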
++/* Handles shared IRQs that can go to the device domain and the control domain. */
++void pciback_irq_handler(struct pci_dev *dev, int reset);
++irqreturn_t pciback_guest_interrupt(int irq, void *dev_id);
++#endif /* __XEN_PCIBACK_H__ */
+diff --git a/drivers/xen/pciback/pciback_ops.c b/drivers/xen/pciback/pciback_ops.c
+new file mode 100644
+index 0000000..5543881
+--- /dev/null
++++ b/drivers/xen/pciback/pciback_ops.c
+@@ -0,0 +1,242 @@
++/*
++ * PCI Backend Operations - respond to PCI requests from Frontend
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/wait.h>
++#include <linux/bitops.h>
++#include <xen/events.h>
++#include <linux/sched.h>
++#include "pciback.h"
++
++int verbose_request;
++module_param(verbose_request, int, 0644);
++
++/* Ensure a device has the fake IRQ handler "turned on/off" and is
++ * ready to be exported. This MUST be run after pciback_reset_device
++ * which does the actual PCI device enable/disable.
++ */
++void pciback_control_isr(struct pci_dev *dev, int reset)
++{
++	struct pciback_dev_data *dev_data;
++	int rc;
++	int enable = 0;
++
++	dev_data = pci_get_drvdata(dev);
++	if (!dev_data)
++		return;
++
++	/* We don't deal with bridges */
++	if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
++		return;
++
++	if (reset) {
++		dev_data->enable_intx = 0;
++		dev_data->ack_intr = 0;
++	}
++	enable = dev_data->enable_intx;
++
++	/* Asked to disable, but ISR isn't running */
++	if (!enable && !dev_data->isr_on)
++		return;
++
++	/* Squirrel away the IRQs in the dev_data. We need this
++	 * b/c when device transitions to MSI, the dev->irq is
++	 * overwritten with the MSI vector.
++	 */
++	if (enable)
++		dev_data->irq = dev->irq;
++
++	dev_dbg(&dev->dev, "%s: #%d %s %s%s %s-> %s\n",
++		dev_data->irq_name,
++		dev_data->irq,
++		pci_is_enabled(dev) ? "on" : "off",
++		dev->msi_enabled ? "MSI" : "",
++		dev->msix_enabled ? "MSI/X" : "",
++		dev_data->isr_on ? "enable" : "disable",
++		enable ? "enable" : "disable");
++
++	if (enable) {
++		rc = request_irq(dev_data->irq,
++				pciback_guest_interrupt, IRQF_SHARED,
++				dev_data->irq_name, dev);
++		if (rc) {
++			dev_err(&dev->dev, "%s: failed to install fake IRQ " \
++				"handler for IRQ %d! (rc:%d)\n", dev_data->irq_name,
++				dev_data->irq, rc);
++			goto out;
++		}
++	}
++	else {
++		free_irq(dev_data->irq, dev);
++		dev_data->irq = 0;
++	}
++	dev_data->isr_on = enable;
++	dev_data->ack_intr = enable;
++out:
++	dev_dbg(&dev->dev, "%s: #%d %s %s%s %s\n",
++		dev_data->irq_name,
++		dev_data->irq,
++		pci_is_enabled(dev) ? "on" : "off",
++		dev->msi_enabled ? "MSI" : "",
++		dev->msix_enabled ? "MSI/X" : "",
++		enable ? (dev_data->isr_on ? "enabled" : "failed to enable") :
++			(dev_data->isr_on ? "failed to disable" : "disabled"));
++}
++
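pciback_control_isr() above boils down to a request_irq()/free_irq() toggle around a shared fake handler, remembering the vector because dev->irq is overwritten when a device switches to MSI. A stripped-down sketch (illustrative names, error handling elided):

static irqreturn_t demo_fake_isr(int irq, void *dev_id)
{
	return IRQ_HANDLED;	/* ack the line on the guest's behalf */
}

static int demo_isr_toggle(struct pci_dev *dev, int enable,
			   unsigned int *saved_irq)
{
	if (enable) {
		*saved_irq = dev->irq;	/* MSI will clobber dev->irq */
		return request_irq(*saved_irq, demo_fake_isr, IRQF_SHARED,
				   "demo[fake]", dev);
	}
	free_irq(*saved_irq, dev);
	*saved_irq = 0;
	return 0;
}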
++/* Ensure a device is "turned off" and ready to be exported.
++ * (Also see pciback_config_reset to ensure virtual configuration space is
++ * ready to be re-exported)
++ */
++void pciback_reset_device(struct pci_dev *dev)
++{
++	u16 cmd;
++
++	pciback_control_isr(dev, 1 /* reset device */);
++
++	/* Disable devices (but not bridges) */
++	if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
++#ifdef CONFIG_PCI_MSI
++		/* The guest could have been abruptly killed without
++		 * disabling MSI/MSI-X interrupts.*/
++		if (dev->msix_enabled)
++			pci_disable_msix(dev);
++		if (dev->msi_enabled)
++			pci_disable_msi(dev);
++#endif
++		pci_disable_device(dev);
++
++		pci_write_config_word(dev, PCI_COMMAND, 0);
++
++		dev->is_busmaster = 0;
++	} else {
++		pci_read_config_word(dev, PCI_COMMAND, &cmd);
++		if (cmd & (PCI_COMMAND_INVALIDATE)) {
++			cmd &= ~(PCI_COMMAND_INVALIDATE);
++			pci_write_config_word(dev, PCI_COMMAND, cmd);
++
++			dev->is_busmaster = 0;
++		}
++	}
++}
++/*
++* Now the same evtchn is used for both pcifront conf_read_write request
++* as well as pcie aer front end ack. We use a new work_queue to schedule
++* pciback conf_read_write service to avoid a conflict with aer_core's
++* do_recovery job, which also uses the system default work_queue
++*/
++void test_and_schedule_op(struct pciback_device *pdev)
++{
++	/* Check that frontend is requesting an operation and that we are not
++	 * already processing a request */
++	if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)
++	    && !test_and_set_bit(_PDEVF_op_active, &pdev->flags)) {
++		queue_work(pciback_wq, &pdev->op_work);
++	}
++	/*_XEN_PCIB_active should have been cleared by pcifront. And also make
++	sure pciback is waiting for ack by checking _PCIB_op_pending*/
++	if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags)
++	    && test_bit(_PCIB_op_pending, &pdev->flags)) {
++		wake_up(&aer_wait_queue);
++	}
++}
++
++/* Performing the configuration space reads/writes must not be done in atomic
++ * context because some of the pci_* functions can sleep (mostly due to ACPI
++ * use of semaphores).
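Because those handlers must be able to sleep, pciback runs them from its own workqueue (pciback_wq, created in xenbus.c) rather than the shared default queue that aer_core's do_recovery also uses, which rules out a deadlock between the two. A sketch of that setup, assuming a single-threaded queue is sufficient:

static struct workqueue_struct *demo_wq;	/* illustrative twin of pciback_wq */

static int demo_wq_setup(struct pciback_device *pdev)
{
	demo_wq = create_singlethread_workqueue("demo_pciback_wq");
	if (!demo_wq)
		return -ENOMEM;
	INIT_WORK(&pdev->op_work, pciback_do_op);
	queue_work(demo_wq, &pdev->op_work);	/* runs in process context */
	return 0;
}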
This function is intended to be called from a work ++ * queue in process context taking a struct pciback_device as a parameter */ ++ ++void pciback_do_op(struct work_struct *data) ++{ ++ struct pciback_device *pdev = ++ container_of(data, struct pciback_device, op_work); ++ struct pci_dev *dev; ++ struct pciback_dev_data *dev_data = NULL; ++ struct xen_pci_op *op = &pdev->sh_info->op; ++ int test_intx = 0; ++ ++ dev = pciback_get_pci_dev(pdev, op->domain, op->bus, op->devfn); ++ ++ if (dev == NULL) ++ op->err = XEN_PCI_ERR_dev_not_found; ++ else { ++ dev_data = pci_get_drvdata(dev); ++ if (dev_data) ++ test_intx = dev_data->enable_intx; ++ switch (op->cmd) { ++ case XEN_PCI_OP_conf_read: ++ op->err = pciback_config_read(dev, ++ op->offset, op->size, &op->value); ++ break; ++ case XEN_PCI_OP_conf_write: ++ op->err = pciback_config_write(dev, ++ op->offset, op->size, op->value); ++ break; ++#ifdef CONFIG_PCI_MSI ++ case XEN_PCI_OP_enable_msi: ++ op->err = pciback_enable_msi(pdev, dev, op); ++ break; ++ case XEN_PCI_OP_disable_msi: ++ op->err = pciback_disable_msi(pdev, dev, op); ++ break; ++ case XEN_PCI_OP_enable_msix: ++ op->err = pciback_enable_msix(pdev, dev, op); ++ break; ++ case XEN_PCI_OP_disable_msix: ++ op->err = pciback_disable_msix(pdev, dev, op); ++ break; ++#endif ++ default: ++ op->err = XEN_PCI_ERR_not_implemented; ++ break; ++ } ++ } ++ if (!op->err && dev && dev_data) { ++ /* Transition detected */ ++ if ((dev_data->enable_intx != test_intx)) ++ pciback_control_isr(dev, 0 /* no reset */); ++ } ++ /* Tell the driver domain that we're done. */ ++ wmb(); ++ clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags); ++ notify_remote_via_irq(pdev->evtchn_irq); ++ ++ /* Mark that we're done. */ ++ smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */ ++ clear_bit(_PDEVF_op_active, &pdev->flags); ++ smp_mb__after_clear_bit(); /* /before/ final check for work */ ++ ++ /* Check to see if the driver domain tried to start another request in ++ * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. ++ */ ++ test_and_schedule_op(pdev); ++} ++ ++irqreturn_t pciback_handle_event(int irq, void *dev_id) ++{ ++ struct pciback_device *pdev = dev_id; ++ ++ test_and_schedule_op(pdev); ++ ++ return IRQ_HANDLED; ++} ++irqreturn_t pciback_guest_interrupt(int irq, void *dev_id) ++{ ++ struct pci_dev *dev = (struct pci_dev *)dev_id; ++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev); ++ ++ if (dev_data->isr_on && dev_data->ack_intr) { ++ dev_data->handled++; ++ if ((dev_data->handled % 1000) == 0) { ++ if (xen_ignore_irq(irq)) { ++ printk(KERN_INFO "%s IRQ line is not shared " ++ "with other domains. Turning ISR off\n", ++ dev_data->irq_name); ++ dev_data->ack_intr = 0; ++ } ++ } ++ return IRQ_HANDLED; ++ } ++ return IRQ_NONE; ++} +diff --git a/drivers/xen/pciback/slot.c b/drivers/xen/pciback/slot.c +new file mode 100644 +index 0000000..efb922d +--- /dev/null ++++ b/drivers/xen/pciback/slot.c +@@ -0,0 +1,191 @@ ++/* ++ * PCI Backend - Provides a Virtual PCI bus (with real devices) ++ * to the frontend ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> (vpci.c) ++ * Author: Tristan Gingold <tristan.gingold@bull.net>, from vpci.c ++ */ ++ ++#include <linux/list.h> ++#include <linux/slab.h> ++#include <linux/pci.h> ++#include <linux/spinlock.h> ++#include "pciback.h" ++ ++/* There are at most 32 slots in a pci bus. 
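slot.c below maps frontend-visible addresses onto a fixed PCI_BUS_NBR x PCI_SLOT_MAX matrix and only ever exports function 0, so the devfn arithmetic is the whole story. For reference, the packing macros from <linux/pci.h> that it leans on:

#include <linux/pci.h>

static void demo_devfn(void)
{
	unsigned int devfn = PCI_DEVFN(3, 0);	/* slot 3, function 0 */

	/* PCI_SLOT()/PCI_FUNC() invert PCI_DEVFN(): */
	BUG_ON(PCI_SLOT(devfn) != 3);
	BUG_ON(PCI_FUNC(devfn) != 0);
}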
*/ ++#define PCI_SLOT_MAX 32 ++ ++#define PCI_BUS_NBR 2 ++ ++struct slot_dev_data { ++ /* Access to dev_list must be protected by lock */ ++ struct pci_dev *slots[PCI_BUS_NBR][PCI_SLOT_MAX]; ++ spinlock_t lock; ++}; ++ ++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev, ++ unsigned int domain, unsigned int bus, ++ unsigned int devfn) ++{ ++ struct pci_dev *dev = NULL; ++ struct slot_dev_data *slot_dev = pdev->pci_dev_data; ++ unsigned long flags; ++ ++ if (domain != 0 || PCI_FUNC(devfn) != 0) ++ return NULL; ++ ++ if (PCI_SLOT(devfn) >= PCI_SLOT_MAX || bus >= PCI_BUS_NBR) ++ return NULL; ++ ++ spin_lock_irqsave(&slot_dev->lock, flags); ++ dev = slot_dev->slots[bus][PCI_SLOT(devfn)]; ++ spin_unlock_irqrestore(&slot_dev->lock, flags); ++ ++ return dev; ++} ++ ++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev, ++ int devid, publish_pci_dev_cb publish_cb) ++{ ++ int err = 0, slot, bus; ++ struct slot_dev_data *slot_dev = pdev->pci_dev_data; ++ unsigned long flags; ++ ++ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) { ++ err = -EFAULT; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Can't export bridges on the virtual PCI bus"); ++ goto out; ++ } ++ ++ spin_lock_irqsave(&slot_dev->lock, flags); ++ ++ /* Assign to a new slot on the virtual PCI bus */ ++ for (bus = 0; bus < PCI_BUS_NBR; bus++) ++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { ++ if (slot_dev->slots[bus][slot] == NULL) { ++ printk(KERN_INFO ++ "pciback: slot: %s: assign to virtual " ++ "slot %d, bus %d\n", ++ pci_name(dev), slot, bus); ++ slot_dev->slots[bus][slot] = dev; ++ goto unlock; ++ } ++ } ++ ++ err = -ENOMEM; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "No more space on root virtual PCI bus"); ++ ++unlock: ++ spin_unlock_irqrestore(&slot_dev->lock, flags); ++ ++ /* Publish this device. 
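Once a slot is claimed, the backend reports the chosen virtual address through the publish_pci_dev_cb it was handed (the real callback is pciback_publish_pci_dev() in xenbus.c further down, which writes a vdev-N xenstore node). A toy callback to make the contract concrete (illustrative only):

static int demo_publish(struct pciback_device *pdev,
			unsigned int domain, unsigned int bus,
			unsigned int devfn, unsigned int devid)
{
	dev_dbg(&pdev->xdev->dev, "vdev-%u -> %04x:%02x:%02x.%01x\n",
		devid, domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
	return 0;
}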
*/
++	if (!err)
++		err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, 0), devid);
++
++out:
++	return err;
++}
++
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
++{
++	int slot, bus;
++	struct slot_dev_data *slot_dev = pdev->pci_dev_data;
++	struct pci_dev *found_dev = NULL;
++	unsigned long flags;
++
++	spin_lock_irqsave(&slot_dev->lock, flags);
++
++	for (bus = 0; bus < PCI_BUS_NBR; bus++)
++		for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++			if (slot_dev->slots[bus][slot] == dev) {
++				slot_dev->slots[bus][slot] = NULL;
++				found_dev = dev;
++				goto out;
++			}
++		}
++
++out:
++	spin_unlock_irqrestore(&slot_dev->lock, flags);
++
++	if (found_dev)
++		pcistub_put_pci_dev(found_dev);
++}
++
++int pciback_init_devices(struct pciback_device *pdev)
++{
++	int slot, bus;
++	struct slot_dev_data *slot_dev;
++
++	slot_dev = kmalloc(sizeof(*slot_dev), GFP_KERNEL);
++	if (!slot_dev)
++		return -ENOMEM;
++
++	spin_lock_init(&slot_dev->lock);
++
++	for (bus = 0; bus < PCI_BUS_NBR; bus++)
++		for (slot = 0; slot < PCI_SLOT_MAX; slot++)
++			slot_dev->slots[bus][slot] = NULL;
++
++	pdev->pci_dev_data = slot_dev;
++
++	return 0;
++}
++
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++			      publish_pci_root_cb publish_cb)
++{
++	/* The Virtual PCI bus has only one root */
++	return publish_cb(pdev, 0, 0);
++}
++
++void pciback_release_devices(struct pciback_device *pdev)
++{
++	int slot, bus;
++	struct slot_dev_data *slot_dev = pdev->pci_dev_data;
++	struct pci_dev *dev;
++
++	for (bus = 0; bus < PCI_BUS_NBR; bus++)
++		for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++			dev = slot_dev->slots[bus][slot];
++			if (dev != NULL)
++				pcistub_put_pci_dev(dev);
++		}
++
++	kfree(slot_dev);
++	pdev->pci_dev_data = NULL;
++}
++
++int pciback_get_pcifront_dev(struct pci_dev *pcidev,
++			     struct pciback_device *pdev,
++			     unsigned int *domain, unsigned int *bus,
++			     unsigned int *devfn)
++{
++	int slot, busnr;
++	struct slot_dev_data *slot_dev = pdev->pci_dev_data;
++	struct pci_dev *dev;
++	int found = 0;
++	unsigned long flags;
++
++	spin_lock_irqsave(&slot_dev->lock, flags);
++
++	for (busnr = 0; busnr < PCI_BUS_NBR; busnr++)
++		for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++			dev = slot_dev->slots[busnr][slot];
++			if (dev && dev->bus->number == pcidev->bus->number
++			    && dev->devfn == pcidev->devfn
++			    && pci_domain_nr(dev->bus) ==
++					pci_domain_nr(pcidev->bus)) {
++				found = 1;
++				*domain = 0;
++				*bus = busnr;
++				*devfn = PCI_DEVFN(slot, 0);
++				goto out;
++			}
++		}
++out:
++	spin_unlock_irqrestore(&slot_dev->lock, flags);
++	return found;
++
++}
+diff --git a/drivers/xen/pciback/vpci.c b/drivers/xen/pciback/vpci.c
+new file mode 100644
+index 0000000..2857ab8
+--- /dev/null
++++ b/drivers/xen/pciback/vpci.c
+@@ -0,0 +1,244 @@
++/*
++ * PCI Backend - Provides a Virtual PCI bus (with real devices)
++ * to the frontend
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#include <linux/list.h>
++#include <linux/slab.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++#include "pciback.h"
++
++#define PCI_SLOT_MAX 32
++
++struct vpci_dev_data {
++	/* Access to dev_list must be protected by lock */
++	struct list_head dev_list[PCI_SLOT_MAX];
++	spinlock_t lock;
++};
++
++static inline struct list_head *list_first(struct list_head *head)
++{
++	return head->next;
++}
++
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++				    unsigned int domain, unsigned int bus,
++				    unsigned int devfn)
++{
++	struct pci_dev_entry *entry;
++	struct pci_dev *dev = NULL;
++	struct vpci_dev_data 
*vpci_dev = pdev->pci_dev_data; ++ unsigned long flags; ++ ++ if (domain != 0 || bus != 0) ++ return NULL; ++ ++ if (PCI_SLOT(devfn) < PCI_SLOT_MAX) { ++ spin_lock_irqsave(&vpci_dev->lock, flags); ++ ++ list_for_each_entry(entry, ++ &vpci_dev->dev_list[PCI_SLOT(devfn)], ++ list) { ++ if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) { ++ dev = entry->dev; ++ break; ++ } ++ } ++ ++ spin_unlock_irqrestore(&vpci_dev->lock, flags); ++ } ++ return dev; ++} ++ ++static inline int match_slot(struct pci_dev *l, struct pci_dev *r) ++{ ++ if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus) ++ && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn)) ++ return 1; ++ ++ return 0; ++} ++ ++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev, ++ int devid, publish_pci_dev_cb publish_cb) ++{ ++ int err = 0, slot, func = -1; ++ struct pci_dev_entry *t, *dev_entry; ++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; ++ unsigned long flags; ++ ++ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) { ++ err = -EFAULT; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Can't export bridges on the virtual PCI bus"); ++ goto out; ++ } ++ ++ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL); ++ if (!dev_entry) { ++ err = -ENOMEM; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error adding entry to virtual PCI bus"); ++ goto out; ++ } ++ ++ dev_entry->dev = dev; ++ ++ spin_lock_irqsave(&vpci_dev->lock, flags); ++ ++ /* Keep multi-function devices together on the virtual PCI bus */ ++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { ++ if (!list_empty(&vpci_dev->dev_list[slot])) { ++ t = list_entry(list_first(&vpci_dev->dev_list[slot]), ++ struct pci_dev_entry, list); ++ ++ if (match_slot(dev, t->dev)) { ++ pr_info("pciback: vpci: %s: " ++ "assign to virtual slot %d func %d\n", ++ pci_name(dev), slot, ++ PCI_FUNC(dev->devfn)); ++ list_add_tail(&dev_entry->list, ++ &vpci_dev->dev_list[slot]); ++ func = PCI_FUNC(dev->devfn); ++ goto unlock; ++ } ++ } ++ } ++ ++ /* Assign to a new slot on the virtual PCI bus */ ++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { ++ if (list_empty(&vpci_dev->dev_list[slot])) { ++ printk(KERN_INFO ++ "pciback: vpci: %s: assign to virtual slot %d\n", ++ pci_name(dev), slot); ++ list_add_tail(&dev_entry->list, ++ &vpci_dev->dev_list[slot]); ++ func = PCI_FUNC(dev->devfn); ++ goto unlock; ++ } ++ } ++ ++ err = -ENOMEM; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "No more space on root virtual PCI bus"); ++ ++unlock: ++ spin_unlock_irqrestore(&vpci_dev->lock, flags); ++ ++ /* Publish this device. 
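Note what the slot grouping above buys: a multi-function device keeps its real function number in the exported devfn, only the slot is virtualized, so a guest driver that expects sibling functions still finds them together. The mapping in one illustrative helper:

static unsigned int demo_virtual_devfn(struct pci_dev *dev, int vslot)
{
	/* virtual slot, physical function number */
	return PCI_DEVFN(vslot, PCI_FUNC(dev->devfn));
}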
*/ ++ if (!err) ++ err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid); ++ ++out: ++ return err; ++} ++ ++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev) ++{ ++ int slot; ++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; ++ struct pci_dev *found_dev = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&vpci_dev->lock, flags); ++ ++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { ++ struct pci_dev_entry *e, *tmp; ++ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot], ++ list) { ++ if (e->dev == dev) { ++ list_del(&e->list); ++ found_dev = e->dev; ++ kfree(e); ++ goto out; ++ } ++ } ++ } ++ ++out: ++ spin_unlock_irqrestore(&vpci_dev->lock, flags); ++ ++ if (found_dev) ++ pcistub_put_pci_dev(found_dev); ++} ++ ++int pciback_init_devices(struct pciback_device *pdev) ++{ ++ int slot; ++ struct vpci_dev_data *vpci_dev; ++ ++ vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL); ++ if (!vpci_dev) ++ return -ENOMEM; ++ ++ spin_lock_init(&vpci_dev->lock); ++ ++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) ++ INIT_LIST_HEAD(&vpci_dev->dev_list[slot]); ++ ++ pdev->pci_dev_data = vpci_dev; ++ ++ return 0; ++} ++ ++int pciback_publish_pci_roots(struct pciback_device *pdev, ++ publish_pci_root_cb publish_cb) ++{ ++ /* The Virtual PCI bus has only one root */ ++ return publish_cb(pdev, 0, 0); ++} ++ ++void pciback_release_devices(struct pciback_device *pdev) ++{ ++ int slot; ++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; ++ ++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { ++ struct pci_dev_entry *e, *tmp; ++ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot], ++ list) { ++ list_del(&e->list); ++ pcistub_put_pci_dev(e->dev); ++ kfree(e); ++ } ++ } ++ ++ kfree(vpci_dev); ++ pdev->pci_dev_data = NULL; ++} ++ ++int pciback_get_pcifront_dev(struct pci_dev *pcidev, ++ struct pciback_device *pdev, ++ unsigned int *domain, unsigned int *bus, ++ unsigned int *devfn) ++{ ++ struct pci_dev_entry *entry; ++ struct pci_dev *dev = NULL; ++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; ++ unsigned long flags; ++ int found = 0, slot; ++ ++ spin_lock_irqsave(&vpci_dev->lock, flags); ++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { ++ list_for_each_entry(entry, ++ &vpci_dev->dev_list[slot], ++ list) { ++ dev = entry->dev; ++ if (dev && dev->bus->number == pcidev->bus->number ++ && pci_domain_nr(dev->bus) == ++ pci_domain_nr(pcidev->bus) ++ && dev->devfn == pcidev->devfn) { ++ found = 1; ++ *domain = 0; ++ *bus = 0; ++ *devfn = PCI_DEVFN(slot, ++ PCI_FUNC(pcidev->devfn)); ++ } ++ } ++ } ++ spin_unlock_irqrestore(&vpci_dev->lock, flags); ++ return found; ++} +diff --git a/drivers/xen/pciback/xenbus.c b/drivers/xen/pciback/xenbus.c +new file mode 100644 +index 0000000..f0d5426 +--- /dev/null ++++ b/drivers/xen/pciback/xenbus.c +@@ -0,0 +1,730 @@ ++/* ++ * PCI Backend Xenbus Setup - handles setup with frontend and xend ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/list.h> ++#include <linux/vmalloc.h> ++#include <linux/workqueue.h> ++#include <xen/xenbus.h> ++#include <xen/events.h> ++#include <asm/xen/pci.h> ++#include <linux/workqueue.h> ++#include "pciback.h" ++ ++#define INVALID_EVTCHN_IRQ (-1) ++struct workqueue_struct *pciback_wq; ++ ++static struct pciback_device *alloc_pdev(struct xenbus_device *xdev) ++{ ++ struct pciback_device *pdev; ++ ++ pdev = kzalloc(sizeof(struct pciback_device), GFP_KERNEL); ++ if (pdev == NULL) ++ goto out; ++ dev_dbg(&xdev->dev, 
"allocated pdev @ 0x%p\n", pdev); ++ ++ pdev->xdev = xdev; ++ dev_set_drvdata(&xdev->dev, pdev); ++ ++ spin_lock_init(&pdev->dev_lock); ++ ++ pdev->sh_info = NULL; ++ pdev->evtchn_irq = INVALID_EVTCHN_IRQ; ++ pdev->be_watching = 0; ++ ++ INIT_WORK(&pdev->op_work, pciback_do_op); ++ ++ if (pciback_init_devices(pdev)) { ++ kfree(pdev); ++ pdev = NULL; ++ } ++out: ++ return pdev; ++} ++ ++static void pciback_disconnect(struct pciback_device *pdev) ++{ ++ spin_lock(&pdev->dev_lock); ++ ++ /* Ensure the guest can't trigger our handler before removing devices */ ++ if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) { ++ unbind_from_irqhandler(pdev->evtchn_irq, pdev); ++ pdev->evtchn_irq = INVALID_EVTCHN_IRQ; ++ } ++ spin_unlock(&pdev->dev_lock); ++ ++ /* If the driver domain started an op, make sure we complete it ++ * before releasing the shared memory */ ++ ++ /* Note, the workqueue does not use spinlocks at all.*/ ++ flush_workqueue(pciback_wq); ++ ++ spin_lock(&pdev->dev_lock); ++ if (pdev->sh_info != NULL) { ++ xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info); ++ pdev->sh_info = NULL; ++ } ++ spin_unlock(&pdev->dev_lock); ++ ++} ++ ++static void free_pdev(struct pciback_device *pdev) ++{ ++ spin_lock(&pdev->dev_lock); ++ if (pdev->be_watching) { ++ unregister_xenbus_watch(&pdev->be_watch); ++ pdev->be_watching = 0; ++ } ++ spin_unlock(&pdev->dev_lock); ++ ++ pciback_disconnect(pdev); ++ ++ pciback_release_devices(pdev); ++ ++ dev_set_drvdata(&pdev->xdev->dev, NULL); ++ pdev->xdev = NULL; ++ ++ kfree(pdev); ++} ++ ++static int pciback_do_attach(struct pciback_device *pdev, int gnt_ref, ++ int remote_evtchn) ++{ ++ int err = 0; ++ void *vaddr; ++ ++ dev_dbg(&pdev->xdev->dev, ++ "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n", ++ gnt_ref, remote_evtchn); ++ ++ err = xenbus_map_ring_valloc(pdev->xdev, gnt_ref, &vaddr); ++ if (err < 0) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error mapping other domain page in ours."); ++ goto out; ++ } ++ ++ spin_lock(&pdev->dev_lock); ++ pdev->sh_info = vaddr; ++ spin_unlock(&pdev->dev_lock); ++ ++ err = bind_interdomain_evtchn_to_irqhandler( ++ pdev->xdev->otherend_id, remote_evtchn, pciback_handle_event, ++ 0, "pciback", pdev); ++ if (err < 0) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error binding event channel to IRQ"); ++ goto out; ++ } ++ ++ spin_lock(&pdev->dev_lock); ++ pdev->evtchn_irq = err; ++ spin_unlock(&pdev->dev_lock); ++ err = 0; ++ ++ dev_dbg(&pdev->xdev->dev, "Attached!\n"); ++out: ++ return err; ++} ++ ++static int pciback_attach(struct pciback_device *pdev) ++{ ++ int err = 0; ++ int gnt_ref, remote_evtchn; ++ char *magic = NULL; ++ ++ ++ /* Make sure we only do this setup once */ ++ if (xenbus_read_driver_state(pdev->xdev->nodename) != ++ XenbusStateInitialised) ++ goto out; ++ ++ /* Wait for frontend to state that it has published the configuration */ ++ if (xenbus_read_driver_state(pdev->xdev->otherend) != ++ XenbusStateInitialised) ++ goto out; ++ ++ dev_dbg(&pdev->xdev->dev, "Reading frontend config\n"); ++ ++ err = xenbus_gather(XBT_NIL, pdev->xdev->otherend, ++ "pci-op-ref", "%u", &gnt_ref, ++ "event-channel", "%u", &remote_evtchn, ++ "magic", NULL, &magic, NULL); ++ if (err) { ++ /* If configuration didn't get read correctly, wait longer */ ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error reading configuration from frontend"); ++ goto out; ++ } ++ ++ if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) { ++ xenbus_dev_fatal(pdev->xdev, -EFAULT, ++ "version mismatch (%s/%s) with pcifront - " ++ "halting pciback", ++ magic, 
XEN_PCI_MAGIC); ++ goto out; ++ } ++ ++ err = pciback_do_attach(pdev, gnt_ref, remote_evtchn); ++ if (err) ++ goto out; ++ ++ dev_dbg(&pdev->xdev->dev, "Connecting...\n"); ++ ++ err = xenbus_switch_state(pdev->xdev, XenbusStateConnected); ++ if (err) ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error switching to connected state!"); ++ ++ dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err); ++out: ++ ++ kfree(magic); ++ ++ return err; ++} ++ ++static int pciback_publish_pci_dev(struct pciback_device *pdev, ++ unsigned int domain, unsigned int bus, ++ unsigned int devfn, unsigned int devid) ++{ ++ int err; ++ int len; ++ char str[64]; ++ ++ len = snprintf(str, sizeof(str), "vdev-%d", devid); ++ if (unlikely(len >= (sizeof(str) - 1))) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str, ++ "%04x:%02x:%02x.%02x", domain, bus, ++ PCI_SLOT(devfn), PCI_FUNC(devfn)); ++ ++out: ++ return err; ++} ++ ++static int pciback_export_device(struct pciback_device *pdev, ++ int domain, int bus, int slot, int func, ++ int devid) ++{ ++ struct pci_dev *dev; ++ int err = 0; ++ ++ dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n", ++ domain, bus, slot, func); ++ ++ dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func); ++ if (!dev) { ++ err = -EINVAL; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Couldn't locate PCI device " ++ "(%04x:%02x:%02x.%01x)! " ++ "perhaps already in-use?", ++ domain, bus, slot, func); ++ goto out; ++ } ++ ++ err = pciback_add_pci_dev(pdev, dev, devid, pciback_publish_pci_dev); ++ if (err) ++ goto out; ++ ++ dev_dbg(&dev->dev, "registering for %d\n", pdev->xdev->otherend_id); ++ if (xen_register_device_domain_owner(dev, ++ pdev->xdev->otherend_id) != 0) { ++ dev_err(&dev->dev, "device has been assigned to another " \ ++ "domain! Over-writing the ownership, but beware.\n"); ++ xen_unregister_device_domain_owner(dev); ++ xen_register_device_domain_owner(dev, pdev->xdev->otherend_id); ++ } ++ ++ /* TODO: It'd be nice to export a bridge and have all of its children ++ * get exported with it. This may be best done in xend (which will ++ * have to calculate resource usage anyway) but we probably want to ++ * put something in here to ensure that if a bridge gets given to a ++ * driver domain, that all devices under that bridge are not given ++ * to other driver domains (as he who controls the bridge can disable ++ * it and stop the other devices from working). ++ */ ++out: ++ return err; ++} ++ ++static int pciback_remove_device(struct pciback_device *pdev, ++ int domain, int bus, int slot, int func) ++{ ++ int err = 0; ++ struct pci_dev *dev; ++ ++ dev_dbg(&pdev->xdev->dev, "removing dom %x bus %x slot %x func %x\n", ++ domain, bus, slot, func); ++ ++ dev = pciback_get_pci_dev(pdev, domain, bus, PCI_DEVFN(slot, func)); ++ if (!dev) { ++ err = -EINVAL; ++ dev_dbg(&pdev->xdev->dev, "Couldn't locate PCI device " ++ "(%04x:%02x:%02x.%01x)! 
not owned by this domain\n", ++ domain, bus, slot, func); ++ goto out; ++ } ++ ++ dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id); ++ xen_unregister_device_domain_owner(dev); ++ ++ pciback_release_pci_dev(pdev, dev); ++ ++out: ++ return err; ++} ++ ++static int pciback_publish_pci_root(struct pciback_device *pdev, ++ unsigned int domain, unsigned int bus) ++{ ++ unsigned int d, b; ++ int i, root_num, len, err; ++ char str[64]; ++ ++ dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n"); ++ ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, ++ "root_num", "%d", &root_num); ++ if (err == 0 || err == -ENOENT) ++ root_num = 0; ++ else if (err < 0) ++ goto out; ++ ++ /* Verify that we haven't already published this pci root */ ++ for (i = 0; i < root_num; i++) { ++ len = snprintf(str, sizeof(str), "root-%d", i); ++ if (unlikely(len >= (sizeof(str) - 1))) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, ++ str, "%x:%x", &d, &b); ++ if (err < 0) ++ goto out; ++ if (err != 2) { ++ err = -EINVAL; ++ goto out; ++ } ++ ++ if (d == domain && b == bus) { ++ err = 0; ++ goto out; ++ } ++ } ++ ++ len = snprintf(str, sizeof(str), "root-%d", root_num); ++ if (unlikely(len >= (sizeof(str) - 1))) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n", ++ root_num, domain, bus); ++ ++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str, ++ "%04x:%02x", domain, bus); ++ if (err) ++ goto out; ++ ++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, ++ "root_num", "%d", (root_num + 1)); ++ ++out: ++ return err; ++} ++ ++static int pciback_reconfigure(struct pciback_device *pdev) ++{ ++ int err = 0; ++ int num_devs; ++ int domain, bus, slot, func; ++ int substate; ++ int i, len; ++ char state_str[64]; ++ char dev_str[64]; ++ ++ ++ dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n"); ++ ++ /* Make sure we only reconfigure once */ ++ if (xenbus_read_driver_state(pdev->xdev->nodename) != ++ XenbusStateReconfiguring) ++ goto out; ++ ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d", ++ &num_devs); ++ if (err != 1) { ++ if (err >= 0) ++ err = -EINVAL; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error reading number of devices"); ++ goto out; ++ } ++ ++ for (i = 0; i < num_devs; i++) { ++ len = snprintf(state_str, sizeof(state_str), "state-%d", i); ++ if (unlikely(len >= (sizeof(state_str) - 1))) { ++ err = -ENOMEM; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "String overflow while reading " ++ "configuration"); ++ goto out; ++ } ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, state_str, ++ "%d", &substate); ++ if (err != 1) ++ substate = XenbusStateUnknown; ++ ++ switch (substate) { ++ case XenbusStateInitialising: ++ dev_dbg(&pdev->xdev->dev, "Attaching dev-%d ...\n", i); ++ ++ len = snprintf(dev_str, sizeof(dev_str), "dev-%d", i); ++ if (unlikely(len >= (sizeof(dev_str) - 1))) { ++ err = -ENOMEM; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "String overflow while " ++ "reading configuration"); ++ goto out; ++ } ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, ++ dev_str, "%x:%x:%x.%x", ++ &domain, &bus, &slot, &func); ++ if (err < 0) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error reading device " ++ "configuration"); ++ goto out; ++ } ++ if (err != 4) { ++ err = -EINVAL; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error parsing pci device " ++ "configuration"); ++ goto out; ++ } ++ ++ err = pciback_export_device(pdev, domain, bus, slot, ++ func, i); ++ if (err) ++ goto out; ++ ++ 
/* Publish pci roots. */ ++ err = pciback_publish_pci_roots(pdev, ++ pciback_publish_pci_root); ++ if (err) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error while publishing PCI root " ++ "buses for frontend"); ++ goto out; ++ } ++ ++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, ++ state_str, "%d", ++ XenbusStateInitialised); ++ if (err) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error switching substate of " ++ "dev-%d\n", i); ++ goto out; ++ } ++ break; ++ ++ case XenbusStateClosing: ++ dev_dbg(&pdev->xdev->dev, "Detaching dev-%d ...\n", i); ++ ++ len = snprintf(dev_str, sizeof(dev_str), "vdev-%d", i); ++ if (unlikely(len >= (sizeof(dev_str) - 1))) { ++ err = -ENOMEM; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "String overflow while " ++ "reading configuration"); ++ goto out; ++ } ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, ++ dev_str, "%x:%x:%x.%x", ++ &domain, &bus, &slot, &func); ++ if (err < 0) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error reading device " ++ "configuration"); ++ goto out; ++ } ++ if (err != 4) { ++ err = -EINVAL; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error parsing pci device " ++ "configuration"); ++ goto out; ++ } ++ ++ err = pciback_remove_device(pdev, domain, bus, slot, ++ func); ++ if (err) ++ goto out; ++ ++ /* TODO: If at some point we implement support for pci ++ * root hot-remove on pcifront side, we'll need to ++ * remove unnecessary xenstore nodes of pci roots here. ++ */ ++ ++ break; ++ ++ default: ++ break; ++ } ++ } ++ ++ err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured); ++ if (err) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error switching to reconfigured state!"); ++ goto out; ++ } ++ ++out: ++ return 0; ++} ++ ++static void pciback_frontend_changed(struct xenbus_device *xdev, ++ enum xenbus_state fe_state) ++{ ++ struct pciback_device *pdev = dev_get_drvdata(&xdev->dev); ++ ++ dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state); ++ ++ switch (fe_state) { ++ case XenbusStateInitialised: ++ pciback_attach(pdev); ++ break; ++ ++ case XenbusStateReconfiguring: ++ pciback_reconfigure(pdev); ++ break; ++ ++ case XenbusStateConnected: ++ /* pcifront switched its state from reconfiguring to connected. ++ * Then switch to connected state. ++ */ ++ xenbus_switch_state(xdev, XenbusStateConnected); ++ break; ++ ++ case XenbusStateClosing: ++ pciback_disconnect(pdev); ++ xenbus_switch_state(xdev, XenbusStateClosing); ++ break; ++ ++ case XenbusStateClosed: ++ pciback_disconnect(pdev); ++ xenbus_switch_state(xdev, XenbusStateClosed); ++ if (xenbus_dev_is_online(xdev)) ++ break; ++ /* fall through if not online */ ++ case XenbusStateUnknown: ++ dev_dbg(&xdev->dev, "frontend is gone! unregister device\n"); ++ device_unregister(&xdev->dev); ++ break; ++ ++ default: ++ break; ++ } ++} ++ ++static int pciback_setup_backend(struct pciback_device *pdev) ++{ ++ /* Get configuration from xend (if available now) */ ++ int domain, bus, slot, func; ++ int err = 0; ++ int i, num_devs; ++ char dev_str[64]; ++ char state_str[64]; ++ ++ /* It's possible we could get the call to setup twice, so make sure ++ * we're not already connected. 
++ */ ++ if (xenbus_read_driver_state(pdev->xdev->nodename) != ++ XenbusStateInitWait) ++ goto out; ++ ++ dev_dbg(&pdev->xdev->dev, "getting be setup\n"); ++ ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d", ++ &num_devs); ++ if (err != 1) { ++ if (err >= 0) ++ err = -EINVAL; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error reading number of devices"); ++ goto out; ++ } ++ ++ for (i = 0; i < num_devs; i++) { ++ int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i); ++ if (unlikely(l >= (sizeof(dev_str) - 1))) { ++ err = -ENOMEM; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "String overflow while reading " ++ "configuration"); ++ goto out; ++ } ++ ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str, ++ "%x:%x:%x.%x", &domain, &bus, &slot, &func); ++ if (err < 0) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error reading device configuration"); ++ goto out; ++ } ++ if (err != 4) { ++ err = -EINVAL; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error parsing pci device " ++ "configuration"); ++ goto out; ++ } ++ ++ err = pciback_export_device(pdev, domain, bus, slot, func, i); ++ if (err) ++ goto out; ++ ++ /* Switch substate of this device. */ ++ l = snprintf(state_str, sizeof(state_str), "state-%d", i); ++ if (unlikely(l >= (sizeof(state_str) - 1))) { ++ err = -ENOMEM; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "String overflow while reading " ++ "configuration"); ++ goto out; ++ } ++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, state_str, ++ "%d", XenbusStateInitialised); ++ if (err) { ++ xenbus_dev_fatal(pdev->xdev, err, "Error switching " ++ "substate of dev-%d\n", i); ++ goto out; ++ } ++ } ++ ++ err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root); ++ if (err) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error while publishing PCI root buses " ++ "for frontend"); ++ goto out; ++ } ++ ++ err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised); ++ if (err) ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error switching to initialised state!"); ++ ++out: ++ if (!err) ++ /* see if pcifront is already configured (if not, we'll wait) */ ++ pciback_attach(pdev); ++ ++ return err; ++} ++ ++static void pciback_be_watch(struct xenbus_watch *watch, ++ const char **vec, unsigned int len) ++{ ++ struct pciback_device *pdev = ++ container_of(watch, struct pciback_device, be_watch); ++ ++ switch (xenbus_read_driver_state(pdev->xdev->nodename)) { ++ case XenbusStateInitWait: ++ pciback_setup_backend(pdev); ++ break; ++ ++ default: ++ break; ++ } ++} ++ ++static int pciback_xenbus_probe(struct xenbus_device *dev, ++ const struct xenbus_device_id *id) ++{ ++ int err = 0; ++ struct pciback_device *pdev = alloc_pdev(dev); ++ ++ if (pdev == NULL) { ++ err = -ENOMEM; ++ xenbus_dev_fatal(dev, err, ++ "Error allocating pciback_device struct"); ++ goto out; ++ } ++ ++ /* wait for xend to configure us */ ++ err = xenbus_switch_state(dev, XenbusStateInitWait); ++ if (err) ++ goto out; ++ ++ /* watch the backend node for backend configuration information */ ++ err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch, ++ pciback_be_watch); ++ if (err) ++ goto out; ++ ++ spin_lock(&pdev->dev_lock); ++ pdev->be_watching = 1; ++ spin_unlock(&pdev->dev_lock); ++ ++ /* We need to force a call to our callback here in case ++ * xend already configured us! 
++ */ ++ pciback_be_watch(&pdev->be_watch, NULL, 0); ++ ++out: ++ return err; ++} ++ ++static int pciback_xenbus_remove(struct xenbus_device *dev) ++{ ++ struct pciback_device *pdev = dev_get_drvdata(&dev->dev); ++ ++ if (pdev != NULL) ++ free_pdev(pdev); ++ ++ return 0; ++} ++ ++static const struct xenbus_device_id xenpci_ids[] = { ++ {"pci"}, ++ {""}, ++}; ++ ++static struct xenbus_driver xenbus_pciback_driver = { ++ .name = "pciback", ++ .owner = THIS_MODULE, ++ .ids = xenpci_ids, ++ .probe = pciback_xenbus_probe, ++ .remove = pciback_xenbus_remove, ++ .otherend_changed = pciback_frontend_changed, ++}; ++ ++int __init pciback_xenbus_register(void) ++{ ++ pciback_wq = create_workqueue("pciback_workqueue"); ++ if (!pciback_wq) { ++ printk(KERN_ERR "%s: create " ++ "pciback_workqueue failed\n", __FUNCTION__); ++ return -EFAULT; ++ } ++ return xenbus_register_backend(&xenbus_pciback_driver); ++} ++ ++void __exit pciback_xenbus_unregister(void) ++{ ++ destroy_workqueue(pciback_wq); ++ xenbus_unregister_driver(&xenbus_pciback_driver); ++} +diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c +new file mode 100644 +index 0000000..6d1a770 +--- /dev/null ++++ b/drivers/xen/pcpu.c +@@ -0,0 +1,452 @@ ++/* ++ * pcpu.c - management of physical cpus in the dom0 environment ++ */ ++#include <linux/interrupt.h> ++#include <linux/spinlock.h> ++#include <asm/xen/hypervisor.h> ++#include <asm/xen/hypercall.h> ++#include <linux/cpu.h> ++#include <xen/xenbus.h> ++#include <xen/pcpu.h> ++#include <xen/events.h> ++#include <xen/acpi.h> ++ ++static struct sysdev_class xen_pcpu_sysdev_class = { ++ .name = "xen_pcpu", ++}; ++ ++static DEFINE_MUTEX(xen_pcpu_lock); ++static RAW_NOTIFIER_HEAD(xen_pcpu_chain); ++ ++/* No need for irq disable since hotplug notify is in workqueue context */ ++#define get_pcpu_lock() mutex_lock(&xen_pcpu_lock); ++#define put_pcpu_lock() mutex_unlock(&xen_pcpu_lock); ++ ++struct xen_pcpus { ++ struct list_head list; ++ int present; ++}; ++static struct xen_pcpus xen_pcpus; ++ ++int register_xen_pcpu_notifier(struct notifier_block *nb) ++{ ++ int ret; ++ ++ /* All references to the notifier chain are protected by the pcpu_lock */ ++ get_pcpu_lock(); ++ ret = raw_notifier_chain_register(&xen_pcpu_chain, nb); ++ put_pcpu_lock(); ++ return ret; ++} ++EXPORT_SYMBOL_GPL(register_xen_pcpu_notifier); ++ ++void unregister_xen_pcpu_notifier(struct notifier_block *nb) ++{ ++ get_pcpu_lock(); ++ raw_notifier_chain_unregister(&xen_pcpu_chain, nb); ++ put_pcpu_lock(); ++} ++EXPORT_SYMBOL_GPL(unregister_xen_pcpu_notifier); ++ ++static int xen_pcpu_down(uint32_t xen_id) ++{ ++ int ret; ++ xen_platform_op_t op = { ++ .cmd = XENPF_cpu_offline, ++ .interface_version = XENPF_INTERFACE_VERSION, ++ .u.cpu_ol.cpuid = xen_id, ++ }; ++ ++ ret = HYPERVISOR_dom0_op(&op); ++ return ret; ++} ++ ++static int xen_pcpu_up(uint32_t xen_id) ++{ ++ int ret; ++ xen_platform_op_t op = { ++ .cmd = XENPF_cpu_online, ++ .interface_version = XENPF_INTERFACE_VERSION, ++ .u.cpu_ol.cpuid = xen_id, ++ }; ++ ++ ret = HYPERVISOR_dom0_op(&op); ++ return ret; ++} ++ ++static ssize_t show_online(struct sys_device *dev, ++ struct sysdev_attribute *attr, ++ char *buf) ++{ ++ struct pcpu *cpu = container_of(dev, struct pcpu, sysdev); ++ ++ return sprintf(buf, "%u\n", !!(cpu->flags & XEN_PCPU_FLAGS_ONLINE)); ++} ++ ++static ssize_t __ref store_online(struct sys_device *dev, ++ struct sysdev_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct pcpu *cpu = container_of(dev, struct pcpu, sysdev); ++ ssize_t ret; ++ ++ switch (buf[0]) { ++ case 
'0': ++ ret = xen_pcpu_down(cpu->xen_id); ++ break; ++ case '1': ++ ret = xen_pcpu_up(cpu->xen_id); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ if (ret >= 0) ++ ret = count; ++ return ret; ++} ++ ++static SYSDEV_ATTR(online, 0644, show_online, store_online); ++ ++static ssize_t show_apicid(struct sys_device *dev, ++ struct sysdev_attribute *attr, ++ char *buf) ++{ ++ struct pcpu *cpu = container_of(dev, struct pcpu, sysdev); ++ ++ return sprintf(buf, "%u\n", cpu->apic_id); ++} ++ ++static ssize_t show_acpiid(struct sys_device *dev, ++ struct sysdev_attribute *attr, ++ char *buf) ++{ ++ struct pcpu *cpu = container_of(dev, struct pcpu, sysdev); ++ ++ return sprintf(buf, "%u\n", cpu->acpi_id); ++} ++static SYSDEV_ATTR(apic_id, 0444, show_apicid, NULL); ++static SYSDEV_ATTR(acpi_id, 0444, show_acpiid, NULL); ++ ++static int xen_pcpu_free(struct pcpu *pcpu) ++{ ++ if (!pcpu) ++ return 0; ++ ++ sysdev_remove_file(&pcpu->sysdev, &attr_online); ++ sysdev_unregister(&pcpu->sysdev); ++ list_del(&pcpu->pcpu_list); ++ kfree(pcpu); ++ ++ return 0; ++} ++ ++static inline int same_pcpu(struct xenpf_pcpuinfo *info, ++ struct pcpu *pcpu) ++{ ++ return (pcpu->apic_id == info->apic_id) && ++ (pcpu->xen_id == info->xen_cpuid); ++} ++ ++/* ++ * Return 1 if online status changed ++ */ ++static int xen_pcpu_online_check(struct xenpf_pcpuinfo *info, ++ struct pcpu *pcpu) ++{ ++ int result = 0; ++ ++ if (info->xen_cpuid != pcpu->xen_id) ++ return 0; ++ ++ if (xen_pcpu_online(info->flags) && !xen_pcpu_online(pcpu->flags)) { ++ /* the pcpu is onlined */ ++ pcpu->flags |= XEN_PCPU_FLAGS_ONLINE; ++ kobject_uevent(&pcpu->sysdev.kobj, KOBJ_ONLINE); ++ raw_notifier_call_chain(&xen_pcpu_chain, ++ XEN_PCPU_ONLINE, (void *)(long)pcpu->xen_id); ++ result = 1; ++ } else if (!xen_pcpu_online(info->flags) && ++ xen_pcpu_online(pcpu->flags)) { ++ /* The pcpu is offlined now */ ++ pcpu->flags &= ~XEN_PCPU_FLAGS_ONLINE; ++ kobject_uevent(&pcpu->sysdev.kobj, KOBJ_OFFLINE); ++ raw_notifier_call_chain(&xen_pcpu_chain, ++ XEN_PCPU_OFFLINE, (void *)(long)pcpu->xen_id); ++ result = 1; ++ } ++ ++ return result; ++} ++ ++static int pcpu_sysdev_init(struct pcpu *cpu) ++{ ++ int error; ++ ++ error = sysdev_register(&cpu->sysdev); ++ if (error) { ++ printk(KERN_WARNING "xen_pcpu_add: Failed to register pcpu\n"); ++ kfree(cpu); ++ return -1; ++ } ++ sysdev_create_file(&cpu->sysdev, &attr_online); ++ sysdev_create_file(&cpu->sysdev, &attr_apic_id); ++ sysdev_create_file(&cpu->sysdev, &attr_acpi_id); ++ return 0; ++} ++ ++static struct pcpu *get_pcpu(int xen_id) ++{ ++ struct pcpu *pcpu = NULL; ++ ++ list_for_each_entry(pcpu, &xen_pcpus.list, pcpu_list) { ++ if (pcpu->xen_id == xen_id) ++ return pcpu; ++ } ++ return NULL; ++} ++ ++static struct pcpu *init_pcpu(struct xenpf_pcpuinfo *info) ++{ ++ struct pcpu *pcpu; ++ ++ if (info->flags & XEN_PCPU_FLAGS_INVALID) ++ return NULL; ++ ++ /* The PCPU is just added */ ++ pcpu = kzalloc(sizeof(struct pcpu), GFP_KERNEL); ++ if (!pcpu) ++ return NULL; ++ ++ INIT_LIST_HEAD(&pcpu->pcpu_list); ++ pcpu->xen_id = info->xen_cpuid; ++ pcpu->apic_id = info->apic_id; ++ pcpu->acpi_id = info->acpi_id; ++ pcpu->flags = info->flags; ++ ++ pcpu->sysdev.cls = &xen_pcpu_sysdev_class; ++ pcpu->sysdev.id = info->xen_cpuid; ++ ++ if (pcpu_sysdev_init(pcpu)) { ++ kfree(pcpu); ++ return NULL; ++ } ++ ++ list_add_tail(&pcpu->pcpu_list, &xen_pcpus.list); ++ raw_notifier_call_chain(&xen_pcpu_chain, ++ XEN_PCPU_ADD, ++ (void *)(long)pcpu->xen_id); ++ return pcpu; ++} ++ ++#define PCPU_NO_CHANGE 0 ++#define PCPU_ADDED 1 
++#define PCPU_ONLINE_OFFLINE 2 ++#define PCPU_REMOVED 3 ++/* ++ * Caller should hold the pcpu lock ++ * < 0: Something wrong ++ * 0: No changes ++ * > 0: State changed ++ */ ++static struct pcpu *_sync_pcpu(int cpu_num, int *max_id, int *result) ++{ ++ struct pcpu *pcpu = NULL; ++ struct xenpf_pcpuinfo *info; ++ xen_platform_op_t op = { ++ .cmd = XENPF_get_cpuinfo, ++ .interface_version = XENPF_INTERFACE_VERSION, ++ }; ++ int ret; ++ ++ *result = -1; ++ ++ info = &op.u.pcpu_info; ++ info->xen_cpuid = cpu_num; ++ ++ ret = HYPERVISOR_dom0_op(&op); ++ if (ret) ++ return NULL; ++ ++ if (max_id) ++ *max_id = op.u.pcpu_info.max_present; ++ ++ pcpu = get_pcpu(cpu_num); ++ ++ if (info->flags & XEN_PCPU_FLAGS_INVALID) { ++ /* The pcpu has been removed */ ++ *result = PCPU_NO_CHANGE; ++ if (pcpu) { ++ raw_notifier_call_chain(&xen_pcpu_chain, ++ XEN_PCPU_REMOVE, ++ (void *)(long)pcpu->xen_id); ++ xen_pcpu_free(pcpu); ++ *result = PCPU_REMOVED; ++ } ++ return NULL; ++ } ++ ++ ++ if (!pcpu) { ++ *result = PCPU_ADDED; ++ pcpu = init_pcpu(info); ++ if (pcpu == NULL) { ++ printk(KERN_WARNING "Failed to init pcpu %x\n", ++ info->xen_cpuid); ++ *result = -1; ++ } ++ } else { ++ *result = PCPU_NO_CHANGE; ++ /* ++ * Old PCPU is replaced with a new pcpu, this means ++ * several virq is missed, will it happen? ++ */ ++ if (!same_pcpu(info, pcpu)) { ++ printk(KERN_WARNING "Pcpu %x changed!\n", ++ pcpu->xen_id); ++ pcpu->apic_id = info->apic_id; ++ pcpu->acpi_id = info->acpi_id; ++ } ++ if (xen_pcpu_online_check(info, pcpu)) ++ *result = PCPU_ONLINE_OFFLINE; ++ } ++ return pcpu; ++} ++ ++int xen_pcpu_index(uint32_t id, int is_acpiid) ++{ ++ int cpu_num = 0, max_id = 0, ret; ++ xen_platform_op_t op = { ++ .cmd = XENPF_get_cpuinfo, ++ .interface_version = XENPF_INTERFACE_VERSION, ++ }; ++ struct xenpf_pcpuinfo *info = &op.u.pcpu_info; ++ ++ info->xen_cpuid = 0; ++ ret = HYPERVISOR_dom0_op(&op); ++ if (ret) ++ return -1; ++ max_id = op.u.pcpu_info.max_present; ++ ++ while ((cpu_num <= max_id)) { ++ info->xen_cpuid = cpu_num; ++ ret = HYPERVISOR_dom0_op(&op); ++ if (ret) ++ continue; ++ ++ if (op.u.pcpu_info.max_present > max_id) ++ max_id = op.u.pcpu_info.max_present; ++ if (id == (is_acpiid ? 
info->acpi_id : info->apic_id)) ++ return cpu_num; ++ cpu_num++; ++ } ++ ++ return -1; ++} ++EXPORT_SYMBOL(xen_pcpu_index); ++ ++/* ++ * Sync dom0's pcpu information with xen hypervisor's ++ */ ++static int xen_sync_pcpus(void) ++{ ++ /* ++ * Boot cpu always has cpu_id 0 in xen ++ */ ++ int cpu_num = 0, max_id = 0, result = 0, present = 0; ++ struct list_head *elem, *tmp; ++ struct pcpu *pcpu; ++ ++ get_pcpu_lock(); ++ ++ while ((result >= 0) && (cpu_num <= max_id)) { ++ pcpu = _sync_pcpu(cpu_num, &max_id, &result); ++ ++ printk(KERN_DEBUG "sync cpu %x get result %x max_id %x\n", ++ cpu_num, result, max_id); ++ ++ switch (result) { ++ case PCPU_NO_CHANGE: ++ if (pcpu) ++ present++; ++ break; ++ case PCPU_ADDED: ++ case PCPU_ONLINE_OFFLINE: ++ present++; ++ case PCPU_REMOVED: ++ break; ++ default: ++ printk(KERN_WARNING "Failed to sync pcpu %x\n", ++ cpu_num); ++ break; ++ ++ } ++ cpu_num++; ++ } ++ ++ if (result < 0) { ++ list_for_each_safe(elem, tmp, &xen_pcpus.list) { ++ pcpu = list_entry(elem, struct pcpu, pcpu_list); ++ xen_pcpu_free(pcpu); ++ } ++ present = 0; ++ } ++ ++ xen_pcpus.present = present; ++ ++ put_pcpu_lock(); ++ ++ return 0; ++} ++ ++static void xen_pcpu_dpc(struct work_struct *work) ++{ ++ if (xen_sync_pcpus() < 0) ++ printk(KERN_WARNING ++ "xen_pcpu_dpc: Failed to sync pcpu information\n"); ++} ++static DECLARE_WORK(xen_pcpu_work, xen_pcpu_dpc); ++ ++int xen_pcpu_hotplug(int type, uint32_t apic_id) ++{ ++ schedule_work(&xen_pcpu_work); ++ ++ return 0; ++} ++EXPORT_SYMBOL(xen_pcpu_hotplug); ++ ++static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id) ++{ ++ schedule_work(&xen_pcpu_work); ++ return IRQ_HANDLED; ++} ++ ++static int __init xen_pcpu_init(void) ++{ ++ int err; ++ ++ if (!xen_initial_domain()) ++ return 0; ++ ++ err = sysdev_class_register(&xen_pcpu_sysdev_class); ++ if (err) { ++ printk(KERN_WARNING ++ "xen_pcpu_init: registering xen_pcpu sysdev failed!\n"); ++ return err; ++ } ++ ++ INIT_LIST_HEAD(&xen_pcpus.list); ++ xen_pcpus.present = 0; ++ ++ xen_sync_pcpus(); ++ if (xen_pcpus.present > 0) ++ err = bind_virq_to_irqhandler(VIRQ_PCPU_STATE, ++ 0, xen_pcpu_interrupt, 0, "pcpu", NULL); ++ if (err < 0) ++ printk(KERN_WARNING "xen_pcpu_init: " ++ "Failed to bind pcpu_state virq\n" ++ "You will lose the latest information!\n"); ++ return err; ++} ++ ++arch_initcall(xen_pcpu_init); +diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c +new file mode 100644 +index 0000000..c01b5dd +--- /dev/null ++++ b/drivers/xen/platform-pci.c +@@ -0,0 +1,207 @@ ++/****************************************************************************** ++ * platform-pci.c ++ * ++ * Xen platform PCI device driver ++ * Copyright (c) 2005, Intel Corporation. ++ * Copyright (c) 2007, XenSource Inc. ++ * Copyright (c) 2010, Citrix ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms and conditions of the GNU General Public License, ++ * version 2, as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ * more details. ++ * ++ * You should have received a copy of the GNU General Public License along with ++ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++ * Place - Suite 330, Boston, MA 02111-1307 USA. 
++ * ++ */ ++ ++ ++#include <linux/interrupt.h> ++#include <linux/io.h> ++#include <linux/module.h> ++#include <linux/pci.h> ++ ++#include <xen/platform_pci.h> ++#include <xen/grant_table.h> ++#include <xen/xenbus.h> ++#include <xen/events.h> ++#include <xen/hvm.h> ++#include <xen/xen-ops.h> ++ ++#define DRV_NAME "xen-platform-pci" ++ ++MODULE_AUTHOR("ssmith@xensource.com and stefano.stabellini@eu.citrix.com"); ++MODULE_DESCRIPTION("Xen platform PCI device"); ++MODULE_LICENSE("GPL"); ++ ++static unsigned long platform_mmio; ++static unsigned long platform_mmio_alloc; ++static unsigned long platform_mmiolen; ++static uint64_t callback_via; ++ ++unsigned long alloc_xen_mmio(unsigned long len) ++{ ++ unsigned long addr; ++ ++ addr = platform_mmio + platform_mmio_alloc; ++ platform_mmio_alloc += len; ++ BUG_ON(platform_mmio_alloc > platform_mmiolen); ++ ++ return addr; ++} ++ ++static uint64_t get_callback_via(struct pci_dev *pdev) ++{ ++ u8 pin; ++ int irq; ++ ++ irq = pdev->irq; ++ if (irq < 16) ++ return irq; /* ISA IRQ */ ++ ++ pin = pdev->pin; ++ ++ /* We don't know the GSI. Specify the PCI INTx line instead. */ ++ return ((uint64_t)0x01 << 56) | /* PCI INTx identifier */ ++ ((uint64_t)pci_domain_nr(pdev->bus) << 32) | ++ ((uint64_t)pdev->bus->number << 16) | ++ ((uint64_t)(pdev->devfn & 0xff) << 8) | ++ ((uint64_t)(pin - 1) & 3); ++} ++ ++static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id) ++{ ++ xen_hvm_evtchn_do_upcall(); ++ return IRQ_HANDLED; ++} ++ ++static int xen_allocate_irq(struct pci_dev *pdev) ++{ ++ return request_irq(pdev->irq, do_hvm_evtchn_intr, ++ IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING, ++ "xen-platform-pci", pdev); ++} ++ ++static int platform_pci_resume(struct pci_dev *pdev) ++{ ++ int err; ++ if (xen_have_vector_callback) ++ return 0; ++ err = xen_set_callback_via(callback_via); ++ if (err) { ++ dev_err(&pdev->dev, "platform_pci_resume failure!\n"); ++ return err; ++ } ++ return 0; ++} ++ ++static int __devinit platform_pci_init(struct pci_dev *pdev, ++ const struct pci_device_id *ent) ++{ ++ int i, ret; ++ long ioaddr, iolen; ++ long mmio_addr, mmio_len; ++ unsigned int max_nr_gframes; ++ ++ i = pci_enable_device(pdev); ++ if (i) ++ return i; ++ ++ ioaddr = pci_resource_start(pdev, 0); ++ iolen = pci_resource_len(pdev, 0); ++ ++ mmio_addr = pci_resource_start(pdev, 1); ++ mmio_len = pci_resource_len(pdev, 1); ++ ++ if (mmio_addr == 0 || ioaddr == 0) { ++ dev_err(&pdev->dev, "no resources found\n"); ++ ret = -ENOENT; ++ goto pci_out; ++ } ++ ++ if (request_mem_region(mmio_addr, mmio_len, DRV_NAME) == NULL) { ++ dev_err(&pdev->dev, "MEM I/O resource 0x%lx @ 0x%lx busy\n", ++ mmio_addr, mmio_len); ++ ret = -EBUSY; ++ goto pci_out; ++ } ++ ++ if (request_region(ioaddr, iolen, DRV_NAME) == NULL) { ++ dev_err(&pdev->dev, "I/O resource 0x%lx @ 0x%lx busy\n", ++ iolen, ioaddr); ++ ret = -EBUSY; ++ goto mem_out; ++ } ++ ++ platform_mmio = mmio_addr; ++ platform_mmiolen = mmio_len; ++ ++ if (!xen_have_vector_callback) { ++ ret = xen_allocate_irq(pdev); ++ if (ret) { ++ dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret); ++ goto out; ++ } ++ callback_via = get_callback_via(pdev); ++ ret = xen_set_callback_via(callback_via); ++ if (ret) { ++ dev_warn(&pdev->dev, "Unable to set the evtchn callback " ++ "err=%d\n", ret); ++ goto out; ++ } ++ } ++ ++ max_nr_gframes = gnttab_max_grant_frames(); ++ xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); ++ ret = gnttab_init(); ++ if (ret) ++ goto out; ++ xenbus_probe(NULL); ++ ret = 
xen_setup_shutdown_event(); ++ if (ret) ++ goto out; ++ return 0; ++ ++out: ++ release_region(ioaddr, iolen); ++mem_out: ++ release_mem_region(mmio_addr, mmio_len); ++pci_out: ++ pci_disable_device(pdev); ++ return ret; ++} ++ ++static struct pci_device_id platform_pci_tbl[] __devinitdata = { ++ {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM, ++ PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, ++ {0,} ++}; ++ ++MODULE_DEVICE_TABLE(pci, platform_pci_tbl); ++ ++static struct pci_driver platform_driver = { ++ .name = DRV_NAME, ++ .probe = platform_pci_init, ++ .id_table = platform_pci_tbl, ++#ifdef CONFIG_PM ++ .resume_early = platform_pci_resume, ++#endif ++}; ++ ++static int __init platform_pci_module_init(void) ++{ ++ /* no unplug has been done, IGNORE hasn't been specified: just ++ * return now */ ++ if (!xen_platform_pci_unplug) ++ return -ENODEV; ++ ++ return pci_register_driver(&platform_driver); ++} ++ ++module_init(platform_pci_module_init); +diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c +index 88a60e0..ae5cb05 100644 +--- a/drivers/xen/sys-hypervisor.c ++++ b/drivers/xen/sys-hypervisor.c +@@ -14,6 +14,7 @@ + #include <asm/xen/hypervisor.h> + #include <asm/xen/hypercall.h> + ++#include <xen/xen.h> + #include <xen/xenbus.h> + #include <xen/interface/xen.h> + #include <xen/interface/version.h> +diff --git a/drivers/xen/xen_acpi_memhotplug.c b/drivers/xen/xen_acpi_memhotplug.c +new file mode 100644 +index 0000000..0c4af99 +--- /dev/null ++++ b/drivers/xen/xen_acpi_memhotplug.c +@@ -0,0 +1,209 @@ ++/* ++ * xen_acpi_memhotplug.c - interface to notify Xen on memory device hotadd ++ * ++ * Copyright (C) 2008, Intel corporation ++ * ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or (at ++ * your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with this program; if not, write to the Free Software Foundation, Inc., ++ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
++ * ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/types.h> ++#include <linux/memory_hotplug.h> ++#include <acpi/acpi_drivers.h> ++#include <xen/interface/platform.h> ++#include <linux/interrupt.h> ++#include <linux/spinlock.h> ++#include <asm/xen/hypervisor.h> ++#include <asm/xen/hypercall.h> ++#include <xen/acpi.h> ++ ++struct xen_hotmem_entry { ++ struct list_head hotmem_list; ++ uint64_t start; ++ uint64_t end; ++ uint32_t flags; ++ uint32_t pxm; ++}; ++ ++struct xen_hotmem_list { ++ struct list_head list; ++ int entry_nr; ++} xen_hotmem; ++ ++DEFINE_SPINLOCK(xen_hotmem_lock); ++ ++static int xen_hyper_addmem(struct xen_hotmem_entry *entry) ++{ ++ int ret; ++ ++ xen_platform_op_t op = { ++ .cmd = XENPF_mem_hotadd, ++ .interface_version = XENPF_INTERFACE_VERSION, ++ }; ++ op.u.mem_add.spfn = entry->start >> PAGE_SHIFT; ++ op.u.mem_add.epfn = entry->end >> PAGE_SHIFT; ++ op.u.mem_add.flags = entry->flags; ++ op.u.mem_add.pxm = entry->pxm; ++ ++ ret = HYPERVISOR_dom0_op(&op); ++ return ret; ++} ++ ++static int add_hotmem_entry(int pxm, uint64_t start, ++ uint64_t length, uint32_t flags) ++{ ++ struct xen_hotmem_entry *entry; ++ ++ if (pxm < 0 || !length) ++ return -EINVAL; ++ ++ entry = kzalloc(sizeof(struct xen_hotmem_entry), GFP_ATOMIC); ++ if (!entry) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&entry->hotmem_list); ++ entry->start = start; ++ entry->end = start + length; ++ entry->flags = flags; ++ entry->pxm = pxm; ++ ++ spin_lock(&xen_hotmem_lock); ++ ++ list_add_tail(&entry->hotmem_list, &xen_hotmem.list); ++ xen_hotmem.entry_nr++; ++ ++ spin_unlock(&xen_hotmem_lock); ++ ++ return 0; ++} ++ ++static int free_hotmem_entry(struct xen_hotmem_entry *entry) ++{ ++ list_del(&entry->hotmem_list); ++ kfree(entry); ++ ++ return 0; ++} ++ ++static void xen_hotadd_mem_dpc(struct work_struct *work) ++{ ++ struct list_head *elem, *tmp; ++ struct xen_hotmem_entry *entry; ++ unsigned long flags; ++ int ret; ++ ++ spin_lock_irqsave(&xen_hotmem_lock, flags); ++ list_for_each_safe(elem, tmp, &xen_hotmem.list) { ++ entry = list_entry(elem, struct xen_hotmem_entry, hotmem_list); ++ ret = xen_hyper_addmem(entry); ++ if (ret) ++ printk(KERN_WARNING "xen addmem failed with %x\n", ret); ++ free_hotmem_entry(entry); ++ xen_hotmem.entry_nr--; ++ } ++ spin_unlock_irqrestore(&xen_hotmem_lock, flags); ++} ++ ++static DECLARE_WORK(xen_hotadd_mem_work, xen_hotadd_mem_dpc); ++ ++static int xen_acpi_get_pxm(acpi_handle h) ++{ ++ unsigned long long pxm; ++ acpi_status status; ++ acpi_handle handle; ++ acpi_handle phandle = h; ++ ++ do { ++ handle = phandle; ++ status = acpi_evaluate_integer(handle, "_PXM", NULL, &pxm); ++ if (ACPI_SUCCESS(status)) ++ return pxm; ++ status = acpi_get_parent(handle, &phandle); ++ } while (ACPI_SUCCESS(status)); ++ ++ return -1; ++} ++ ++int xen_hotadd_memory(struct acpi_memory_device *mem_device) ++{ ++ int pxm, result; ++ int num_enabled = 0; ++ struct acpi_memory_info *info; ++ ++ if (!mem_device) ++ return -EINVAL; ++ ++ pxm = xen_acpi_get_pxm(mem_device->device->handle); ++ ++ if (pxm < 0) ++ return -EINVAL; ++ ++ /* ++ * Always return success to ACPI driver, and notify hypervisor later ++ * because hypervisor will utilize the memory in memory hotadd hypercall ++ */ ++ list_for_each_entry(info, &mem_device->res_list, list) { ++ if (info->enabled) { /* just sanity check...*/ ++ num_enabled++; ++ continue; ++ } ++ /* ++ * If the memory block size is zero, please ignore it. 
++ * Don't try to do the following memory hotplug flowchart. ++ */ ++ if (!info->length) ++ continue; ++ ++ result = add_hotmem_entry(pxm, info->start_addr, ++ info->length, 0); ++ if (result) ++ continue; ++ info->enabled = 1; ++ num_enabled++; ++ } ++ ++ if (!num_enabled) ++ return -EINVAL; ++ ++ schedule_work(&xen_hotadd_mem_work); ++ ++ return 0; ++} ++EXPORT_SYMBOL(xen_hotadd_memory); ++ ++static int xen_hotadd_mem_init(void) ++{ ++ if (!xen_initial_domain()) ++ return -ENODEV; ++ ++ INIT_LIST_HEAD(&xen_hotmem.list); ++ xen_hotmem.entry_nr = 0; ++ ++ return 0; ++} ++ ++static void xen_hotadd_mem_exit(void) ++{ ++ flush_scheduled_work(); ++} ++ ++module_init(xen_hotadd_mem_init); ++module_exit(xen_hotadd_mem_exit); ++MODULE_LICENSE("GPL"); +diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile +index 5571f5b..8dca685 100644 +--- a/drivers/xen/xenbus/Makefile ++++ b/drivers/xen/xenbus/Makefile +@@ -5,3 +5,8 @@ xenbus-objs += xenbus_client.o + xenbus-objs += xenbus_comms.o + xenbus-objs += xenbus_xs.o + xenbus-objs += xenbus_probe.o ++ ++xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o ++xenbus-objs += $(xenbus-be-objs-y) ++ ++obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o +diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c +index 92a1ef8..89f2e42 100644 +--- a/drivers/xen/xenbus/xenbus_client.c ++++ b/drivers/xen/xenbus/xenbus_client.c +@@ -49,6 +49,8 @@ const char *xenbus_strstate(enum xenbus_state state) + [ XenbusStateConnected ] = "Connected", + [ XenbusStateClosing ] = "Closing", + [ XenbusStateClosed ] = "Closed", ++ [ XenbusStateReconfiguring ] = "Reconfiguring", ++ [ XenbusStateReconfigured ] = "Reconfigured", + }; + return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID"; + } +@@ -132,17 +134,12 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev, + } + EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt); + ++static void xenbus_switch_fatal(struct xenbus_device *, int, int, ++ const char *, ...); + +-/** +- * xenbus_switch_state +- * @dev: xenbus device +- * @state: new state +- * +- * Advertise in the store a change of the given driver to the given new_state. +- * Return 0 on success, or -errno on error. On error, the device will switch +- * to XenbusStateClosing, and the error will be saved in the store. +- */ +-int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) ++static int ++__xenbus_switch_state(struct xenbus_device *dev, ++ enum xenbus_state state, int depth) + { + /* We check whether the state is currently set to the given value, and + if not, then the state is set. We don't want to unconditionally +@@ -151,35 +148,65 @@ int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) + to it, as the device will be tearing down, and we don't want to + resurrect that directory. + +- Note that, because of this cached value of our state, this function +- will not work inside a Xenstore transaction (something it was +- trying to in the past) because dev->state would not get reset if +- the transaction was aborted. +- ++ Note that, because of this cached value of our state, this ++ function will not take a caller's Xenstore transaction ++ (something it was trying to in the past) because dev->state ++ would not get reset if the transaction was aborted. 
+ */ + ++ struct xenbus_transaction xbt; + int current_state; +- int err; ++ int err, abort; + + if (state == dev->state) + return 0; + +- err = xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d", +- &current_state); +- if (err != 1) ++again: ++ abort = 1; ++ ++ err = xenbus_transaction_start(&xbt); ++ if (err) { ++ xenbus_switch_fatal(dev, depth, err, "starting transaction"); + return 0; ++ } ++ ++ err = xenbus_scanf(xbt, dev->nodename, "state", "%d", &current_state); ++ if (err != 1) ++ goto abort; + +- err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state); ++ err = xenbus_printf(xbt, dev->nodename, "state", "%d", state); + if (err) { +- if (state != XenbusStateClosing) /* Avoid looping */ +- xenbus_dev_fatal(dev, err, "writing new state"); +- return err; ++ xenbus_switch_fatal(dev, depth, err, "writing new state"); ++ goto abort; + } + +- dev->state = state; ++ abort = 0; ++abort: ++ err = xenbus_transaction_end(xbt, abort); ++ if (err) { ++ if (err == -EAGAIN && !abort) ++ goto again; ++ xenbus_switch_fatal(dev, depth, err, "ending transaction"); ++ } else ++ dev->state = state; + + return 0; + } ++ ++/** ++ * xenbus_switch_state ++ * @dev: xenbus device ++ * @state: new state ++ * ++ * Advertise in the store a change of the given driver to the given new_state. ++ * Return 0 on success, or -errno on error. On error, the device will switch ++ * to XenbusStateClosing, and the error will be saved in the store. ++ */ ++int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) ++{ ++ return __xenbus_switch_state(dev, state, 0); ++} ++ + EXPORT_SYMBOL_GPL(xenbus_switch_state); + + int xenbus_frontend_closed(struct xenbus_device *dev) +@@ -283,6 +310,23 @@ void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...) + EXPORT_SYMBOL_GPL(xenbus_dev_fatal); + + /** ++ * Equivalent to xenbus_dev_fatal(dev, err, fmt, args), but helps ++ * avoiding recursion within xenbus_switch_state. ++ */ ++static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err, ++ const char *fmt, ...) ++{ ++ va_list ap; ++ ++ va_start(ap, fmt); ++ xenbus_va_dev_error(dev, err, fmt, ap); ++ va_end(ap); ++ ++ if (!depth) ++ __xenbus_switch_state(dev, XenbusStateClosing, 1); ++} ++ ++/** + * xenbus_grant_ring + * @dev: xenbus device + * @ring_mfn: mfn of ring to grant +diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c +index 649fcdf..3a83ba2 100644 +--- a/drivers/xen/xenbus/xenbus_probe.c ++++ b/drivers/xen/xenbus/xenbus_probe.c +@@ -49,31 +49,29 @@ + #include <asm/page.h> + #include <asm/pgtable.h> + #include <asm/xen/hypervisor.h> ++ ++#include <xen/xen.h> + #include <xen/xenbus.h> + #include <xen/events.h> + #include <xen/page.h> + ++#include <xen/platform_pci.h> ++#include <xen/hvm.h> ++ + #include "xenbus_comms.h" + #include "xenbus_probe.h" + + + int xen_store_evtchn; +-EXPORT_SYMBOL(xen_store_evtchn); ++EXPORT_SYMBOL_GPL(xen_store_evtchn); + + struct xenstore_domain_interface *xen_store_interface; ++EXPORT_SYMBOL_GPL(xen_store_interface); ++ + static unsigned long xen_store_mfn; + + static BLOCKING_NOTIFIER_HEAD(xenstore_chain); + +-static void wait_for_devices(struct xenbus_driver *xendrv); +- +-static int xenbus_probe_frontend(const char *type, const char *name); +- +-static void xenbus_dev_shutdown(struct device *_dev); +- +-static int xenbus_dev_suspend(struct device *dev, pm_message_t state); +-static int xenbus_dev_resume(struct device *dev); +- + /* If something in array of ids matches this device, return it. 
*/ + static const struct xenbus_device_id * + match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) +@@ -94,34 +92,7 @@ int xenbus_match(struct device *_dev, struct device_driver *_drv) + + return match_device(drv->ids, to_xenbus_device(_dev)) != NULL; + } +- +-static int xenbus_uevent(struct device *_dev, struct kobj_uevent_env *env) +-{ +- struct xenbus_device *dev = to_xenbus_device(_dev); +- +- if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype)) +- return -ENOMEM; +- +- return 0; +-} +- +-/* device/<type>/<id> => <type>-<id> */ +-static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) +-{ +- nodename = strchr(nodename, '/'); +- if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) { +- printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename); +- return -EINVAL; +- } +- +- strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE); +- if (!strchr(bus_id, '/')) { +- printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id); +- return -EINVAL; +- } +- *strchr(bus_id, '/') = '-'; +- return 0; +-} ++EXPORT_SYMBOL_GPL(xenbus_match); + + + static void free_otherend_details(struct xenbus_device *dev) +@@ -141,7 +112,28 @@ static void free_otherend_watch(struct xenbus_device *dev) + } + + +-int read_otherend_details(struct xenbus_device *xendev, ++static int talk_to_otherend(struct xenbus_device *dev) ++{ ++ struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver); ++ ++ free_otherend_watch(dev); ++ free_otherend_details(dev); ++ ++ return drv->read_otherend_details(dev); ++} ++ ++ ++ ++static int watch_otherend(struct xenbus_device *dev) ++{ ++ struct xen_bus_type *bus = container_of(dev->dev.bus, struct xen_bus_type, bus); ++ ++ return xenbus_watch_pathfmt(dev, &dev->otherend_watch, bus->otherend_changed, ++ "%s/%s", dev->otherend, "state"); ++} ++ ++ ++int xenbus_read_otherend_details(struct xenbus_device *xendev, + char *id_node, char *path_node) + { + int err = xenbus_gather(XBT_NIL, xendev->nodename, +@@ -166,39 +158,11 @@ int read_otherend_details(struct xenbus_device *xendev, + + return 0; + } ++EXPORT_SYMBOL_GPL(xenbus_read_otherend_details); + +- +-static int read_backend_details(struct xenbus_device *xendev) +-{ +- return read_otherend_details(xendev, "backend-id", "backend"); +-} +- +-static struct device_attribute xenbus_dev_attrs[] = { +- __ATTR_NULL +-}; +- +-/* Bus type for frontend drivers. */ +-static struct xen_bus_type xenbus_frontend = { +- .root = "device", +- .levels = 2, /* device/type/<id> */ +- .get_bus_id = frontend_bus_id, +- .probe = xenbus_probe_frontend, +- .bus = { +- .name = "xen", +- .match = xenbus_match, +- .uevent = xenbus_uevent, +- .probe = xenbus_dev_probe, +- .remove = xenbus_dev_remove, +- .shutdown = xenbus_dev_shutdown, +- .dev_attrs = xenbus_dev_attrs, +- +- .suspend = xenbus_dev_suspend, +- .resume = xenbus_dev_resume, +- }, +-}; +- +-static void otherend_changed(struct xenbus_watch *watch, +- const char **vec, unsigned int len) ++void xenbus_otherend_changed(struct xenbus_watch *watch, ++ const char **vec, unsigned int len, ++ int ignore_on_shutdown) + { + struct xenbus_device *dev = + container_of(watch, struct xenbus_device, otherend_watch); +@@ -226,11 +190,7 @@ static void otherend_changed(struct xenbus_watch *watch, + * work that can fail e.g., when the rootfs is gone. + */ + if (system_state > SYSTEM_RUNNING) { +- struct xen_bus_type *bus = bus; +- bus = container_of(dev->dev.bus, struct xen_bus_type, bus); +- /* If we're frontend, drive the state machine to Closed. 
*/ +- /* This should cause the backend to release our resources. */ +- if ((bus == &xenbus_frontend) && (state == XenbusStateClosing)) ++ if (ignore_on_shutdown && (state == XenbusStateClosing)) + xenbus_frontend_closed(dev); + return; + } +@@ -238,25 +198,7 @@ static void otherend_changed(struct xenbus_watch *watch, + if (drv->otherend_changed) + drv->otherend_changed(dev, state); + } +- +- +-static int talk_to_otherend(struct xenbus_device *dev) +-{ +- struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver); +- +- free_otherend_watch(dev); +- free_otherend_details(dev); +- +- return drv->read_otherend_details(dev); +-} +- +- +-static int watch_otherend(struct xenbus_device *dev) +-{ +- return xenbus_watch_pathfmt(dev, &dev->otherend_watch, otherend_changed, +- "%s/%s", dev->otherend, "state"); +-} +- ++EXPORT_SYMBOL_GPL(xenbus_otherend_changed); + + int xenbus_dev_probe(struct device *_dev) + { +@@ -300,8 +242,9 @@ int xenbus_dev_probe(struct device *_dev) + fail: + xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename); + xenbus_switch_state(dev, XenbusStateClosed); +- return -ENODEV; ++ return err; + } ++EXPORT_SYMBOL_GPL(xenbus_dev_probe); + + int xenbus_dev_remove(struct device *_dev) + { +@@ -319,8 +262,9 @@ int xenbus_dev_remove(struct device *_dev) + xenbus_switch_state(dev, XenbusStateClosed); + return 0; + } ++EXPORT_SYMBOL_GPL(xenbus_dev_remove); + +-static void xenbus_dev_shutdown(struct device *_dev) ++void xenbus_dev_shutdown(struct device *_dev) + { + struct xenbus_device *dev = to_xenbus_device(_dev); + unsigned long timeout = 5*HZ; +@@ -341,6 +285,7 @@ static void xenbus_dev_shutdown(struct device *_dev) + out: + put_device(&dev->dev); + } ++EXPORT_SYMBOL_GPL(xenbus_dev_shutdown); + + int xenbus_register_driver_common(struct xenbus_driver *drv, + struct xen_bus_type *bus, +@@ -354,25 +299,7 @@ int xenbus_register_driver_common(struct xenbus_driver *drv, + + return driver_register(&drv->driver); + } +- +-int __xenbus_register_frontend(struct xenbus_driver *drv, +- struct module *owner, const char *mod_name) +-{ +- int ret; +- +- drv->read_otherend_details = read_backend_details; +- +- ret = xenbus_register_driver_common(drv, &xenbus_frontend, +- owner, mod_name); +- if (ret) +- return ret; +- +- /* If this driver is loaded as a module wait for devices to attach. 
*/ +- wait_for_devices(drv); +- +- return 0; +-} +-EXPORT_SYMBOL_GPL(__xenbus_register_frontend); ++EXPORT_SYMBOL_GPL(xenbus_register_driver_common); + + void xenbus_unregister_driver(struct xenbus_driver *drv) + { +@@ -543,24 +470,7 @@ fail: + kfree(xendev); + return err; + } +- +-/* device/<typename>/<name> */ +-static int xenbus_probe_frontend(const char *type, const char *name) +-{ +- char *nodename; +- int err; +- +- nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", +- xenbus_frontend.root, type, name); +- if (!nodename) +- return -ENOMEM; +- +- DPRINTK("%s", nodename); +- +- err = xenbus_probe_node(&xenbus_frontend, type, nodename); +- kfree(nodename); +- return err; +-} ++EXPORT_SYMBOL_GPL(xenbus_probe_node); + + static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type) + { +@@ -574,10 +484,11 @@ static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type) + return PTR_ERR(dir); + + for (i = 0; i < dir_n; i++) { +- err = bus->probe(type, dir[i]); ++ err = bus->probe(bus, type, dir[i]); + if (err) + break; + } ++ + kfree(dir); + return err; + } +@@ -597,9 +508,11 @@ int xenbus_probe_devices(struct xen_bus_type *bus) + if (err) + break; + } ++ + kfree(dir); + return err; + } ++EXPORT_SYMBOL_GPL(xenbus_probe_devices); + + static unsigned int char_count(const char *str, char c) + { +@@ -662,32 +575,17 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus) + } + EXPORT_SYMBOL_GPL(xenbus_dev_changed); + +-static void frontend_changed(struct xenbus_watch *watch, +- const char **vec, unsigned int len) +-{ +- DPRINTK(""); +- +- xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend); +-} +- +-/* We watch for devices appearing and vanishing. */ +-static struct xenbus_watch fe_watch = { +- .node = "device", +- .callback = frontend_changed, +-}; +- +-static int xenbus_dev_suspend(struct device *dev, pm_message_t state) ++int xenbus_dev_suspend(struct device *dev, pm_message_t state) + { + int err = 0; + struct xenbus_driver *drv; +- struct xenbus_device *xdev; ++ struct xenbus_device *xdev = container_of(dev, struct xenbus_device, dev); + +- DPRINTK(""); ++ DPRINTK("%s", xdev->nodename); + + if (dev->driver == NULL) + return 0; + drv = to_xenbus_driver(dev->driver); +- xdev = container_of(dev, struct xenbus_device, dev); + if (drv->suspend) + err = drv->suspend(xdev, state); + if (err) +@@ -695,21 +593,19 @@ static int xenbus_dev_suspend(struct device *dev, pm_message_t state) + "xenbus: suspend %s failed: %i\n", dev_name(dev), err); + return 0; + } ++EXPORT_SYMBOL_GPL(xenbus_dev_suspend); + +-static int xenbus_dev_resume(struct device *dev) ++int xenbus_dev_resume(struct device *dev) + { + int err; + struct xenbus_driver *drv; +- struct xenbus_device *xdev; ++ struct xenbus_device *xdev = container_of(dev, struct xenbus_device, dev); + +- DPRINTK(""); ++ DPRINTK("%s", xdev->nodename); + + if (dev->driver == NULL) + return 0; +- + drv = to_xenbus_driver(dev->driver); +- xdev = container_of(dev, struct xenbus_device, dev); +- + err = talk_to_otherend(xdev); + if (err) { + printk(KERN_WARNING +@@ -740,6 +636,7 @@ static int xenbus_dev_resume(struct device *dev) + + return 0; + } ++EXPORT_SYMBOL_GPL(xenbus_dev_resume); + + /* A flag to determine if xenstored is 'ready' (i.e. 
has started) */ + int xenstored_ready = 0; +@@ -749,10 +646,7 @@ int register_xenstore_notifier(struct notifier_block *nb) + { + int ret = 0; + +- if (xenstored_ready > 0) +- ret = nb->notifier_call(nb, 0, NULL); +- else +- blocking_notifier_chain_register(&xenstore_chain, nb); ++ blocking_notifier_chain_register(&xenstore_chain, nb); + + return ret; + } +@@ -768,57 +662,93 @@ void xenbus_probe(struct work_struct *unused) + { + BUG_ON((xenstored_ready <= 0)); + +- /* Enumerate devices in xenstore and watch for changes. */ +- xenbus_probe_devices(&xenbus_frontend); +- register_xenbus_watch(&fe_watch); +- xenbus_backend_probe_and_watch(); +- + /* Notify others that xenstore is up */ + blocking_notifier_call_chain(&xenstore_chain, 0, NULL); + } ++EXPORT_SYMBOL_GPL(xenbus_probe); ++ ++static int __init xenbus_probe_initcall(void) ++{ ++ if (!xen_domain()) ++ return -ENODEV; ++ ++ if (xen_initial_domain() || xen_hvm_domain()) ++ return 0; ++ ++ xenbus_probe(NULL); ++ return 0; ++} ++ ++device_initcall(xenbus_probe_initcall); + +-static int __init xenbus_probe_init(void) ++static int __init xenbus_init(void) + { + int err = 0; ++ unsigned long page = 0; + + DPRINTK(""); + + err = -ENODEV; + if (!xen_domain()) +- goto out_error; +- +- /* Register ourselves with the kernel bus subsystem */ +- err = bus_register(&xenbus_frontend.bus); +- if (err) +- goto out_error; +- +- err = xenbus_backend_bus_register(); +- if (err) +- goto out_unreg_front; ++ return err; + + /* + * Domain0 doesn't have a store_evtchn or store_mfn yet. + */ + if (xen_initial_domain()) { +- /* dom0 not yet supported */ ++ struct evtchn_alloc_unbound alloc_unbound; ++ ++ /* Allocate Xenstore page */ ++ page = get_zeroed_page(GFP_KERNEL); ++ if (!page) ++ goto out_error; ++ ++ xen_store_mfn = xen_start_info->store_mfn = ++ pfn_to_mfn(virt_to_phys((void *)page) >> ++ PAGE_SHIFT); ++ ++ /* Next allocate a local port which xenstored can bind to */ ++ alloc_unbound.dom = DOMID_SELF; ++ alloc_unbound.remote_dom = 0; ++ ++ err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, ++ &alloc_unbound); ++ if (err == -ENOSYS) ++ goto out_error; ++ ++ BUG_ON(err); ++ xen_store_evtchn = xen_start_info->store_evtchn = ++ alloc_unbound.port; ++ ++ xen_store_interface = mfn_to_virt(xen_store_mfn); + } else { + xenstored_ready = 1; +- xen_store_evtchn = xen_start_info->store_evtchn; +- xen_store_mfn = xen_start_info->store_mfn; ++ if (xen_hvm_domain()) { ++ uint64_t v = 0; ++ err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); ++ if (err) ++ goto out_error; ++ xen_store_evtchn = (int)v; ++ err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v); ++ if (err) ++ goto out_error; ++ xen_store_mfn = (unsigned long)v; ++ xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); ++ } else { ++ xen_store_evtchn = xen_start_info->store_evtchn; ++ xen_store_mfn = xen_start_info->store_mfn; ++ xen_store_interface = mfn_to_virt(xen_store_mfn); ++ } + } +- xen_store_interface = mfn_to_virt(xen_store_mfn); + + /* Initialize the interface to xenstore. 
*/ + err = xs_init(); + if (err) { + printk(KERN_WARNING + "XENBUS: Error initializing xenstore comms: %i\n", err); +- goto out_unreg_back; ++ goto out_error; + } + +- if (!xen_initial_domain()) +- xenbus_probe(NULL); +- + #ifdef CONFIG_XEN_COMPAT_XENFS + /* + * Create xenfs mountpoint in /proc for compatibility with +@@ -829,128 +759,13 @@ static int __init xenbus_probe_init(void) + + return 0; + +- out_unreg_back: +- xenbus_backend_bus_unregister(); +- +- out_unreg_front: +- bus_unregister(&xenbus_frontend.bus); +- + out_error: ++ if (page != 0) ++ free_page(page); ++ + return err; + } + +-postcore_initcall(xenbus_probe_init); ++postcore_initcall(xenbus_init); + + MODULE_LICENSE("GPL"); +- +-static int is_device_connecting(struct device *dev, void *data) +-{ +- struct xenbus_device *xendev = to_xenbus_device(dev); +- struct device_driver *drv = data; +- struct xenbus_driver *xendrv; +- +- /* +- * A device with no driver will never connect. We care only about +- * devices which should currently be in the process of connecting. +- */ +- if (!dev->driver) +- return 0; +- +- /* Is this search limited to a particular driver? */ +- if (drv && (dev->driver != drv)) +- return 0; +- +- xendrv = to_xenbus_driver(dev->driver); +- return (xendev->state < XenbusStateConnected || +- (xendev->state == XenbusStateConnected && +- xendrv->is_ready && !xendrv->is_ready(xendev))); +-} +- +-static int exists_connecting_device(struct device_driver *drv) +-{ +- return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, +- is_device_connecting); +-} +- +-static int print_device_status(struct device *dev, void *data) +-{ +- struct xenbus_device *xendev = to_xenbus_device(dev); +- struct device_driver *drv = data; +- +- /* Is this operation limited to a particular driver? */ +- if (drv && (dev->driver != drv)) +- return 0; +- +- if (!dev->driver) { +- /* Information only: is this too noisy? */ +- printk(KERN_INFO "XENBUS: Device with no driver: %s\n", +- xendev->nodename); +- } else if (xendev->state < XenbusStateConnected) { +- enum xenbus_state rstate = XenbusStateUnknown; +- if (xendev->otherend) +- rstate = xenbus_read_driver_state(xendev->otherend); +- printk(KERN_WARNING "XENBUS: Timeout connecting " +- "to device: %s (local state %d, remote state %d)\n", +- xendev->nodename, xendev->state, rstate); +- } +- +- return 0; +-} +- +-/* We only wait for device setup after most initcalls have run. */ +-static int ready_to_wait_for_devices; +- +-/* +- * On a 5-minute timeout, wait for all devices currently configured. We need +- * to do this to guarantee that the filesystems and / or network devices +- * needed for boot are available, before we can allow the boot to proceed. +- * +- * This needs to be on a late_initcall, to happen after the frontend device +- * drivers have been initialised, but before the root fs is mounted. +- * +- * A possible improvement here would be to have the tools add a per-device +- * flag to the store entry, indicating whether it is needed at boot time. +- * This would allow people who knew what they were doing to accelerate their +- * boot slightly, but of course needs tools or manual intervention to set up +- * those flags correctly. +- */ +-static void wait_for_devices(struct xenbus_driver *xendrv) +-{ +- unsigned long start = jiffies; +- struct device_driver *drv = xendrv ? 
&xendrv->driver : NULL; +- unsigned int seconds_waited = 0; +- +- if (!ready_to_wait_for_devices || !xen_domain()) +- return; +- +- while (exists_connecting_device(drv)) { +- if (time_after(jiffies, start + (seconds_waited+5)*HZ)) { +- if (!seconds_waited) +- printk(KERN_WARNING "XENBUS: Waiting for " +- "devices to initialise: "); +- seconds_waited += 5; +- printk("%us...", 300 - seconds_waited); +- if (seconds_waited == 300) +- break; +- } +- +- schedule_timeout_interruptible(HZ/10); +- } +- +- if (seconds_waited) +- printk("\n"); +- +- bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, +- print_device_status); +-} +- +-#ifndef MODULE +-static int __init boot_wait_for_devices(void) +-{ +- ready_to_wait_for_devices = 1; +- wait_for_devices(NULL); +- return 0; +-} +- +-late_initcall(boot_wait_for_devices); +-#endif +diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h +index 6c5e318..0e5fc4c 100644 +--- a/drivers/xen/xenbus/xenbus_probe.h ++++ b/drivers/xen/xenbus/xenbus_probe.h +@@ -36,26 +36,13 @@ + + #define XEN_BUS_ID_SIZE 20 + +-#ifdef CONFIG_XEN_BACKEND +-extern void xenbus_backend_suspend(int (*fn)(struct device *, void *)); +-extern void xenbus_backend_resume(int (*fn)(struct device *, void *)); +-extern void xenbus_backend_probe_and_watch(void); +-extern int xenbus_backend_bus_register(void); +-extern void xenbus_backend_bus_unregister(void); +-#else +-static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {} +-static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {} +-static inline void xenbus_backend_probe_and_watch(void) {} +-static inline int xenbus_backend_bus_register(void) { return 0; } +-static inline void xenbus_backend_bus_unregister(void) {} +-#endif +- + struct xen_bus_type + { + char *root; + unsigned int levels; + int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename); +- int (*probe)(const char *type, const char *dir); ++ int (*probe)(struct xen_bus_type *bus, const char *type, const char *dir); ++ void (*otherend_changed)(struct xenbus_watch *watch, const char **vec, unsigned int len); + struct bus_type bus; + }; + +@@ -73,4 +60,16 @@ extern int xenbus_probe_devices(struct xen_bus_type *bus); + + extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus); + ++extern void xenbus_dev_shutdown(struct device *_dev); ++ ++extern int xenbus_dev_suspend(struct device *dev, pm_message_t state); ++extern int xenbus_dev_resume(struct device *dev); ++ ++extern void xenbus_otherend_changed(struct xenbus_watch *watch, ++ const char **vec, unsigned int len, ++ int ignore_on_shutdown); ++ ++extern int xenbus_read_otherend_details(struct xenbus_device *xendev, ++ char *id_node, char *path_node); ++ + #endif +diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c +new file mode 100644 +index 0000000..9b9dd36 +--- /dev/null ++++ b/drivers/xen/xenbus/xenbus_probe_backend.c +@@ -0,0 +1,293 @@ ++/****************************************************************************** ++ * Talks to Xen Store to figure out what devices we have (backend half). ++ * ++ * Copyright (C) 2005 Rusty Russell, IBM Corporation ++ * Copyright (C) 2005 Mike Wray, Hewlett-Packard ++ * Copyright (C) 2005, 2006 XenSource Ltd ++ * Copyright (C) 2007 Solarflare Communications, Inc. 
++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#define DPRINTK(fmt, args...) \ ++ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ ++ __func__, __LINE__, ##args) ++ ++#include <linux/kernel.h> ++#include <linux/err.h> ++#include <linux/string.h> ++#include <linux/ctype.h> ++#include <linux/fcntl.h> ++#include <linux/mm.h> ++#include <linux/notifier.h> ++ ++#include <asm/page.h> ++#include <asm/pgtable.h> ++#include <asm/xen/hypervisor.h> ++#include <asm/hypervisor.h> ++#include <xen/xenbus.h> ++#include <xen/features.h> ++ ++#include "xenbus_comms.h" ++#include "xenbus_probe.h" ++ ++/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */ ++static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) ++{ ++ int domid, err; ++ const char *devid, *type, *frontend; ++ unsigned int typelen; ++ ++ type = strchr(nodename, '/'); ++ if (!type) ++ return -EINVAL; ++ type++; ++ typelen = strcspn(type, "/"); ++ if (!typelen || type[typelen] != '/') ++ return -EINVAL; ++ ++ devid = strrchr(nodename, '/') + 1; ++ ++ err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid, ++ "frontend", NULL, &frontend, ++ NULL); ++ if (err) ++ return err; ++ if (strlen(frontend) == 0) ++ err = -ERANGE; ++ if (!err && !xenbus_exists(XBT_NIL, frontend, "")) ++ err = -ENOENT; ++ kfree(frontend); ++ ++ if (err) ++ return err; ++ ++ if (snprintf(bus_id, XEN_BUS_ID_SIZE, ++ "%.*s-%i-%s", typelen, type, domid, devid) >= XEN_BUS_ID_SIZE) ++ return -ENOSPC; ++ return 0; ++} ++ ++static int xenbus_uevent_backend(struct device *dev, ++ struct kobj_uevent_env *env) ++{ ++ struct xenbus_device *xdev; ++ struct xenbus_driver *drv; ++ struct xen_bus_type *bus; ++ ++ DPRINTK(""); ++ ++ if (dev == NULL) ++ return -ENODEV; ++ ++ xdev = to_xenbus_device(dev); ++ bus = container_of(xdev->dev.bus, struct xen_bus_type, bus); ++ if (xdev == NULL) ++ return -ENODEV; ++ ++ /* stuff we want to pass to /sbin/hotplug */ ++ if (add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype)) ++ return -ENOMEM; ++ ++ if (add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename)) ++ return -ENOMEM; ++ ++ if (add_uevent_var(env, "XENBUS_BASE_PATH=%s", bus->root)) ++ return 
-ENOMEM; ++ ++ if (dev->driver) { ++ drv = to_xenbus_driver(dev->driver); ++ if (drv && drv->uevent) ++ return drv->uevent(xdev, env); ++ } ++ ++ return 0; ++} ++ ++/* backend/<typename>/<frontend-uuid>/<name> */ ++static int xenbus_probe_backend_unit(struct xen_bus_type *bus, ++ const char *dir, ++ const char *type, ++ const char *name) ++{ ++ char *nodename; ++ int err; ++ ++ nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name); ++ if (!nodename) ++ return -ENOMEM; ++ ++ DPRINTK("%s\n", nodename); ++ ++ err = xenbus_probe_node(bus, type, nodename); ++ kfree(nodename); ++ return err; ++} ++ ++/* backend/<typename>/<frontend-domid> */ ++static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type, const char *domid) ++{ ++ char *nodename; ++ int err = 0; ++ char **dir; ++ unsigned int i, dir_n = 0; ++ ++ DPRINTK(""); ++ ++ nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, domid); ++ if (!nodename) ++ return -ENOMEM; ++ ++ dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n); ++ if (IS_ERR(dir)) { ++ kfree(nodename); ++ return PTR_ERR(dir); ++ } ++ ++ for (i = 0; i < dir_n; i++) { ++ err = xenbus_probe_backend_unit(bus, nodename, type, dir[i]); ++ if (err) ++ break; ++ } ++ kfree(dir); ++ kfree(nodename); ++ return err; ++} ++ ++static void frontend_changed(struct xenbus_watch *watch, ++ const char **vec, unsigned int len) ++{ ++ xenbus_otherend_changed(watch, vec, len, 0); ++} ++ ++static struct device_attribute xenbus_backend_dev_attrs[] = { ++ __ATTR_NULL ++}; ++ ++static struct xen_bus_type xenbus_backend = { ++ .root = "backend", ++ .levels = 3, /* backend/type/<frontend>/<id> */ ++ .get_bus_id = backend_bus_id, ++ .probe = xenbus_probe_backend, ++ .otherend_changed = frontend_changed, ++ .bus = { ++ .name = "xen-backend", ++ .match = xenbus_match, ++ .uevent = xenbus_uevent_backend, ++ .probe = xenbus_dev_probe, ++ .remove = xenbus_dev_remove, ++ .shutdown = xenbus_dev_shutdown, ++ .dev_attrs = xenbus_backend_dev_attrs, ++ }, ++}; ++ ++static void backend_changed(struct xenbus_watch *watch, ++ const char **vec, unsigned int len) ++{ ++ DPRINTK(""); ++ ++ xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_backend); ++} ++ ++static struct xenbus_watch be_watch = { ++ .node = "backend", ++ .callback = backend_changed, ++}; ++ ++static int read_frontend_details(struct xenbus_device *xendev) ++{ ++ return xenbus_read_otherend_details(xendev, "frontend-id", "frontend"); ++} ++ ++//void xenbus_backend_suspend(int (*fn)(struct device *, void *)) ++//{ ++// DPRINTK(""); ++// bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn); ++//} ++ ++//void xenbus_backend_resume(int (*fn)(struct device *, void *)) ++//{ ++// DPRINTK(""); ++// bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn); ++//} ++ ++//int xenbus_for_each_backend(void *arg, int (*fn)(struct device *, void *)) ++//{ ++// return bus_for_each_dev(&xenbus_backend.bus, NULL, arg, fn); ++//} ++//EXPORT_SYMBOL_GPL(xenbus_for_each_backend); ++ ++int xenbus_dev_is_online(struct xenbus_device *dev) ++{ ++ int rc, val; ++ ++ rc = xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &val); ++ if (rc != 1) ++ val = 0; /* no online node present */ ++ ++ return val; ++} ++EXPORT_SYMBOL_GPL(xenbus_dev_is_online); ++ ++int __xenbus_register_backend(struct xenbus_driver *drv, ++ struct module *owner, const char *mod_name) ++{ ++ drv->read_otherend_details = read_frontend_details; ++ ++ return xenbus_register_driver_common(drv, &xenbus_backend, ++ owner, mod_name); ++} ++EXPORT_SYMBOL_GPL(__xenbus_register_backend); ++ 
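
The function that follows, backend_probe_and_watch(), shows the new pattern this patch introduces: rather than xenbus_probe() enumerating devices directly, each bus half registers a notifier on the xenstore chain and does its probing when xenstored comes up (per the earlier hunk, register_xenstore_notifier() now always queues on the blocking chain instead of calling back immediately). A minimal sketch of the same hook, assuming a built-in (non-module) driver compiled against this tree; the example_* names and the printk body are illustrative, only register_xenstore_notifier() and the notifier_block API come from the patch:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/notifier.h>
#include <xen/xenbus.h>

/* Called once xenstore comms are up (after xs_init() succeeds). */
static int example_xenstore_ready(struct notifier_block *nb,
                                  unsigned long event, void *data)
{
        /* Safe point to start xenbus_* traffic, e.g. enumerate
         * devices and register watches, as backend_probe_and_watch()
         * does below. */
        printk(KERN_INFO "example: xenstore is ready\n");
        return NOTIFY_DONE;
}

static struct notifier_block example_nb = {
        .notifier_call = example_xenstore_ready,
};

static int __init example_init(void)
{
        /* With this patch the callback is never invoked synchronously
         * from register_xenstore_notifier(); it fires only when
         * xenbus_probe() runs blocking_notifier_call_chain(). */
        register_xenstore_notifier(&example_nb);
        return 0;
}
subsys_initcall(example_init);

Both backend_probe_and_watch() and frontend_probe_and_watch() in this patch follow exactly this shape, which is what lets dom0 defer device enumeration until a userspace xenstored signals readiness.
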
++static int backend_probe_and_watch(struct notifier_block *notifier, ++ unsigned long event, ++ void *data) ++{ ++ /* Enumerate devices in xenstore and watch for changes. */ ++ xenbus_probe_devices(&xenbus_backend); ++ register_xenbus_watch(&be_watch); ++ ++ return NOTIFY_DONE; ++} ++ ++static int __init xenbus_probe_backend_init(void) ++{ ++ static struct notifier_block xenstore_notifier = { ++ .notifier_call = backend_probe_and_watch ++ }; ++ int err; ++ ++ DPRINTK(""); ++ ++ /* Register ourselves with the kernel bus subsystem */ ++ err = bus_register(&xenbus_backend.bus); ++ if (err) ++ return err; ++ ++ register_xenstore_notifier(&xenstore_notifier); ++ ++ return 0; ++} ++subsys_initcall(xenbus_probe_backend_init); +diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c +new file mode 100644 +index 0000000..5413248 +--- /dev/null ++++ b/drivers/xen/xenbus/xenbus_probe_frontend.c +@@ -0,0 +1,292 @@ ++#define DPRINTK(fmt, args...) \ ++ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ ++ __func__, __LINE__, ##args) ++ ++#include <linux/kernel.h> ++#include <linux/err.h> ++#include <linux/string.h> ++#include <linux/ctype.h> ++#include <linux/fcntl.h> ++#include <linux/mm.h> ++#include <linux/proc_fs.h> ++#include <linux/notifier.h> ++#include <linux/kthread.h> ++#include <linux/mutex.h> ++#include <linux/io.h> ++ ++#include <asm/page.h> ++#include <asm/pgtable.h> ++#include <asm/xen/hypervisor.h> ++#include <xen/xenbus.h> ++#include <xen/events.h> ++#include <xen/page.h> ++#include <xen/xen.h> ++#include <xen/platform_pci.h> ++ ++#include "xenbus_comms.h" ++#include "xenbus_probe.h" ++ ++/* device/<type>/<id> => <type>-<id> */ ++static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) ++{ ++ nodename = strchr(nodename, '/'); ++ if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) { ++ printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename); ++ return -EINVAL; ++ } ++ ++ strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE); ++ if (!strchr(bus_id, '/')) { ++ printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id); ++ return -EINVAL; ++ } ++ *strchr(bus_id, '/') = '-'; ++ return 0; ++} ++ ++/* device/<typename>/<name> */ ++static int xenbus_probe_frontend(struct xen_bus_type *bus, const char *type, const char *name) ++{ ++ char *nodename; ++ int err; ++ ++ nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, name); ++ if (!nodename) ++ return -ENOMEM; ++ ++ DPRINTK("%s", nodename); ++ ++ err = xenbus_probe_node(bus, type, nodename); ++ kfree(nodename); ++ return err; ++} ++ ++static int xenbus_uevent_frontend(struct device *_dev, struct kobj_uevent_env *env) ++{ ++ struct xenbus_device *dev = to_xenbus_device(_dev); ++ ++ if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype)) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++ ++static void backend_changed(struct xenbus_watch *watch, ++ const char **vec, unsigned int len) ++{ ++ xenbus_otherend_changed(watch, vec, len, 1); ++} ++ ++static struct device_attribute xenbus_frontend_dev_attrs[] = { ++ __ATTR_NULL ++}; ++ ++ ++static struct xen_bus_type xenbus_frontend = { ++ .root = "device", ++ .levels = 2, /* device/type/<id> */ ++ .get_bus_id = frontend_bus_id, ++ .probe = xenbus_probe_frontend, ++ .otherend_changed = backend_changed, ++ .bus = { ++ .name = "xen", ++ .match = xenbus_match, ++ .uevent = xenbus_uevent_frontend, ++ .probe = xenbus_dev_probe, ++ .remove = xenbus_dev_remove, ++ .shutdown = xenbus_dev_shutdown, ++ .dev_attrs= xenbus_frontend_dev_attrs, 
++ ++ .suspend = xenbus_dev_suspend, ++ .resume = xenbus_dev_resume, ++ }, ++}; ++ ++static void frontend_changed(struct xenbus_watch *watch, ++ const char **vec, unsigned int len) ++{ ++ DPRINTK(""); ++ ++ xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend); ++} ++ ++ ++/* We watch for devices appearing and vanishing. */ ++static struct xenbus_watch fe_watch = { ++ .node = "device", ++ .callback = frontend_changed, ++}; ++ ++static int read_backend_details(struct xenbus_device *xendev) ++{ ++ return xenbus_read_otherend_details(xendev, "backend-id", "backend"); ++} ++ ++static int is_device_connecting(struct device *dev, void *data) ++{ ++ struct xenbus_device *xendev = to_xenbus_device(dev); ++ struct device_driver *drv = data; ++ struct xenbus_driver *xendrv; ++ ++ /* ++ * A device with no driver will never connect. We care only about ++ * devices which should currently be in the process of connecting. ++ */ ++ if (!dev->driver) ++ return 0; ++ ++ /* Is this search limited to a particular driver? */ ++ if (drv && (dev->driver != drv)) ++ return 0; ++ ++ xendrv = to_xenbus_driver(dev->driver); ++ return (xendev->state < XenbusStateConnected || ++ (xendev->state == XenbusStateConnected && ++ xendrv->is_ready && !xendrv->is_ready(xendev))); ++} ++ ++static int exists_connecting_device(struct device_driver *drv) ++{ ++ return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, ++ is_device_connecting); ++} ++ ++static int print_device_status(struct device *dev, void *data) ++{ ++ struct xenbus_device *xendev = to_xenbus_device(dev); ++ struct device_driver *drv = data; ++ ++ /* Is this operation limited to a particular driver? */ ++ if (drv && (dev->driver != drv)) ++ return 0; ++ ++ if (!dev->driver) { ++ /* Information only: is this too noisy? */ ++ printk(KERN_INFO "XENBUS: Device with no driver: %s\n", ++ xendev->nodename); ++ } else if (xendev->state < XenbusStateConnected) { ++ enum xenbus_state rstate = XenbusStateUnknown; ++ if (xendev->otherend) ++ rstate = xenbus_read_driver_state(xendev->otherend); ++ printk(KERN_WARNING "XENBUS: Timeout connecting " ++ "to device: %s (local state %d, remote state %d)\n", ++ xendev->nodename, xendev->state, rstate); ++ } ++ ++ return 0; ++} ++ ++/* We only wait for device setup after most initcalls have run. */ ++static int ready_to_wait_for_devices; ++ ++/* ++ * On a 5-minute timeout, wait for all devices currently configured. We need ++ * to do this to guarantee that the filesystems and / or network devices ++ * needed for boot are available, before we can allow the boot to proceed. ++ * ++ * This needs to be on a late_initcall, to happen after the frontend device ++ * drivers have been initialised, but before the root fs is mounted. ++ * ++ * A possible improvement here would be to have the tools add a per-device ++ * flag to the store entry, indicating whether it is needed at boot time. ++ * This would allow people who knew what they were doing to accelerate their ++ * boot slightly, but of course needs tools or manual intervention to set up ++ * those flags correctly. ++ */ ++static void wait_for_devices(struct xenbus_driver *xendrv) ++{ ++ unsigned long start = jiffies; ++ struct device_driver *drv = xendrv ? 
&xendrv->driver : NULL; ++ unsigned int seconds_waited = 0; ++ ++ if (!ready_to_wait_for_devices || !xen_domain()) ++ return; ++ ++ while (exists_connecting_device(drv)) { ++ if (time_after(jiffies, start + (seconds_waited+5)*HZ)) { ++ if (!seconds_waited) ++ printk(KERN_WARNING "XENBUS: Waiting for " ++ "devices to initialise: "); ++ seconds_waited += 5; ++ printk("%us...", 300 - seconds_waited); ++ if (seconds_waited == 300) ++ break; ++ } ++ ++ schedule_timeout_interruptible(HZ/10); ++ } ++ ++ if (seconds_waited) ++ printk("\n"); ++ ++ bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, ++ print_device_status); ++} ++ ++int __xenbus_register_frontend(struct xenbus_driver *drv, ++ struct module *owner, const char *mod_name) ++{ ++ int ret; ++ ++ drv->read_otherend_details = read_backend_details; ++ ++ ret = xenbus_register_driver_common(drv, &xenbus_frontend, ++ owner, mod_name); ++ if (ret) ++ return ret; ++ ++ /* If this driver is loaded as a module wait for devices to attach. */ ++ wait_for_devices(drv); ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(__xenbus_register_frontend); ++ ++static int frontend_probe_and_watch(struct notifier_block *notifier, ++ unsigned long event, ++ void *data) ++{ ++ /* Enumerate devices in xenstore and watch for changes. */ ++ xenbus_probe_devices(&xenbus_frontend); ++ register_xenbus_watch(&fe_watch); ++ ++ return NOTIFY_DONE; ++} ++ ++ ++static int __init xenbus_probe_frontend_init(void) ++{ ++ static struct notifier_block xenstore_notifier = { ++ .notifier_call = frontend_probe_and_watch ++ }; ++ int err; ++ ++ DPRINTK(""); ++ ++ /* Register ourselves with the kernel bus subsystem */ ++ err = bus_register(&xenbus_frontend.bus); ++ if (err) ++ return err; ++ ++ register_xenstore_notifier(&xenstore_notifier); ++ ++ return 0; ++} ++subsys_initcall(xenbus_probe_frontend_init); ++ ++#ifndef MODULE ++static int __init boot_wait_for_devices(void) ++{ ++ if (xen_hvm_domain() && !xen_platform_pci_unplug) ++ return -ENODEV; ++ ++ ready_to_wait_for_devices = 1; ++ wait_for_devices(NULL); ++ return 0; ++} ++ ++late_initcall(boot_wait_for_devices); ++#endif ++ ++MODULE_LICENSE("GPL"); +diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c +index 7b547f5..5534690 100644 +--- a/drivers/xen/xenbus/xenbus_xs.c ++++ b/drivers/xen/xenbus/xenbus_xs.c +@@ -76,6 +76,14 @@ struct xs_handle { + /* + * Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex. + * response_mutex is never taken simultaneously with the other three. ++ * ++ * transaction_mutex must be held before incrementing ++ * transaction_count. The mutex is held when a suspend is in ++ * progress to prevent new transactions starting. ++ * ++ * When decrementing transaction_count to zero the wait queue ++ * should be woken up, the suspend code waits for count to ++ * reach zero. + */ + + /* One request at a time. */ +@@ -85,7 +93,9 @@ struct xs_handle { + struct mutex response_mutex; + + /* Protect transactions against save/restore. */ +- struct rw_semaphore transaction_mutex; ++ struct mutex transaction_mutex; ++ atomic_t transaction_count; ++ wait_queue_head_t transaction_wq; + + /* Protect watch (de)register against save/restore. 
*/ + struct rw_semaphore watch_mutex; +@@ -157,6 +167,31 @@ static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) + return body; + } + ++static void transaction_start(void) ++{ ++ mutex_lock(&xs_state.transaction_mutex); ++ atomic_inc(&xs_state.transaction_count); ++ mutex_unlock(&xs_state.transaction_mutex); ++} ++ ++static void transaction_end(void) ++{ ++ if (atomic_dec_and_test(&xs_state.transaction_count)) ++ wake_up(&xs_state.transaction_wq); ++} ++ ++static void transaction_suspend(void) ++{ ++ mutex_lock(&xs_state.transaction_mutex); ++ wait_event(xs_state.transaction_wq, ++ atomic_read(&xs_state.transaction_count) == 0); ++} ++ ++static void transaction_resume(void) ++{ ++ mutex_unlock(&xs_state.transaction_mutex); ++} ++ + void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) + { + void *ret; +@@ -164,7 +199,7 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) + int err; + + if (req_msg.type == XS_TRANSACTION_START) +- down_read(&xs_state.transaction_mutex); ++ transaction_start(); + + mutex_lock(&xs_state.request_mutex); + +@@ -180,7 +215,7 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) + if ((msg->type == XS_TRANSACTION_END) || + ((req_msg.type == XS_TRANSACTION_START) && + (msg->type == XS_ERROR))) +- up_read(&xs_state.transaction_mutex); ++ transaction_end(); + + return ret; + } +@@ -432,11 +467,11 @@ int xenbus_transaction_start(struct xenbus_transaction *t) + { + char *id_str; + +- down_read(&xs_state.transaction_mutex); ++ transaction_start(); + + id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL); + if (IS_ERR(id_str)) { +- up_read(&xs_state.transaction_mutex); ++ transaction_end(); + return PTR_ERR(id_str); + } + +@@ -461,7 +496,7 @@ int xenbus_transaction_end(struct xenbus_transaction t, int abort) + + err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL)); + +- up_read(&xs_state.transaction_mutex); ++ transaction_end(); + + return err; + } +@@ -662,7 +697,7 @@ EXPORT_SYMBOL_GPL(unregister_xenbus_watch); + + void xs_suspend(void) + { +- down_write(&xs_state.transaction_mutex); ++ transaction_suspend(); + down_write(&xs_state.watch_mutex); + mutex_lock(&xs_state.request_mutex); + mutex_lock(&xs_state.response_mutex); +@@ -677,7 +712,7 @@ void xs_resume(void) + + mutex_unlock(&xs_state.response_mutex); + mutex_unlock(&xs_state.request_mutex); +- up_write(&xs_state.transaction_mutex); ++ transaction_resume(); + + /* No need for watches_lock: the watch_mutex is sufficient. 
*/ + list_for_each_entry(watch, &watches, list) { +@@ -693,7 +728,7 @@ void xs_suspend_cancel(void) + mutex_unlock(&xs_state.response_mutex); + mutex_unlock(&xs_state.request_mutex); + up_write(&xs_state.watch_mutex); +- up_write(&xs_state.transaction_mutex); ++ mutex_unlock(&xs_state.transaction_mutex); + } + + static int xenwatch_thread(void *unused) +@@ -843,8 +878,10 @@ int xs_init(void) + + mutex_init(&xs_state.request_mutex); + mutex_init(&xs_state.response_mutex); +- init_rwsem(&xs_state.transaction_mutex); ++ mutex_init(&xs_state.transaction_mutex); + init_rwsem(&xs_state.watch_mutex); ++ atomic_set(&xs_state.transaction_count, 0); ++ init_waitqueue_head(&xs_state.transaction_wq); + + /* Initialize the shared memory rings to talk to xenstored */ + err = xb_init_comms(); +diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile +index 25275c3..4fde944 100644 +--- a/drivers/xen/xenfs/Makefile ++++ b/drivers/xen/xenfs/Makefile +@@ -1,3 +1,4 @@ + obj-$(CONFIG_XENFS) += xenfs.o + +-xenfs-objs = super.o xenbus.o +\ No newline at end of file ++xenfs-y = super.o xenbus.o privcmd.o ++xenfs-$(CONFIG_XEN_DOM0) += xenstored.o +diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c +new file mode 100644 +index 0000000..f80be7f +--- /dev/null ++++ b/drivers/xen/xenfs/privcmd.c +@@ -0,0 +1,404 @@ ++/****************************************************************************** ++ * privcmd.c ++ * ++ * Interface to privileged domain-0 commands. ++ * ++ * Copyright (c) 2002-2004, K A Fraser, B Dragovic ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/string.h> ++#include <linux/errno.h> ++#include <linux/mm.h> ++#include <linux/mman.h> ++#include <linux/uaccess.h> ++#include <linux/swap.h> ++#include <linux/smp_lock.h> ++#include <linux/highmem.h> ++#include <linux/pagemap.h> ++#include <linux/seq_file.h> ++ ++#include <asm/pgalloc.h> ++#include <asm/pgtable.h> ++#include <asm/tlb.h> ++#include <asm/xen/hypervisor.h> ++#include <asm/xen/hypercall.h> ++ ++#include <xen/xen.h> ++#include <xen/privcmd.h> ++#include <xen/interface/xen.h> ++#include <xen/features.h> ++#include <xen/page.h> ++#include <xen/xen-ops.h> ++ ++#ifndef HAVE_ARCH_PRIVCMD_MMAP ++static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma); ++#endif ++ ++static long privcmd_ioctl_hypercall(void __user *udata) ++{ ++ struct privcmd_hypercall hypercall; ++ long ret; ++ ++ if (copy_from_user(&hypercall, udata, sizeof(hypercall))) ++ return -EFAULT; ++ ++ ret = privcmd_call(hypercall.op, ++ hypercall.arg[0], hypercall.arg[1], ++ hypercall.arg[2], hypercall.arg[3], ++ hypercall.arg[4]); ++ ++ return ret; ++} ++ ++static void free_page_list(struct list_head *pages) ++{ ++ struct page *p, *n; ++ ++ list_for_each_entry_safe(p, n, pages, lru) ++ __free_page(p); ++ ++ INIT_LIST_HEAD(pages); ++} ++ ++/* ++ * Given an array of items in userspace, return a list of pages ++ * containing the data. If copying fails, either because of memory ++ * allocation failure or a problem reading user memory, return an ++ * error code; its up to the caller to dispose of any partial list. 
++ */ ++static int gather_array(struct list_head *pagelist, ++ unsigned nelem, size_t size, ++ void __user *data) ++{ ++ unsigned pageidx; ++ void *pagedata; ++ int ret; ++ ++ if (size > PAGE_SIZE) ++ return 0; ++ ++ pageidx = PAGE_SIZE; ++ pagedata = NULL; /* quiet, gcc */ ++ while (nelem--) { ++ if (pageidx > PAGE_SIZE-size) { ++ struct page *page = alloc_page(GFP_KERNEL); ++ ++ ret = -ENOMEM; ++ if (page == NULL) ++ goto fail; ++ ++ pagedata = page_address(page); ++ ++ list_add_tail(&page->lru, pagelist); ++ pageidx = 0; ++ } ++ ++ ret = -EFAULT; ++ if (copy_from_user(pagedata + pageidx, data, size)) ++ goto fail; ++ ++ data += size; ++ pageidx += size; ++ } ++ ++ ret = 0; ++ ++fail: ++ return ret; ++} ++ ++/* ++ * Call function "fn" on each element of the array fragmented ++ * over a list of pages. ++ */ ++static int traverse_pages(unsigned nelem, size_t size, ++ struct list_head *pos, ++ int (*fn)(void *data, void *state), ++ void *state) ++{ ++ void *pagedata; ++ unsigned pageidx; ++ int ret = 0; ++ ++ BUG_ON(size > PAGE_SIZE); ++ ++ pageidx = PAGE_SIZE; ++ pagedata = NULL; /* hush, gcc */ ++ ++ while (nelem--) { ++ if (pageidx > PAGE_SIZE-size) { ++ struct page *page; ++ pos = pos->next; ++ page = list_entry(pos, struct page, lru); ++ pagedata = page_address(page); ++ pageidx = 0; ++ } ++ ++ ret = (*fn)(pagedata + pageidx, state); ++ if (ret) ++ break; ++ pageidx += size; ++ } ++ ++ return ret; ++} ++ ++struct mmap_mfn_state { ++ unsigned long va; ++ struct vm_area_struct *vma; ++ domid_t domain; ++}; ++ ++static int mmap_mfn_range(void *data, void *state) ++{ ++ struct privcmd_mmap_entry *msg = data; ++ struct mmap_mfn_state *st = state; ++ struct vm_area_struct *vma = st->vma; ++ int rc; ++ ++ /* Do not allow range to wrap the address space. */ ++ if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) || ++ ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va)) ++ return -EINVAL; ++ ++ /* Range chunks must be contiguous in va space. 
*/ ++ if ((msg->va != st->va) || ++ ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end)) ++ return -EINVAL; ++ ++ rc = xen_remap_domain_mfn_range(vma, ++ msg->va & PAGE_MASK, ++ msg->mfn, msg->npages, ++ vma->vm_page_prot, ++ st->domain); ++ if (rc < 0) ++ return rc; ++ ++ st->va += msg->npages << PAGE_SHIFT; ++ ++ return 0; ++} ++ ++static long privcmd_ioctl_mmap(void __user *udata) ++{ ++ struct privcmd_mmap mmapcmd; ++ struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; ++ int rc; ++ LIST_HEAD(pagelist); ++ struct mmap_mfn_state state; ++ ++ if (!xen_initial_domain()) ++ return -EPERM; ++ ++ if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) ++ return -EFAULT; ++ ++ rc = gather_array(&pagelist, ++ mmapcmd.num, sizeof(struct privcmd_mmap_entry), ++ mmapcmd.entry); ++ ++ if (rc || list_empty(&pagelist)) ++ goto out; ++ ++ down_write(&mm->mmap_sem); ++ ++ { ++ struct page *page = list_first_entry(&pagelist, ++ struct page, lru); ++ struct privcmd_mmap_entry *msg = page_address(page); ++ ++ vma = find_vma(mm, msg->va); ++ rc = -EINVAL; ++ ++ if (!vma || (msg->va != vma->vm_start) || ++ !privcmd_enforce_singleshot_mapping(vma)) ++ goto out_up; ++ } ++ ++ state.va = vma->vm_start; ++ state.vma = vma; ++ state.domain = mmapcmd.dom; ++ ++ rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry), ++ &pagelist, ++ mmap_mfn_range, &state); ++ ++ ++out_up: ++ up_write(&mm->mmap_sem); ++ ++out: ++ free_page_list(&pagelist); ++ ++ return rc; ++} ++ ++struct mmap_batch_state { ++ domid_t domain; ++ unsigned long va; ++ struct vm_area_struct *vma; ++ int err; ++ ++ xen_pfn_t __user *user; ++}; ++ ++static int mmap_batch_fn(void *data, void *state) ++{ ++ xen_pfn_t *mfnp = data; ++ struct mmap_batch_state *st = state; ++ ++ if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, ++ st->vma->vm_page_prot, st->domain) < 0) { ++ *mfnp |= 0xf0000000U; ++ st->err++; ++ } ++ st->va += PAGE_SIZE; ++ ++ return 0; ++} ++ ++static int mmap_return_errors(void *data, void *state) ++{ ++ xen_pfn_t *mfnp = data; ++ struct mmap_batch_state *st = state; ++ ++ put_user(*mfnp, st->user++); ++ ++ return 0; ++} ++ ++static struct vm_operations_struct privcmd_vm_ops; ++ ++static long privcmd_ioctl_mmap_batch(void __user *udata) ++{ ++ int ret; ++ struct privcmd_mmapbatch m; ++ struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; ++ unsigned long nr_pages; ++ LIST_HEAD(pagelist); ++ struct mmap_batch_state state; ++ ++ if (!xen_initial_domain()) ++ return -EPERM; ++ ++ if (copy_from_user(&m, udata, sizeof(m))) ++ return -EFAULT; ++ ++ nr_pages = m.num; ++ if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) ++ return -EINVAL; ++ ++ ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), ++ m.arr); ++ ++ if (ret || list_empty(&pagelist)) ++ goto out; ++ ++ down_write(&mm->mmap_sem); ++ ++ vma = find_vma(mm, m.addr); ++ ret = -EINVAL; ++ if (!vma || ++ vma->vm_ops != &privcmd_vm_ops || ++ (m.addr != vma->vm_start) || ++ ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) || ++ !privcmd_enforce_singleshot_mapping(vma)) { ++ up_write(&mm->mmap_sem); ++ goto out; ++ } ++ ++ state.domain = m.dom; ++ state.vma = vma; ++ state.va = m.addr; ++ state.err = 0; ++ ++ ret = traverse_pages(m.num, sizeof(xen_pfn_t), ++ &pagelist, mmap_batch_fn, &state); ++ ++ up_write(&mm->mmap_sem); ++ ++ if (state.err > 0) { ++ ret = 0; ++ ++ state.user = m.arr; ++ traverse_pages(m.num, sizeof(xen_pfn_t), ++ &pagelist, ++ mmap_return_errors, &state); ++ } ++ ++out: ++ 
free_page_list(&pagelist); ++ ++ return ret; ++} ++ ++static long privcmd_ioctl(struct file *file, ++ unsigned int cmd, unsigned long data) ++{ ++ int ret = -ENOSYS; ++ void __user *udata = (void __user *) data; ++ ++ switch (cmd) { ++ case IOCTL_PRIVCMD_HYPERCALL: ++ ret = privcmd_ioctl_hypercall(udata); ++ break; ++ ++ case IOCTL_PRIVCMD_MMAP: ++ ret = privcmd_ioctl_mmap(udata); ++ break; ++ ++ case IOCTL_PRIVCMD_MMAPBATCH: ++ ret = privcmd_ioctl_mmap_batch(udata); ++ break; ++ ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ return ret; ++} ++ ++#ifndef HAVE_ARCH_PRIVCMD_MMAP ++static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ++{ ++ printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", ++ vma, vma->vm_start, vma->vm_end, ++ vmf->pgoff, vmf->virtual_address); ++ ++ return VM_FAULT_SIGBUS; ++} ++ ++static struct vm_operations_struct privcmd_vm_ops = { ++ .fault = privcmd_fault ++}; ++ ++static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ /* Unsupported for auto-translate guests. */ ++ if (xen_feature(XENFEAT_auto_translated_physmap)) ++ return -ENOSYS; ++ ++ /* DONTCOPY is essential for Xen as copy_page_range is broken. */ ++ vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; ++ vma->vm_ops = &privcmd_vm_ops; ++ vma->vm_private_data = NULL; ++ ++ return 0; ++} ++ ++static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma) ++{ ++ return (xchg(&vma->vm_private_data, (void *)1) == NULL); ++} ++#endif ++ ++const struct file_operations privcmd_file_ops = { ++ .unlocked_ioctl = privcmd_ioctl, ++ .mmap = privcmd_mmap, ++}; +diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c +index 6559e0c..984891e 100644 +--- a/drivers/xen/xenfs/super.c ++++ b/drivers/xen/xenfs/super.c +@@ -12,6 +12,10 @@ + #include <linux/module.h> + #include <linux/fs.h> + #include <linux/magic.h> ++#include <linux/mm.h> ++#include <linux/backing-dev.h> ++ ++#include <xen/xen.h> + + #include "xenfs.h" + +@@ -20,6 +24,62 @@ + MODULE_DESCRIPTION("Xen filesystem"); + MODULE_LICENSE("GPL"); + ++static int xenfs_set_page_dirty(struct page *page) ++{ ++ return !TestSetPageDirty(page); ++} ++ ++static const struct address_space_operations xenfs_aops = { ++ .set_page_dirty = xenfs_set_page_dirty, ++}; ++ ++static struct backing_dev_info xenfs_backing_dev_info = { ++ .ra_pages = 0, /* No readahead */ ++ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, ++}; ++ ++static struct inode *xenfs_make_inode(struct super_block *sb, int mode) ++{ ++ struct inode *ret = new_inode(sb); ++ ++ if (ret) { ++ ret->i_mode = mode; ++ ret->i_mapping->a_ops = &xenfs_aops; ++ ret->i_mapping->backing_dev_info = &xenfs_backing_dev_info; ++ ret->i_uid = ret->i_gid = 0; ++ ret->i_blocks = 0; ++ ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME; ++ } ++ return ret; ++} ++ ++static struct dentry *xenfs_create_file(struct super_block *sb, ++ struct dentry *parent, ++ const char *name, ++ const struct file_operations *fops, ++ void *data, ++ int mode) ++{ ++ struct dentry *dentry; ++ struct inode *inode; ++ ++ dentry = d_alloc_name(parent, name); ++ if (!dentry) ++ return NULL; ++ ++ inode = xenfs_make_inode(sb, S_IFREG | mode); ++ if (!inode) { ++ dput(dentry); ++ return NULL; ++ } ++ ++ inode->i_fop = fops; ++ inode->i_private = data; ++ ++ d_add(dentry, inode); ++ return dentry; ++} ++ + static ssize_t capabilities_read(struct file *file, char __user *buf, + size_t size, loff_t *off) + { +@@ -41,10 +101,23 @@ static int xenfs_fill_super(struct 
super_block *sb, void *data, int silent) + [1] = {}, + { "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR }, + { "capabilities", &capabilities_file_ops, S_IRUGO }, ++ { "privcmd", &privcmd_file_ops, S_IRUSR|S_IWUSR }, + {""}, + }; ++ int rc; ++ ++ rc = simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files); ++ if (rc < 0) ++ return rc; ++ ++ if (xen_initial_domain()) { ++ xenfs_create_file(sb, sb->s_root, "xsd_kva", ++ &xsd_kva_file_ops, NULL, S_IRUSR|S_IWUSR); ++ xenfs_create_file(sb, sb->s_root, "xsd_port", ++ &xsd_port_file_ops, NULL, S_IRUSR|S_IWUSR); ++ } + +- return simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files); ++ return rc; + } + + static int xenfs_get_sb(struct file_system_type *fs_type, +@@ -63,16 +136,30 @@ static struct file_system_type xenfs_type = { + + static int __init xenfs_init(void) + { +- if (xen_pv_domain()) +- return register_filesystem(&xenfs_type); ++ int err; ++ if (!xen_domain()) { ++ printk(KERN_INFO "xenfs: not registering filesystem on non-xen platform\n"); ++ return 0; ++ } ++ ++ err = register_filesystem(&xenfs_type); ++ if (err) { ++ printk(KERN_ERR "xenfs: Unable to register filesystem!\n"); ++ goto out; ++ } ++ ++ err = bdi_init(&xenfs_backing_dev_info); ++ if (err) ++ unregister_filesystem(&xenfs_type); ++ ++ out: + +- printk(KERN_INFO "XENFS: not registering filesystem on non-xen platform\n"); +- return 0; ++ return err; + } + + static void __exit xenfs_exit(void) + { +- if (xen_pv_domain()) ++ if (xen_domain()) + unregister_filesystem(&xenfs_type); + } + +diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenfs/xenbus.c +index 6c4269b..0ddef43 100644 +--- a/drivers/xen/xenfs/xenbus.c ++++ b/drivers/xen/xenfs/xenbus.c +@@ -121,8 +121,12 @@ static ssize_t xenbus_file_read(struct file *filp, + int ret; + + mutex_lock(&u->reply_mutex); ++again: + while (list_empty(&u->read_buffers)) { + mutex_unlock(&u->reply_mutex); ++ if (filp->f_flags & O_NONBLOCK) ++ return -EAGAIN; ++ + ret = wait_event_interruptible(u->read_waitq, + !list_empty(&u->read_buffers)); + if (ret) +@@ -140,7 +144,7 @@ static ssize_t xenbus_file_read(struct file *filp, + i += sz - ret; + rb->cons += sz - ret; + +- if (ret != sz) { ++ if (ret != 0) { + if (i == 0) + i = -EFAULT; + goto out; +@@ -156,6 +160,8 @@ static ssize_t xenbus_file_read(struct file *filp, + struct read_buffer, list); + } + } ++ if (i == 0) ++ goto again; + + out: + mutex_unlock(&u->reply_mutex); +@@ -403,6 +409,7 @@ static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u) + + mutex_lock(&u->reply_mutex); + rc = queue_reply(&u->read_buffers, &reply, sizeof(reply)); ++ wake_up(&u->read_waitq); + mutex_unlock(&u->reply_mutex); + } + +@@ -451,7 +458,7 @@ static ssize_t xenbus_file_write(struct file *filp, + + ret = copy_from_user(u->u.buffer + u->len, ubuf, len); + +- if (ret == len) { ++ if (ret != 0) { + rc = -EFAULT; + goto out; + } +@@ -484,21 +491,6 @@ static ssize_t xenbus_file_write(struct file *filp, + msg_type = u->u.msg.type; + + switch (msg_type) { +- case XS_TRANSACTION_START: +- case XS_TRANSACTION_END: +- case XS_DIRECTORY: +- case XS_READ: +- case XS_GET_PERMS: +- case XS_RELEASE: +- case XS_GET_DOMAIN_PATH: +- case XS_WRITE: +- case XS_MKDIR: +- case XS_RM: +- case XS_SET_PERMS: +- /* Send out a transaction */ +- ret = xenbus_write_transaction(msg_type, u); +- break; +- + case XS_WATCH: + case XS_UNWATCH: + /* (Un)Ask for some path to be watched for changes */ +@@ -506,7 +498,8 @@ static ssize_t xenbus_file_write(struct file *filp, + break; + + default: +- ret = -EINVAL; ++ /* Send 
out a transaction */ ++ ret = xenbus_write_transaction(msg_type, u); + break; + } + if (ret != 0) +diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h +index 51f08b2..b68aa62 100644 +--- a/drivers/xen/xenfs/xenfs.h ++++ b/drivers/xen/xenfs/xenfs.h +@@ -2,5 +2,8 @@ + #define _XENFS_XENBUS_H + + extern const struct file_operations xenbus_file_ops; ++extern const struct file_operations privcmd_file_ops; ++extern const struct file_operations xsd_kva_file_ops; ++extern const struct file_operations xsd_port_file_ops; + + #endif /* _XENFS_XENBUS_H */ +diff --git a/drivers/xen/xenfs/xenstored.c b/drivers/xen/xenfs/xenstored.c +new file mode 100644 +index 0000000..af10804 +--- /dev/null ++++ b/drivers/xen/xenfs/xenstored.c +@@ -0,0 +1,67 @@ ++#include <linux/types.h> ++#include <linux/mm.h> ++#include <linux/fs.h> ++ ++#include <xen/page.h> ++ ++#include "xenfs.h" ++#include "../xenbus/xenbus_comms.h" ++ ++static ssize_t xsd_read(struct file *file, char __user *buf, ++ size_t size, loff_t *off) ++{ ++ const char *str = (const char *)file->private_data; ++ return simple_read_from_buffer(buf, size, off, str, strlen(str)); ++} ++ ++static int xsd_release(struct inode *inode, struct file *file) ++{ ++ kfree(file->private_data); ++ return 0; ++} ++ ++static int xsd_kva_open(struct inode *inode, struct file *file) ++{ ++ file->private_data = (void *)kasprintf(GFP_KERNEL, "0x%p", ++ xen_store_interface); ++ if (!file->private_data) ++ return -ENOMEM; ++ return 0; ++} ++ ++static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ size_t size = vma->vm_end - vma->vm_start; ++ ++ if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0)) ++ return -EINVAL; ++ ++ if (remap_pfn_range(vma, vma->vm_start, ++ virt_to_pfn(xen_store_interface), ++ size, vma->vm_page_prot)) ++ return -EAGAIN; ++ ++ return 0; ++} ++ ++const struct file_operations xsd_kva_file_ops = { ++ .open = xsd_kva_open, ++ .mmap = xsd_kva_mmap, ++ .read = xsd_read, ++ .release = xsd_release, ++}; ++ ++static int xsd_port_open(struct inode *inode, struct file *file) ++{ ++ file->private_data = (void *)kasprintf(GFP_KERNEL, "%d", ++ xen_store_evtchn); ++ if (!file->private_data) ++ return -ENOMEM; ++ return 0; ++} ++ ++const struct file_operations xsd_port_file_ops = { ++ .open = xsd_port_open, ++ .read = xsd_read, ++ .release = xsd_release, ++}; +diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h +index f4906f6..e7233e8 100644 +--- a/include/acpi/acpi_drivers.h ++++ b/include/acpi/acpi_drivers.h +@@ -154,4 +154,25 @@ static inline void unregister_hotplug_dock_device(acpi_handle handle) + } + #endif + ++/*-------------------------------------------------------------------------- ++ Memory ++ -------------------------------------------------------------------------- */ ++#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \ ++ defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE) ++struct acpi_memory_info { ++ struct list_head list; ++ u64 start_addr; /* Memory Range start physical addr */ ++ u64 length; /* Memory Range length */ ++ unsigned short caching; /* memory cache attribute */ ++ unsigned short write_protect; /* memory read/write attribute */ ++ unsigned int enabled:1; ++}; ++ ++struct acpi_memory_device { ++ struct acpi_device *device; ++ unsigned int state; /* State of the memory device */ ++ struct list_head res_list; ++}; ++#endif ++ + #endif /*__ACPI_DRIVERS_H__*/ +diff --git a/include/acpi/processor.h b/include/acpi/processor.h +index e7bdaaf..6aa3111 100644 +--- a/include/acpi/processor.h ++++ 
b/include/acpi/processor.h +@@ -239,6 +239,25 @@ struct acpi_processor_errata { + } piix4; + }; + ++extern int acpi_processor_errata(struct acpi_processor *pr); ++#ifdef CONFIG_ACPI_PROCFS ++extern int acpi_processor_add_fs(struct acpi_device *device); ++extern int acpi_processor_remove_fs(struct acpi_device *device); ++#else ++static inline int acpi_processor_add_fs(struct acpi_device *device) ++{ ++ return 0; ++} ++ ++static inline int acpi_processor_remove_fs(struct acpi_device *device) ++{ ++ return 0; ++} ++#endif ++extern int acpi_processor_set_pdc(struct acpi_processor *pr); ++extern int acpi_processor_remove(struct acpi_device *device, int type); ++extern void acpi_processor_notify(struct acpi_device *device, u32 event); ++ + extern int acpi_processor_preregister_performance(struct + acpi_processor_performance + *performance); +@@ -296,6 +315,8 @@ static inline void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx + void acpi_processor_ppc_init(void); + void acpi_processor_ppc_exit(void); + int acpi_processor_ppc_has_changed(struct acpi_processor *pr); ++int acpi_processor_get_performance_info(struct acpi_processor *pr); ++int acpi_processor_get_psd(struct acpi_processor *pr); + #else + static inline void acpi_processor_ppc_init(void) + { +@@ -332,6 +353,7 @@ int acpi_processor_power_init(struct acpi_processor *pr, + int acpi_processor_cst_has_changed(struct acpi_processor *pr); + int acpi_processor_power_exit(struct acpi_processor *pr, + struct acpi_device *device); ++int acpi_processor_get_power_info(struct acpi_processor *pr); + int acpi_processor_suspend(struct acpi_device * device, pm_message_t state); + int acpi_processor_resume(struct acpi_device * device); + extern struct cpuidle_driver acpi_idle_driver; +diff --git a/include/asm-generic/pci.h b/include/asm-generic/pci.h +index 26373cf..9fb4270 100644 +--- a/include/asm-generic/pci.h ++++ b/include/asm-generic/pci.h +@@ -43,6 +43,8 @@ pcibios_select_root(struct pci_dev *pdev, struct resource *res) + return root; + } + ++#ifndef HAVE_ARCH_PCIBIOS_SCAN_ALL_FNS ++#endif + #ifndef HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ + static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) + { +diff --git a/include/drm/drmP.h b/include/drm/drmP.h +index 7ad3faa..cf9ddce 100644 +--- a/include/drm/drmP.h ++++ b/include/drm/drmP.h +@@ -1388,7 +1388,7 @@ extern int drm_vma_info(struct seq_file *m, void *data); + #endif + + /* Scatter Gather Support (drm_scatter.h) */ +-extern void drm_sg_cleanup(struct drm_sg_mem * entry); ++extern void drm_sg_cleanup(struct drm_device *dev, struct drm_sg_mem * entry); + extern int drm_sg_alloc_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv); + extern int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request); +diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h +index dd97fb8..b10ec49 100644 +--- a/include/linux/bootmem.h ++++ b/include/linux/bootmem.h +@@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat, + unsigned long addr, + unsigned long size); + extern void free_bootmem(unsigned long addr, unsigned long size); ++extern void free_bootmem_late(unsigned long addr, unsigned long size); + + /* + * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, +diff --git a/include/linux/dmar.h b/include/linux/dmar.h +index 4a2b162..5de4c9e 100644 +--- a/include/linux/dmar.h ++++ b/include/linux/dmar.h +@@ -208,16 +208,9 @@ struct dmar_atsr_unit { + u8 include_all:1; /* include all ports */ + }; + +-/* Intel DMAR 
initialization functions */ + extern int intel_iommu_init(void); +-#else +-static inline int intel_iommu_init(void) +-{ +-#ifdef CONFIG_INTR_REMAP +- return dmar_dev_scope_init(); +-#else +- return -ENODEV; +-#endif +-} +-#endif /* !CONFIG_DMAR */ ++#else /* !CONFIG_DMAR: */ ++static inline int intel_iommu_init(void) { return -ENODEV; } ++#endif /* CONFIG_DMAR */ ++ + #endif /* __DMAR_H__ */ +diff --git a/include/linux/fb.h b/include/linux/fb.h +index 862e7d4..74d67ca 100644 +--- a/include/linux/fb.h ++++ b/include/linux/fb.h +@@ -763,6 +763,7 @@ struct fb_tile_ops { + * takes over; acceleration engine should be in a quiescent state */ + + /* hints */ ++#define FBINFO_VIRTFB 0x0004 /* FB is System RAM, not device. */ + #define FBINFO_PARTIAL_PAN_OK 0x0040 /* otw use pan only for double-buffering */ + #define FBINFO_READS_FAST 0x0080 /* soft-copy faster than rendering */ + +diff --git a/include/linux/if_link.h b/include/linux/if_link.h +index 176c518..d681cc9 100644 +--- a/include/linux/if_link.h ++++ b/include/linux/if_link.h +@@ -81,6 +81,8 @@ enum + #define IFLA_LINKINFO IFLA_LINKINFO + IFLA_NET_NS_PID, + IFLA_IFALIAS, ++ IFLA_NUM_VF, /* Number of VFs if device is SR-IOV PF */ ++ IFLA_VFINFO_LIST, + __IFLA_MAX + }; + +@@ -190,4 +192,47 @@ struct ifla_vlan_qos_mapping + __u32 to; + }; + ++/* SR-IOV virtual function managment section */ ++ ++enum { ++ IFLA_VF_INFO_UNSPEC, ++ IFLA_VF_INFO, ++ __IFLA_VF_INFO_MAX, ++}; ++ ++#define IFLA_VF_INFO_MAX (__IFLA_VF_INFO_MAX - 1) ++ ++enum { ++ IFLA_VF_UNSPEC, ++ IFLA_VF_MAC, /* Hardware queue specific attributes */ ++ IFLA_VF_VLAN, ++ IFLA_VF_TX_RATE, /* TX Bandwidth Allocation */ ++ __IFLA_VF_MAX, ++}; ++ ++#define IFLA_VF_MAX (__IFLA_VF_MAX - 1) ++ ++struct ifla_vf_mac { ++ __u32 vf; ++ __u8 mac[32]; /* MAX_ADDR_LEN */ ++}; ++ ++struct ifla_vf_vlan { ++ __u32 vf; ++ __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ ++ __u32 qos; ++}; ++ ++struct ifla_vf_tx_rate { ++ __u32 vf; ++ __u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */ ++}; ++ ++struct ifla_vf_info { ++ __u32 vf; ++ __u8 mac[32]; ++ __u32 vlan; ++ __u32 qos; ++ __u32 tx_rate; ++}; + #endif /* _LINUX_IF_LINK_H */ +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 11e5be6..4c98621 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -109,6 +109,12 @@ extern unsigned int kobjsize(const void *objp); + #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ + #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ + #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ ++#ifdef CONFIG_XEN ++#define VM_FOREIGN 0x20000000 /* Has pages belonging to another VM */ ++struct vm_foreign_map { ++ struct page **map; ++}; ++#endif + + #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ + #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +@@ -199,6 +205,11 @@ struct vm_operations_struct { + */ + int (*access)(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write); ++ ++ /* Area-specific function for clearing the PTE at @ptep. Returns the ++ * original value of @ptep. 
*/ ++ pte_t (*zap_pte)(struct vm_area_struct *vma, ++ unsigned long addr, pte_t *ptep, int is_fullmm); + #ifdef CONFIG_NUMA + /* + * set_policy() op must add a reference to any non-NULL @new mempolicy +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index ec12f8c..3f4991c 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -28,6 +28,7 @@ + #include <linux/if.h> + #include <linux/if_ether.h> + #include <linux/if_packet.h> ++#include <linux/if_link.h> + + #ifdef __KERNEL__ + #include <linux/timer.h> +@@ -577,6 +578,13 @@ struct netdev_queue { + * this function is called when a VLAN id is unregistered. + * + * void (*ndo_poll_controller)(struct net_device *dev); ++ * ++ * SR-IOV management functions. ++ * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac); ++ * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, u8 qos); ++ * int (*ndo_set_vf_tx_rate)(struct net_device *dev, int vf, int rate); ++ * int (*ndo_get_vf_config)(struct net_device *dev, ++ * int vf, struct ifla_vf_info *ivf); + */ + #define HAVE_NET_DEVICE_OPS + struct net_device_ops { +@@ -626,6 +634,15 @@ struct net_device_ops { + #define HAVE_NETDEV_POLL + void (*ndo_poll_controller)(struct net_device *dev); + #endif ++ int (*ndo_set_vf_mac)(struct net_device *dev, ++ int queue, u8 *mac); ++ int (*ndo_set_vf_vlan)(struct net_device *dev, ++ int queue, u16 vlan, u8 qos); ++ int (*ndo_set_vf_tx_rate)(struct net_device *dev, ++ int vf, int rate); ++ int (*ndo_get_vf_config)(struct net_device *dev, ++ int vf, ++ struct ifla_vf_info *ivf); + #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) + int (*ndo_fcoe_enable)(struct net_device *dev); + int (*ndo_fcoe_disable)(struct net_device *dev); +diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h +index 6b202b1..b03950e 100644 +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -105,6 +105,9 @@ enum pageflags { + #ifdef CONFIG_ARCH_USES_PG_UNCACHED + PG_uncached, /* Page has been mapped as uncached */ + #endif ++#ifdef CONFIG_XEN ++ PG_foreign, ++#endif + #ifdef CONFIG_MEMORY_FAILURE + PG_hwpoison, /* hardware poisoned page. Don't touch */ + #endif +@@ -275,6 +278,23 @@ PAGEFLAG(Uncached, uncached) + PAGEFLAG_FALSE(Uncached) + #endif + ++#ifdef CONFIG_XEN ++TESTPAGEFLAG(Foreign, foreign) ++__SETPAGEFLAG(Foreign, foreign) ++CLEARPAGEFLAG(Foreign, foreign) ++#define SetPageForeign(_page, dtor) do { \ ++ __SetPageForeign(_page); \ ++ BUG_ON((dtor) == (void (*)(struct page *, unsigned int))0); \ ++ (_page)->index = (long)(dtor); \ ++} while (0) ++#define _PageForeignDestructor(_page) \ ++ ((void (*)(struct page *, unsigned int))(_page)->index) ++#define PageForeignDestructor(_page, order) \ ++ _PageForeignDestructor(_page)(_page, order) ++#else ++PAGEFLAG_FALSE(Foreign) ++#endif ++ + #ifdef CONFIG_MEMORY_FAILURE + PAGEFLAG(HWPoison, hwpoison) + TESTSETFLAG(HWPoison, hwpoison) +diff --git a/include/linux/pci.h b/include/linux/pci.h +index e07d194..ca28e46 100644 +--- a/include/linux/pci.h ++++ b/include/linux/pci.h +@@ -609,6 +609,9 @@ extern void pci_remove_bus_device(struct pci_dev *dev); + extern void pci_stop_bus_device(struct pci_dev *dev); + void pci_setup_cardbus(struct pci_bus *bus); + extern void pci_sort_breadthfirst(void); ++#define dev_is_pci(d) ((d)->bus == &pci_bus_type) ++#define dev_is_pf(d) ((dev_is_pci(d) ? to_pci_dev(d)->is_physfn : false)) ++#define dev_num_vf(d) ((dev_is_pci(d) ? 
pci_num_vf(to_pci_dev(d)) : 0)) + + /* Generic PCI functions exported to card drivers */ + +@@ -1124,6 +1127,9 @@ static inline struct pci_dev *pci_get_bus_and_slot(unsigned int bus, + unsigned int devfn) + { return NULL; } + ++#define dev_is_pci(d) (false) ++#define dev_is_pf(d) (false) ++#define dev_num_vf(d) (0) + #endif /* CONFIG_PCI */ + + /* Include architecture-dependent settings and functions */ +@@ -1279,6 +1285,7 @@ void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar); + extern int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn); + extern void pci_disable_sriov(struct pci_dev *dev); + extern irqreturn_t pci_sriov_migration(struct pci_dev *dev); ++extern int pci_num_vf(struct pci_dev *dev); + #else + static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn) + { +@@ -1291,6 +1298,10 @@ static inline irqreturn_t pci_sriov_migration(struct pci_dev *dev) + { + return IRQ_NONE; + } ++static inline int pci_num_vf(struct pci_dev *dev) ++{ ++ return 0; ++} + #endif + + #if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE) +diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h +index fe2f4ee..b72b9e6 100644 +--- a/include/linux/pci_ids.h ++++ b/include/linux/pci_ids.h +@@ -2717,3 +2717,6 @@ + #define PCI_DEVICE_ID_RME_DIGI32 0x9896 + #define PCI_DEVICE_ID_RME_DIGI32_PRO 0x9897 + #define PCI_DEVICE_ID_RME_DIGI32_8 0x9898 ++ ++#define PCI_VENDOR_ID_XEN 0x5853 ++#define PCI_DEVICE_ID_XEN_PLATFORM 0x0001 +diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h +index 73b1f1c..113585a 100644 +--- a/include/linux/swiotlb.h ++++ b/include/linux/swiotlb.h +@@ -7,6 +7,8 @@ struct device; + struct dma_attrs; + struct scatterlist; + ++extern int swiotlb_force; ++ + /* + * Maximum allowable number of contiguous slabs to map, + * must be a power of 2. What is the appropriate value ? +@@ -20,9 +22,46 @@ struct scatterlist; + */ + #define IO_TLB_SHIFT 11 + +-extern void +-swiotlb_init(void); +- ++/* swiotlb-core.c */ ++extern void swiotlb_init(int verbose); ++#ifdef CONFIG_SWIOTLB ++extern void __init swiotlb_free(void); ++#else ++static inline void swiotlb_free(void) { } ++#endif ++extern void swiotlb_print_info(void); ++ ++/* swiotlb-core.c: Internal book-keeping functions. ++ * Must be linked against the library to take advantage of them.*/ ++#ifdef CONFIG_SWIOTLB ++/* ++ * Enumeration for sync targets ++ */ ++enum dma_sync_target { ++ SYNC_FOR_CPU = 0, ++ SYNC_FOR_DEVICE = 1, ++}; ++extern char *io_tlb_start; ++extern char *io_tlb_end; ++extern unsigned long io_tlb_nslabs; ++extern void *io_tlb_overflow_buffer; ++extern unsigned long io_tlb_overflow; ++extern int is_swiotlb_buffer(phys_addr_t paddr); ++extern void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, ++ enum dma_data_direction dir); ++extern void *do_map_single(struct device *hwdev, phys_addr_t phys, ++ unsigned long start_dma_addr, size_t size, int dir); ++ ++extern void do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, ++ int dir); ++ ++extern void do_sync_single(struct device *hwdev, char *dma_addr, size_t size, ++ int dir, int target); ++extern void swiotlb_full(struct device *dev, size_t size, int dir, int do_panic); ++extern void __init swiotlb_init_early(size_t default_size, int verbose); ++#endif ++ ++/* swiotlb.c: dma_ops functions. 
*/ + extern void + *swiotlb_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags); +@@ -88,4 +127,74 @@ swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr); + extern int + swiotlb_dma_supported(struct device *hwdev, u64 mask); + ++/* swiotlb-xen.c: dma_ops functions. */ ++extern void xen_swiotlb_init(int verbose); ++extern void ++*xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, ++ dma_addr_t *dma_handle, gfp_t flags); ++ ++extern void ++xen_swiotlb_free_coherent(struct device *hwdev, size_t size, ++ void *vaddr, dma_addr_t dma_handle); ++ ++extern dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir, ++ struct dma_attrs *attrs); ++extern void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, ++ size_t size, enum dma_data_direction dir, ++ struct dma_attrs *attrs); ++ ++extern int ++xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, ++ int direction); ++ ++extern void ++xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, ++ int direction); ++ ++extern int ++xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, ++ int nelems, enum dma_data_direction dir, ++ struct dma_attrs *attrs); ++ ++extern void ++xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, ++ int nelems, enum dma_data_direction dir, ++ struct dma_attrs *attrs); ++ ++extern void ++xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, ++ size_t size, enum dma_data_direction dir); ++ ++extern void ++xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, ++ int nelems, enum dma_data_direction dir); ++ ++extern void ++xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, ++ size_t size, enum dma_data_direction dir); ++ ++extern void ++xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, ++ int nelems, enum dma_data_direction dir); ++ ++extern void ++xen_swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir); ++ ++extern void ++xen_swiotlb_sync_single_range_for_device(struct device *hwdev, ++ dma_addr_t dev_addr, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir); ++ ++extern int ++xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr); ++ ++extern int ++xen_swiotlb_dma_supported(struct device *hwdev, u64 mask); ++ ++ + #endif /* __LINUX_SWIOTLB_H */ +diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h +index 3c123c3..1a2ba21 100644 +--- a/include/linux/vmalloc.h ++++ b/include/linux/vmalloc.h +@@ -7,6 +7,8 @@ + + struct vm_area_struct; /* vma defining user mapping in mm_types.h */ + ++extern bool vmap_lazy_unmap; ++ + /* bits in flags of vmalloc's vm_struct below */ + #define VM_IOREMAP 0x00000001 /* ioremap() and friends */ + #define VM_ALLOC 0x00000002 /* vmalloc() */ +diff --git a/include/xen/Kbuild b/include/xen/Kbuild +index 4e65c16..84ad8f0 100644 +--- a/include/xen/Kbuild ++++ b/include/xen/Kbuild +@@ -1 +1,2 @@ + header-y += evtchn.h ++header-y += privcmd.h +diff --git a/include/xen/acpi.h b/include/xen/acpi.h +new file mode 100644 +index 0000000..279142d +--- /dev/null ++++ b/include/xen/acpi.h +@@ -0,0 +1,106 @@ ++#ifndef _XEN_ACPI_H ++#define _XEN_ACPI_H ++ ++#include <linux/types.h> ++#include <acpi/acpi_drivers.h> ++#include 
<acpi/processor.h> ++#include <xen/xen.h> ++ ++#ifdef CONFIG_XEN_S3 ++#include <asm/xen/hypervisor.h> ++ ++static inline bool xen_pv_acpi(void) ++{ ++ return xen_pv_domain(); ++} ++#else ++static inline bool xen_pv_acpi(void) ++{ ++ return false; ++} ++#endif ++ ++int acpi_notify_hypervisor_state(u8 sleep_state, ++ u32 pm1a_cnt, u32 pm1b_cnd); ++ ++/* ++ * Following are interfaces for xen acpi processor control ++ */ ++ ++/* Events notified to xen */ ++#define PROCESSOR_PM_INIT 1 ++#define PROCESSOR_PM_CHANGE 2 ++#define PROCESSOR_HOTPLUG 3 ++ ++/* Objects for the PM events */ ++#define PM_TYPE_IDLE 0 ++#define PM_TYPE_PERF 1 ++#define PM_TYPE_THR 2 ++#define PM_TYPE_MAX 3 ++ ++#define XEN_MAX_ACPI_ID 255 ++ ++/* Processor hotplug events */ ++#define HOTPLUG_TYPE_ADD 0 ++#define HOTPLUG_TYPE_REMOVE 1 ++ ++int xen_acpi_processor_init(void); ++void xen_acpi_processor_exit(void); ++ ++int xen_acpi_processor_power_init(struct acpi_processor *pr, ++ struct acpi_device *device); ++int xen_acpi_processor_cst_has_changed(struct acpi_processor *pr); ++ ++void xen_arch_acpi_processor_init_pdc(struct acpi_processor *pr); ++ ++#ifdef CONFIG_CPU_FREQ ++int xen_acpi_processor_ppc_has_changed(struct acpi_processor *pr); ++int xen_acpi_processor_get_performance(struct acpi_processor *pr); ++#else ++static inline int xen_acpi_processor_ppc_has_changed(struct acpi_processor *pr) ++{ ++ return acpi_processor_ppc_has_changed(pr); ++} ++static inline int xen_acpi_processor_get_performance(struct acpi_processor *pr) ++{ ++ printk(KERN_WARNING ++ "Warning: xen_acpi_processor_get_performance not supported\n" ++ "Consider compiling CPUfreq support into your kernel.\n"); ++ return 0; ++} ++#endif ++ ++#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \ ++ defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE) ++int xen_hotadd_memory(struct acpi_memory_device *mem_device); ++#endif ++ ++#if defined(CONFIG_ACPI_PROCESSOR_XEN) || \ ++defined(CONFIG_ACPI_PROCESSOR_XEN_MODULE) ++ ++struct processor_cntl_xen_ops { ++ /* Transfer processor PM events to xen */ ++int (*pm_ops[PM_TYPE_MAX])(struct acpi_processor *pr, int event); ++ /* Notify physical processor status to xen */ ++ int (*hotplug)(struct acpi_processor *pr, int type); ++}; ++ ++extern int processor_cntl_xen_notify(struct acpi_processor *pr, ++ int event, int type); ++extern int processor_cntl_xen_power_cache(int cpu, int cx, ++ struct acpi_power_register *reg); ++#else ++ ++static inline int processor_cntl_xen_notify(struct acpi_processor *pr, ++ int event, int type) ++{ ++ return 0; ++} ++static inline int processor_cntl_xen_power_cache(int cpu, int cx, ++ struct acpi_power_register *reg) ++{ ++ return 0; ++} ++#endif /* CONFIG_ACPI_PROCESSOR_XEN */ ++ ++#endif /* _XEN_ACPI_H */ +diff --git a/include/xen/balloon.h b/include/xen/balloon.h +new file mode 100644 +index 0000000..e751514 +--- /dev/null ++++ b/include/xen/balloon.h +@@ -0,0 +1,8 @@ ++#ifndef _XEN_BALLOON_H ++#define _XEN_BALLOON_H ++ ++/* Allocate/free a set of empty pages in low memory (i.e., no RAM mapped). 
*/ ++struct page **alloc_empty_pages_and_pagevec(int nr_pages); ++void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages); ++ ++#endif +diff --git a/include/xen/blkif.h b/include/xen/blkif.h +new file mode 100644 +index 0000000..7172081 +--- /dev/null ++++ b/include/xen/blkif.h +@@ -0,0 +1,123 @@ ++/* ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ */ ++ ++#ifndef __XEN_BLKIF_H__ ++#define __XEN_BLKIF_H__ ++ ++#include <xen/interface/xen.h> ++#include <xen/interface/io/ring.h> ++#include <xen/interface/io/blkif.h> ++#include <xen/interface/io/protocols.h> ++ ++/* Not a real protocol. Used to generate ring structs which contain ++ * the elements common to all protocols only. This way we get a ++ * compiler-checkable way to use common struct elements, so we can ++ * avoid using switch(protocol) in a number of places. */ ++struct blkif_common_request { ++ char dummy; ++}; ++struct blkif_common_response { ++ char dummy; ++}; ++ ++/* i386 protocol version */ ++#pragma pack(push, 4) ++struct blkif_x86_32_request { ++ uint8_t operation; /* BLKIF_OP_??? */ ++ uint8_t nr_segments; /* number of segments */ ++ blkif_vdev_t handle; /* only for read/write requests */ ++ uint64_t id; /* private guest value, echoed in resp */ ++ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ ++ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; ++}; ++struct blkif_x86_32_response { ++ uint64_t id; /* copied from request */ ++ uint8_t operation; /* copied from request */ ++ int16_t status; /* BLKIF_RSP_??? */ ++}; ++typedef struct blkif_x86_32_request blkif_x86_32_request_t; ++typedef struct blkif_x86_32_response blkif_x86_32_response_t; ++#pragma pack(pop) ++ ++/* x86_64 protocol version */ ++struct blkif_x86_64_request { ++ uint8_t operation; /* BLKIF_OP_??? */ ++ uint8_t nr_segments; /* number of segments */ ++ blkif_vdev_t handle; /* only for read/write requests */ ++ uint64_t __attribute__((__aligned__(8))) id; ++ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ ++ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; ++}; ++struct blkif_x86_64_response { ++ uint64_t __attribute__((__aligned__(8))) id; ++ uint8_t operation; /* copied from request */ ++ int16_t status; /* BLKIF_RSP_??? 
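[The reason the blkif.h hunk above carries two per-ABI request layouts is the differing alignment of the 64-bit id field: 4-byte aligned under the packed i386 convention, 8-byte aligned on x86_64. A compile-time sketch of that layout assumption (BUILD_BUG_ON is the kernel's static assertion; the helper name is illustrative):

#include <linux/kernel.h>
#include <linux/stddef.h>
#include <xen/blkif.h>

static inline void example_blkif_abi_checks(void)
{
	/* i386: 1+1+2 byte fields, and pack(4) caps u64 alignment at 4. */
	BUILD_BUG_ON(offsetof(struct blkif_x86_32_request, id) != 4);
	/* x86_64: the explicit aligned(8) pushes id out to offset 8. */
	BUILD_BUG_ON(offsetof(struct blkif_x86_64_request, id) != 8);
}
]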
*/ ++}; ++typedef struct blkif_x86_64_request blkif_x86_64_request_t; ++typedef struct blkif_x86_64_response blkif_x86_64_response_t; ++ ++DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response); ++DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response); ++DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response); ++ ++union blkif_back_rings { ++ struct blkif_back_ring native; ++ struct blkif_common_back_ring common; ++ struct blkif_x86_32_back_ring x86_32; ++ struct blkif_x86_64_back_ring x86_64; ++}; ++ ++enum blkif_protocol { ++ BLKIF_PROTOCOL_NATIVE = 1, ++ BLKIF_PROTOCOL_X86_32 = 2, ++ BLKIF_PROTOCOL_X86_64 = 3, ++}; ++ ++static void inline blkif_get_x86_32_req(struct blkif_request *dst, struct blkif_x86_32_request *src) ++{ ++ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; ++ dst->operation = src->operation; ++ dst->nr_segments = src->nr_segments; ++ dst->handle = src->handle; ++ dst->id = src->id; ++ dst->sector_number = src->sector_number; ++ barrier(); ++ if (n > dst->nr_segments) ++ n = dst->nr_segments; ++ for (i = 0; i < n; i++) ++ dst->seg[i] = src->seg[i]; ++} ++ ++static void inline blkif_get_x86_64_req(struct blkif_request *dst, struct blkif_x86_64_request *src) ++{ ++ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; ++ dst->operation = src->operation; ++ dst->nr_segments = src->nr_segments; ++ dst->handle = src->handle; ++ dst->id = src->id; ++ dst->sector_number = src->sector_number; ++ barrier(); ++ if (n > dst->nr_segments) ++ n = dst->nr_segments; ++ for (i = 0; i < n; i++) ++ dst->seg[i] = src->seg[i]; ++} ++ ++#endif /* __XEN_BLKIF_H__ */ +diff --git a/include/xen/events.h b/include/xen/events.h +index e68d59a..7e17e2a 100644 +--- a/include/xen/events.h ++++ b/include/xen/events.h +@@ -12,6 +12,8 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn, + irq_handler_t handler, + unsigned long irqflags, const char *devname, + void *dev_id); ++int bind_virq_to_irq(unsigned int virq, unsigned int cpu); ++ + int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, + irq_handler_t handler, + unsigned long irqflags, const char *devname, +@@ -22,6 +24,12 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, + unsigned long irqflags, + const char *devname, + void *dev_id); ++int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, ++ unsigned int remote_port, ++ irq_handler_t handler, ++ unsigned long irqflags, ++ const char *devname, ++ void *dev_id); + + /* + * Common unbind function for all event sources. Takes IRQ to unbind from. +@@ -53,7 +61,42 @@ bool xen_test_irq_pending(int irq); + irq will be disabled so it won't deliver an interrupt. */ + void xen_poll_irq(int irq); + ++/* Poll waiting for an irq to become pending with a timeout. In the usual case, the ++ irq will be disabled so it won't deliver an interrupt. */ ++void xen_poll_irq_timeout(int irq, u64 timeout); ++ + /* Determine the IRQ which is bound to an event channel */ + unsigned irq_from_evtchn(unsigned int evtchn); + ++/* Allocate an irq for a physical interrupt, given a gsi. "Legacy" ++ GSIs are identity mapped; others are dynamically allocated as ++ usual. */ ++int xen_allocate_pirq(unsigned gsi, int shareable, char *name); ++ ++/* De-allocates the above mentioned physical interrupt. 
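[A sketch (backend and handler names are invented) of how a split-driver backend would use the new bind_interdomain_evtchn_to_irqhandler() declared above, once the frontend has written its event-channel port to xenstore:

#include <linux/interrupt.h>
#include <xen/events.h>

static irqreturn_t example_backend_interrupt(int irq, void *dev_id)
{
	/* Wake the thread that drains the shared ring. */
	return IRQ_HANDLED;
}

static int example_connect_evtchn(unsigned int frontend_domid,
				  unsigned int remote_port)
{
	/* Returns the local IRQ on success, or a negative errno. */
	return bind_interdomain_evtchn_to_irqhandler(frontend_domid,
						     remote_port,
						     example_backend_interrupt,
						     0, "example-backend",
						     NULL);
}
]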
*/ ++int xen_destroy_irq(int irq); ++ ++/* Return vector allocated to pirq */ ++int xen_vector_from_irq(unsigned pirq); ++ ++/* Return gsi allocated to pirq */ ++int xen_gsi_from_irq(unsigned pirq); ++ ++#ifdef CONFIG_XEN_DOM0_PCI ++void xen_setup_pirqs(void); ++#else ++static inline void xen_setup_pirqs(void) ++{ ++} ++#endif ++ ++/* Determine whether to ignore this IRQ if passed to a guest. */ ++int xen_ignore_irq(int irq); ++/* Xen HVM evtchn vector callback */ ++extern void xen_hvm_callback_vector(void); ++extern int xen_have_vector_callback; ++int xen_set_callback_via(uint64_t via); ++void xen_evtchn_do_upcall(struct pt_regs *regs); ++void xen_hvm_evtchn_do_upcall(void); ++ + #endif /* _XEN_EVENTS_H */ +diff --git a/include/xen/gntdev.h b/include/xen/gntdev.h +new file mode 100644 +index 0000000..8bd1467 +--- /dev/null ++++ b/include/xen/gntdev.h +@@ -0,0 +1,119 @@ ++/****************************************************************************** ++ * gntdev.h ++ * ++ * Interface to /dev/xen/gntdev. ++ * ++ * Copyright (c) 2007, D G Murray ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#ifndef __LINUX_PUBLIC_GNTDEV_H__ ++#define __LINUX_PUBLIC_GNTDEV_H__ ++ ++struct ioctl_gntdev_grant_ref { ++ /* The domain ID of the grant to be mapped. */ ++ uint32_t domid; ++ /* The grant reference of the grant to be mapped. */ ++ uint32_t ref; ++}; ++ ++/* ++ * Inserts the grant references into the mapping table of an instance ++ * of gntdev. N.B. This does not perform the mapping, which is deferred ++ * until mmap() is called with @index as the offset. ++ */ ++#define IOCTL_GNTDEV_MAP_GRANT_REF \ ++_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref)) ++struct ioctl_gntdev_map_grant_ref { ++ /* IN parameters */ ++ /* The number of grants to be mapped. */ ++ uint32_t count; ++ uint32_t pad; ++ /* OUT parameters */ ++ /* The offset to be used on a subsequent call to mmap(). */ ++ uint64_t index; ++ /* Variable IN parameter. */ ++ /* Array of grant references, of size @count. */ ++ struct ioctl_gntdev_grant_ref refs[1]; ++}; ++ ++/* ++ * Removes the grant references from the mapping table of an instance of ++ * of gntdev. N.B. 
munmap() must be called on the relevant virtual address(es) ++ * before this ioctl is called, or an error will result. ++ */ ++#define IOCTL_GNTDEV_UNMAP_GRANT_REF \ ++_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) ++struct ioctl_gntdev_unmap_grant_ref { ++ /* IN parameters */ ++ /* The offset was returned by the corresponding map operation. */ ++ uint64_t index; ++ /* The number of pages to be unmapped. */ ++ uint32_t count; ++ uint32_t pad; ++}; ++ ++/* ++ * Returns the offset in the driver's address space that corresponds ++ * to @vaddr. This can be used to perform a munmap(), followed by an ++ * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by ++ * the caller. The number of pages that were allocated at the same time as ++ * @vaddr is returned in @count. ++ * ++ * N.B. Where more than one page has been mapped into a contiguous range, the ++ * supplied @vaddr must correspond to the start of the range; otherwise ++ * an error will result. It is only possible to munmap() the entire ++ * contiguously-allocated range at once, and not any subrange thereof. ++ */ ++#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \ ++_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr)) ++struct ioctl_gntdev_get_offset_for_vaddr { ++ /* IN parameters */ ++ /* The virtual address of the first mapped page in a range. */ ++ uint64_t vaddr; ++ /* OUT parameters */ ++ /* The offset that was used in the initial mmap() operation. */ ++ uint64_t offset; ++ /* The number of pages mapped in the VM area that begins at @vaddr. */ ++ uint32_t count; ++ uint32_t pad; ++}; ++ ++/* ++ * Sets the maximum number of grants that may be mapped at once by this gntdev ++ * instance. ++ * ++ * N.B. This must be called before any other ioctl is performed on the device. ++ */ ++#define IOCTL_GNTDEV_SET_MAX_GRANTS \ ++_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants)) ++struct ioctl_gntdev_set_max_grants { ++ /* IN parameter */ ++ /* The maximum number of grants that may be mapped at once. 
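[The ioctl-then-mmap sequence the comments above describe, sketched from userspace with error handling trimmed. The device node path, the page size constant, and the function name are assumptions, as is the availability of this header to userspace builds.

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <xen/gntdev.h>

static void *example_map_one_grant(int fd, uint32_t domid, uint32_t ref)
{
	struct ioctl_gntdev_map_grant_ref op;

	op.count = 1;
	op.refs[0].domid = domid;
	op.refs[0].ref = ref;
	/* Registers the grant; the actual mapping is deferred to mmap(). */
	if (ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &op) < 0)
		return MAP_FAILED;
	/* op.index is the offset to hand back to mmap(); 4096 assumes
	 * a 4 KiB page. */
	return mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
		    fd, op.index);
}

/* Usage: fd = open("/dev/xen/gntdev", O_RDWR); then call the helper. */
]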
*/ ++ uint32_t count; ++}; ++ ++#endif /* __LINUX_PUBLIC_GNTDEV_H__ */ +diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h +index a40f1cd..871b553 100644 +--- a/include/xen/grant_table.h ++++ b/include/xen/grant_table.h +@@ -37,10 +37,16 @@ + #ifndef __ASM_GNTTAB_H__ + #define __ASM_GNTTAB_H__ + +-#include <asm/xen/hypervisor.h> ++#include <asm/page.h> ++ ++#include <xen/interface/xen.h> + #include <xen/interface/grant_table.h> ++ ++#include <asm/xen/hypervisor.h> + #include <asm/xen/grant_table.h> + ++#include <xen/features.h> ++ + /* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */ + #define NR_GRANT_FRAMES 4 + +@@ -51,6 +57,9 @@ struct gnttab_free_callback { + u16 count; + }; + ++void gnttab_reset_grant_page(struct page *page); ++ ++int gnttab_init(void); + int gnttab_suspend(void); + int gnttab_resume(void); + +@@ -80,6 +89,8 @@ unsigned long gnttab_end_foreign_transfer(grant_ref_t ref); + + int gnttab_query_foreign_access(grant_ref_t ref); + ++int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep); ++ + /* + * operations on reserved batches of grant references + */ +@@ -106,12 +117,46 @@ void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, + void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid, + unsigned long pfn); + ++static inline void ++gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr, ++ uint32_t flags, grant_ref_t ref, domid_t domid) ++{ ++ if (flags & GNTMAP_contains_pte) ++ map->host_addr = addr; ++ else if (xen_feature(XENFEAT_auto_translated_physmap)) ++ map->host_addr = __pa(addr); ++ else ++ map->host_addr = addr; ++ ++ map->flags = flags; ++ map->ref = ref; ++ map->dom = domid; ++} ++ ++static inline void ++gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr, ++ uint32_t flags, grant_handle_t handle) ++{ ++ if (flags & GNTMAP_contains_pte) ++ unmap->host_addr = addr; ++ else if (xen_feature(XENFEAT_auto_translated_physmap)) ++ unmap->host_addr = __pa(addr); ++ else ++ unmap->host_addr = addr; ++ ++ unmap->handle = handle; ++ unmap->dev_bus_addr = 0; ++} ++ + int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, + unsigned long max_nr_gframes, + struct grant_entry **__shared); + void arch_gnttab_unmap_shared(struct grant_entry *shared, + unsigned long nr_gframes); + ++extern unsigned long xen_hvm_resume_frames; ++unsigned int gnttab_max_grant_frames(void); ++ + #define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr)) + + #endif /* __ASM_GNTTAB_H__ */ +diff --git a/include/xen/hvm.h b/include/xen/hvm.h +new file mode 100644 +index 0000000..b193fa2 +--- /dev/null ++++ b/include/xen/hvm.h +@@ -0,0 +1,30 @@ ++/* Simple wrappers around HVM functions */ ++#ifndef XEN_HVM_H__ ++#define XEN_HVM_H__ ++ ++#include <xen/interface/hvm/params.h> ++#include <asm/xen/hypercall.h> ++ ++static inline int hvm_get_parameter(int idx, uint64_t *value) ++{ ++ struct xen_hvm_param xhv; ++ int r; ++ ++ xhv.domid = DOMID_SELF; ++ xhv.index = idx; ++ r = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv); ++ if (r < 0) { ++ printk(KERN_ERR "Cannot get hvm parameter %d: %d!\n", ++ idx, r); ++ return r; ++ } ++ *value = xhv.value; ++ return r; ++} ++ ++#define HVM_CALLBACK_VIA_TYPE_VECTOR 0x2 ++#define HVM_CALLBACK_VIA_TYPE_SHIFT 56 ++#define HVM_CALLBACK_VECTOR(x) (((uint64_t)HVM_CALLBACK_VIA_TYPE_VECTOR)<<\ ++ HVM_CALLBACK_VIA_TYPE_SHIFT | (x)) ++ ++#endif /* XEN_HVM_H__ */ +diff --git a/include/xen/interface/features.h b/include/xen/interface/features.h +index 
f51b641..70d2563 100644 +--- a/include/xen/interface/features.h ++++ b/include/xen/interface/features.h +@@ -41,6 +41,12 @@ + /* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? */ + #define XENFEAT_mmu_pt_update_preserve_ad 5 + ++/* x86: Does this Xen host support the HVM callback vector type? */ ++#define XENFEAT_hvm_callback_vector 8 ++ ++/* x86: pvclock algorithm is safe to use on HVM */ ++#define XENFEAT_hvm_safe_pvclock 9 ++ + #define XENFEAT_NR_SUBMAPS 1 + + #endif /* __XEN_PUBLIC_FEATURES_H__ */ +diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h +index 39da93c..c704fe5 100644 +--- a/include/xen/interface/grant_table.h ++++ b/include/xen/interface/grant_table.h +@@ -28,6 +28,7 @@ + #ifndef __XEN_PUBLIC_GRANT_TABLE_H__ + #define __XEN_PUBLIC_GRANT_TABLE_H__ + ++#include <xen/interface/xen.h> + + /*********************************** + * GRANT TABLE REPRESENTATION +@@ -321,6 +322,28 @@ struct gnttab_query_size { + DEFINE_GUEST_HANDLE_STRUCT(gnttab_query_size); + + /* ++ * GNTTABOP_unmap_and_replace: Destroy one or more grant-reference mappings ++ * tracked by <handle> but atomically replace the page table entry with one ++ * pointing to the machine address under <new_addr>. <new_addr> will be ++ * redirected to the null entry. ++ * NOTES: ++ * 1. The call may fail in an undefined manner if either mapping is not ++ * tracked by <handle>. ++ * 2. After executing a batch of unmaps, it is guaranteed that no stale ++ * mappings will remain in the device or host TLBs. ++ */ ++#define GNTTABOP_unmap_and_replace 7 ++struct gnttab_unmap_and_replace { ++ /* IN parameters. */ ++ uint64_t host_addr; ++ uint64_t new_addr; ++ grant_handle_t handle; ++ /* OUT parameters. */ ++ int16_t status; /* GNTST_* */ ++}; ++DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_and_replace); ++ ++/* + * Bitfield values for update_pin_status.flags. + */ + /* Map the grant entry for access by I/O devices. */ +diff --git a/include/xen/interface/hvm/hvm_op.h b/include/xen/interface/hvm/hvm_op.h +new file mode 100644 +index 0000000..a4827f4 +--- /dev/null ++++ b/include/xen/interface/hvm/hvm_op.h +@@ -0,0 +1,46 @@ ++/* ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ */ ++ ++#ifndef __XEN_PUBLIC_HVM_HVM_OP_H__ ++#define __XEN_PUBLIC_HVM_HVM_OP_H__ ++ ++/* Get/set subcommands: the second argument of the hypercall is a ++ * pointer to a xen_hvm_param struct. 
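[A sketch of how a PV-on-HVM guest could use the hvm_get_parameter() wrapper from xen/hvm.h above to locate xenstore; HVM_PARAM_STORE_PFN and HVM_PARAM_STORE_EVTCHN are defined in the params.h header added further down in this patch, and the function name is invented.

#include <xen/hvm.h>
#include <xen/interface/hvm/params.h>

static int example_locate_xenstore(uint64_t *pfn, uint64_t *evtchn)
{
	int err;

	/* Both parameters are maintained by the toolstack for HVM guests. */
	err = hvm_get_parameter(HVM_PARAM_STORE_PFN, pfn);
	if (!err)
		err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, evtchn);
	return err;
}
]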
*/ ++#define HVMOP_set_param 0 ++#define HVMOP_get_param 1 ++struct xen_hvm_param { ++ domid_t domid; /* IN */ ++ uint32_t index; /* IN */ ++ uint64_t value; /* IN/OUT */ ++}; ++DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_param); ++ ++/* Hint from PV drivers for pagetable destruction. */ ++#define HVMOP_pagetable_dying 9 ++struct xen_hvm_pagetable_dying { ++ /* Domain with a pagetable about to be destroyed. */ ++ domid_t domid; ++ /* guest physical address of the toplevel pagetable dying */ ++ aligned_u64 gpa; ++}; ++typedef struct xen_hvm_pagetable_dying xen_hvm_pagetable_dying_t; ++DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_pagetable_dying_t); ++ ++#endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */ +diff --git a/include/xen/interface/hvm/params.h b/include/xen/interface/hvm/params.h +new file mode 100644 +index 0000000..1888d8c +--- /dev/null ++++ b/include/xen/interface/hvm/params.h +@@ -0,0 +1,95 @@ ++/* ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ */ ++ ++#ifndef __XEN_PUBLIC_HVM_PARAMS_H__ ++#define __XEN_PUBLIC_HVM_PARAMS_H__ ++ ++#include "hvm_op.h" ++ ++/* ++ * Parameter space for HVMOP_{set,get}_param. ++ */ ++ ++/* ++ * How should CPU0 event-channel notifications be delivered? ++ * val[63:56] == 0: val[55:0] is a delivery GSI (Global System Interrupt). ++ * val[63:56] == 1: val[55:0] is a delivery PCI INTx line, as follows: ++ * Domain = val[47:32], Bus = val[31:16], ++ * DevFn = val[15: 8], IntX = val[ 1: 0] ++ * val[63:56] == 2: val[7:0] is a vector number. ++ * If val == 0 then CPU0 event-channel notifications are not delivered. ++ */ ++#define HVM_PARAM_CALLBACK_IRQ 0 ++ ++#define HVM_PARAM_STORE_PFN 1 ++#define HVM_PARAM_STORE_EVTCHN 2 ++ ++#define HVM_PARAM_PAE_ENABLED 4 ++ ++#define HVM_PARAM_IOREQ_PFN 5 ++ ++#define HVM_PARAM_BUFIOREQ_PFN 6 ++ ++/* ++ * Set mode for virtual timers (currently x86 only): ++ * delay_for_missed_ticks (default): ++ * Do not advance a vcpu's time beyond the correct delivery time for ++ * interrupts that have been missed due to preemption. Deliver missed ++ * interrupts when the vcpu is rescheduled and advance the vcpu's virtual ++ * time stepwise for each one. ++ * no_delay_for_missed_ticks: ++ * As above, missed interrupts are delivered, but guest time always tracks ++ * wallclock (i.e., real) time while doing so. ++ * no_missed_ticks_pending: ++ * No missed interrupts are held pending. 
Instead, to ensure ticks are ++ * delivered at some non-zero rate, if we detect missed ticks then the ++ * internal tick alarm is not disabled if the VCPU is preempted during the ++ * next tick period. ++ * one_missed_tick_pending: ++ * Missed interrupts are collapsed together and delivered as one 'late tick'. ++ * Guest time always tracks wallclock (i.e., real) time. ++ */ ++#define HVM_PARAM_TIMER_MODE 10 ++#define HVMPTM_delay_for_missed_ticks 0 ++#define HVMPTM_no_delay_for_missed_ticks 1 ++#define HVMPTM_no_missed_ticks_pending 2 ++#define HVMPTM_one_missed_tick_pending 3 ++ ++/* Boolean: Enable virtual HPET (high-precision event timer)? (x86-only) */ ++#define HVM_PARAM_HPET_ENABLED 11 ++ ++/* Identity-map page directory used by Intel EPT when CR0.PG=0. */ ++#define HVM_PARAM_IDENT_PT 12 ++ ++/* Device Model domain, defaults to 0. */ ++#define HVM_PARAM_DM_DOMAIN 13 ++ ++/* ACPI S state: currently support S0 and S3 on x86. */ ++#define HVM_PARAM_ACPI_S_STATE 14 ++ ++/* TSS used on Intel when CR0.PE=0. */ ++#define HVM_PARAM_VM86_TSS 15 ++ ++/* Boolean: Enable aligning all periodic vpts to reduce interrupts */ ++#define HVM_PARAM_VPT_ALIGN 16 ++ ++#define HVM_NR_PARAMS 17 ++ ++#endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */ +diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h +index c2d1fa4..68dd2b4 100644 +--- a/include/xen/interface/io/blkif.h ++++ b/include/xen/interface/io/blkif.h +@@ -91,4 +91,25 @@ DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response); + #define VDISK_REMOVABLE 0x2 + #define VDISK_READONLY 0x4 + ++/* Xen-defined major numbers for virtual disks, they look strangely ++ * familiar */ ++#define XEN_IDE0_MAJOR 3 ++#define XEN_IDE1_MAJOR 22 ++#define XEN_SCSI_DISK0_MAJOR 8 ++#define XEN_SCSI_DISK1_MAJOR 65 ++#define XEN_SCSI_DISK2_MAJOR 66 ++#define XEN_SCSI_DISK3_MAJOR 67 ++#define XEN_SCSI_DISK4_MAJOR 68 ++#define XEN_SCSI_DISK5_MAJOR 69 ++#define XEN_SCSI_DISK6_MAJOR 70 ++#define XEN_SCSI_DISK7_MAJOR 71 ++#define XEN_SCSI_DISK8_MAJOR 128 ++#define XEN_SCSI_DISK9_MAJOR 129 ++#define XEN_SCSI_DISK10_MAJOR 130 ++#define XEN_SCSI_DISK11_MAJOR 131 ++#define XEN_SCSI_DISK12_MAJOR 132 ++#define XEN_SCSI_DISK13_MAJOR 133 ++#define XEN_SCSI_DISK14_MAJOR 134 ++#define XEN_SCSI_DISK15_MAJOR 135 ++ + #endif /* __XEN_PUBLIC_IO_BLKIF_H__ */ +diff --git a/include/xen/interface/io/netif.h b/include/xen/interface/io/netif.h +index 518481c..8309344 100644 +--- a/include/xen/interface/io/netif.h ++++ b/include/xen/interface/io/netif.h +@@ -131,6 +131,10 @@ struct xen_netif_rx_request { + #define _NETRXF_extra_info (3) + #define NETRXF_extra_info (1U<<_NETRXF_extra_info) + ++/* GSO Prefix descriptor. 
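[Putting HVM_PARAM_CALLBACK_IRQ (described above) together with the HVM_CALLBACK_VECTOR() encoding from xen/hvm.h earlier in this patch, a guest advertising the vector callback would do roughly the following; xen_set_callback_via() in events.h is the patch's real entry point, so this inline expansion is only an illustrative sketch.

#include <xen/hvm.h>
#include <xen/interface/xen.h>

static int example_set_callback_vector(unsigned int vector)
{
	struct xen_hvm_param a;

	a.domid = DOMID_SELF;
	a.index = HVM_PARAM_CALLBACK_IRQ;
	/* Encode "deliver via vector" (type 2 in val[63:56]) plus the
	 * vector number. */
	a.value = HVM_CALLBACK_VECTOR(vector);
	return HYPERVISOR_hvm_op(HVMOP_set_param, &a);
}
]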
*/ ++#define _NETRXF_gso_prefix (4) ++#define NETRXF_gso_prefix (1U<<_NETRXF_gso_prefix) ++ + struct xen_netif_rx_response { + uint16_t id; + uint16_t offset; /* Offset in page of start of received packet */ +diff --git a/include/xen/interface/io/pciif.h b/include/xen/interface/io/pciif.h +new file mode 100644 +index 0000000..c4177f3 +--- /dev/null ++++ b/include/xen/interface/io/pciif.h +@@ -0,0 +1,124 @@ ++/* ++ * PCI Backend/Frontend Common Data Structures & Macros ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++#ifndef __XEN_PCI_COMMON_H__ ++#define __XEN_PCI_COMMON_H__ ++ ++/* Be sure to bump this number if you change this file */ ++#define XEN_PCI_MAGIC "7" ++ ++/* xen_pci_sharedinfo flags */ ++#define _XEN_PCIF_active (0) ++#define XEN_PCIF_active (1<<_XEN_PCIF_active) ++#define _XEN_PCIB_AERHANDLER (1) ++#define XEN_PCIB_AERHANDLER (1<<_XEN_PCIB_AERHANDLER) ++#define _XEN_PCIB_active (2) ++#define XEN_PCIB_active (1<<_XEN_PCIB_active) ++ ++/* xen_pci_op commands */ ++#define XEN_PCI_OP_conf_read (0) ++#define XEN_PCI_OP_conf_write (1) ++#define XEN_PCI_OP_enable_msi (2) ++#define XEN_PCI_OP_disable_msi (3) ++#define XEN_PCI_OP_enable_msix (4) ++#define XEN_PCI_OP_disable_msix (5) ++#define XEN_PCI_OP_aer_detected (6) ++#define XEN_PCI_OP_aer_resume (7) ++#define XEN_PCI_OP_aer_mmio (8) ++#define XEN_PCI_OP_aer_slotreset (9) ++ ++/* xen_pci_op error numbers */ ++#define XEN_PCI_ERR_success (0) ++#define XEN_PCI_ERR_dev_not_found (-1) ++#define XEN_PCI_ERR_invalid_offset (-2) ++#define XEN_PCI_ERR_access_denied (-3) ++#define XEN_PCI_ERR_not_implemented (-4) ++/* XEN_PCI_ERR_op_failed - backend failed to complete the operation */ ++#define XEN_PCI_ERR_op_failed (-5) ++ ++/* ++ * it should be PAGE_SIZE-sizeof(struct xen_pci_op))/sizeof(struct msix_entry)) ++ * Should not exceed 128 ++ */ ++#define SH_INFO_MAX_VEC 128 ++ ++struct xen_msix_entry { ++ uint16_t vector; ++ uint16_t entry; ++}; ++struct xen_pci_op { ++ /* IN: what action to perform: XEN_PCI_OP_* */ ++ uint32_t cmd; ++ ++ /* OUT: will contain an error number (if any) from errno.h */ ++ int32_t err; ++ ++ /* IN: which device to touch */ ++ uint32_t domain; /* PCI Domain/Segment */ ++ uint32_t bus; ++ uint32_t devfn; ++ ++ /* IN: which configuration registers to touch */ ++ int32_t offset; ++ int32_t size; ++ ++ /* IN/OUT: Contains the result after a READ or the value to WRITE */ ++ uint32_t value; ++ /* IN: Contains extra 
infor for this operation */ ++ uint32_t info; ++ /*IN: param for msi-x */ ++ struct xen_msix_entry msix_entries[SH_INFO_MAX_VEC]; ++}; ++ ++/*used for pcie aer handling*/ ++struct xen_pcie_aer_op ++{ ++ ++ /* IN: what action to perform: XEN_PCI_OP_* */ ++ uint32_t cmd; ++ /*IN/OUT: return aer_op result or carry error_detected state as input*/ ++ int32_t err; ++ ++ /* IN: which device to touch */ ++ uint32_t domain; /* PCI Domain/Segment*/ ++ uint32_t bus; ++ uint32_t devfn; ++}; ++struct xen_pci_sharedinfo { ++ /* flags - XEN_PCIF_* */ ++ uint32_t flags; ++ struct xen_pci_op op; ++ struct xen_pcie_aer_op aer_op; ++}; ++ ++#endif /* __XEN_PCI_COMMON_H__ */ ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-set-style: "BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h +index e8cbf43..c9ba846 100644 +--- a/include/xen/interface/io/ring.h ++++ b/include/xen/interface/io/ring.h +@@ -24,8 +24,15 @@ typedef unsigned int RING_IDX; + * A ring contains as many entries as will fit, rounded down to the nearest + * power of two (so we can mask with (size-1) to loop around). + */ +-#define __RING_SIZE(_s, _sz) \ +- (__RD32(((_sz) - (long)&(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0]))) ++#define __CONST_RING_SIZE(_s, _sz) \ ++ (__RD32(((_sz) - offsetof(struct _s##_sring, ring)) / \ ++ sizeof(((struct _s##_sring *)0)->ring[0]))) ++ ++/* ++ * The same for passing in an actual pointer instead of a name tag. ++ */ ++#define __RING_SIZE(_s, _sz) \ ++ (__RD32(((_sz) - (long)&(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0]))) + + /* + * Macros to make the correct C datatypes for a new kind of ring. +@@ -73,7 +80,16 @@ union __name##_sring_entry { \ + struct __name##_sring { \ + RING_IDX req_prod, req_event; \ + RING_IDX rsp_prod, rsp_event; \ +- uint8_t pad[48]; \ ++ union { \ ++ struct { \ ++ uint8_t smartpoll_active; \ ++ } netif; \ ++ struct { \ ++ uint8_t msg; \ ++ } tapif_user; \ ++ uint8_t pvt_pad[4]; \ ++ } private; \ ++ uint8_t pad[44]; \ + union __name##_sring_entry ring[1]; /* variable-length */ \ + }; \ + \ +diff --git a/include/xen/interface/io/xenbus.h b/include/xen/interface/io/xenbus.h +index 46508c7..9fda532 100644 +--- a/include/xen/interface/io/xenbus.h ++++ b/include/xen/interface/io/xenbus.h +@@ -27,8 +27,14 @@ enum xenbus_state + XenbusStateClosing = 5, /* The device is being closed + due to an error or an unplug + event. */ +- XenbusStateClosed = 6 ++ XenbusStateClosed = 6, + ++ /* ++ * Reconfiguring: The device is being reconfigured. ++ */ ++ XenbusStateReconfiguring = 7, ++ ++ XenbusStateReconfigured = 8 + }; + + #endif /* _XEN_PUBLIC_IO_XENBUS_H */ +diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h +index af36ead..aa4e368 100644 +--- a/include/xen/interface/memory.h ++++ b/include/xen/interface/memory.h +@@ -9,6 +9,8 @@ + #ifndef __XEN_PUBLIC_MEMORY_H__ + #define __XEN_PUBLIC_MEMORY_H__ + ++#include <linux/spinlock.h> ++ + /* + * Increase or decrease the specified domain's memory reservation. Returns a + * -ve errcode on failure, or the # extents successfully allocated or freed. +@@ -53,6 +55,48 @@ struct xen_memory_reservation { + DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation); + + /* ++ * An atomic exchange of memory pages. If return code is zero then ++ * @out.extent_list provides GMFNs of the newly-allocated memory. ++ * Returns zero on complete success, otherwise a negative error code. 
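[The point of the new __CONST_RING_SIZE() in the ring.h hunk above is that, unlike the pointer-based __RING_SIZE(), it is a constant expression and can size arrays at compile time. A sketch of the sizing idiom a backend might use (the macro and array names are invented):

#include <linux/mm.h>	/* PAGE_SIZE */
#include <xen/interface/io/ring.h>
#include <xen/interface/io/blkif.h>

/* Compile-time capacity of a one-page blkif shared ring. */
#define EXAMPLE_BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)

/* Legal only because the size is a constant expression: */
static unsigned long example_inflight_ids[EXAMPLE_BLK_RING_SIZE];
]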
++ * On complete success then always @nr_exchanged == @in.nr_extents. ++ * On partial success @nr_exchanged indicates how much work was done. ++ */ ++#define XENMEM_exchange 11 ++struct xen_memory_exchange { ++ /* ++ * [IN] Details of memory extents to be exchanged (GMFN bases). ++ * Note that @in.address_bits is ignored and unused. ++ */ ++ struct xen_memory_reservation in; ++ ++ /* ++ * [IN/OUT] Details of new memory extents. ++ * We require that: ++ * 1. @in.domid == @out.domid ++ * 2. @in.nr_extents << @in.extent_order == ++ * @out.nr_extents << @out.extent_order ++ * 3. @in.extent_start and @out.extent_start lists must not overlap ++ * 4. @out.extent_start lists GPFN bases to be populated ++ * 5. @out.extent_start is overwritten with allocated GMFN bases ++ */ ++ struct xen_memory_reservation out; ++ ++ /* ++ * [OUT] Number of input extents that were successfully exchanged: ++ * 1. The first @nr_exchanged input extents were successfully ++ * deallocated. ++ * 2. The corresponding first entries in the output extent list correctly ++ * indicate the GMFNs that were successfully exchanged. ++ * 3. All other input and output extents are untouched. ++ * 4. If not all input exents are exchanged then the return code of this ++ * command will be non-zero. ++ * 5. THIS FIELD MUST BE INITIALISED TO ZERO BY THE CALLER! ++ */ ++ unsigned long nr_exchanged; ++}; ++ ++DEFINE_GUEST_HANDLE_STRUCT(xen_memory_exchange); ++/* + * Returns the maximum machine frame number of mapped RAM in this system. + * This command always succeeds (it never returns an error code). + * arg == NULL. +@@ -97,6 +141,19 @@ struct xen_machphys_mfn_list { + DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list); + + /* ++ * Returns the location in virtual address space of the machine_to_phys ++ * mapping table. Architectures which do not have a m2p table, or which do not ++ * map it by default into guest address space, do not implement this command. ++ * arg == addr of xen_machphys_mapping_t. ++ */ ++#define XENMEM_machphys_mapping 12 ++struct xen_machphys_mapping { ++ unsigned long v_start, v_end; /* Start and end virtual addresses. */ ++ unsigned long max_mfn; /* Maximum MFN that can be looked up. */ ++}; ++DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mapping_t); ++ ++/* + * Sets the GPFN at which a particular page appears in the specified guest's + * pseudophysical address space. + * arg == addr of xen_add_to_physmap_t. +@@ -142,4 +199,38 @@ struct xen_translate_gpfn_list { + }; + DEFINE_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list); + ++/* ++ * Returns the pseudo-physical memory map as it was when the domain ++ * was started (specified by XENMEM_set_memory_map). ++ * arg == addr of struct xen_memory_map. ++ */ ++#define XENMEM_memory_map 9 ++struct xen_memory_map { ++ /* ++ * On call the number of entries which can be stored in buffer. On ++ * return the number of entries which have been stored in ++ * buffer. ++ */ ++ unsigned int nr_entries; ++ ++ /* ++ * Entries in the buffer are in the same format as returned by the ++ * BIOS INT 0x15 EAX=0xE820 call. ++ */ ++ GUEST_HANDLE(void) buffer; ++}; ++DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map); ++ ++/* ++ * Returns the real physical memory map. Passes the same structure as ++ * XENMEM_memory_map. ++ * arg == addr of struct xen_memory_map. ++ */ ++#define XENMEM_machine_memory_map 10 ++ ++/* ++ * Prevent the balloon driver from changing the memory reservation ++ * during a driver critical region. 
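[A sketch of a dom0 caller retrieving the E820-format map through XENMEM_memory_map as described above, assuming the existing HYPERVISOR_memory_op() wrapper and struct e820entry from asm/e820.h; the function name is invented.

#include <asm/e820.h>
#include <asm/xen/hypercall.h>
#include <xen/interface/memory.h>

static int example_fetch_memory_map(struct e820entry *entries,
				    unsigned int *nr_entries)
{
	struct xen_memory_map memmap;
	int rc;

	memmap.nr_entries = *nr_entries;	/* buffer capacity in */
	set_xen_guest_handle(memmap.buffer, entries);
	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
	if (rc == 0)
		*nr_entries = memmap.nr_entries;	/* entry count out */
	return rc;
}
]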
++ */ ++extern spinlock_t xen_reservation_lock; + #endif /* __XEN_PUBLIC_MEMORY_H__ */ +diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h +index cd69391..0703ef6 100644 +--- a/include/xen/interface/physdev.h ++++ b/include/xen/interface/physdev.h +@@ -39,6 +39,19 @@ struct physdev_eoi { + }; + + /* ++ * Register a shared page for the hypervisor to indicate whether the guest ++ * must issue PHYSDEVOP_eoi. The semantics of PHYSDEVOP_eoi change slightly ++ * once the guest used this function in that the associated event channel ++ * will automatically get unmasked. The page registered is used as a bit ++ * array indexed by Xen's PIRQ value. ++ */ ++#define PHYSDEVOP_pirq_eoi_gmfn 17 ++struct physdev_pirq_eoi_gmfn { ++ /* IN */ ++ unsigned long gmfn; ++}; ++ ++/* + * Query the status of an IRQ line. + * @arg == pointer to physdev_irq_status_query structure. + */ +@@ -106,6 +119,64 @@ struct physdev_irq { + uint32_t vector; + }; + ++#define MAP_PIRQ_TYPE_MSI 0x0 ++#define MAP_PIRQ_TYPE_GSI 0x1 ++#define MAP_PIRQ_TYPE_UNKNOWN 0x2 ++ ++#define PHYSDEVOP_map_pirq 13 ++struct physdev_map_pirq { ++ domid_t domid; ++ /* IN */ ++ int type; ++ /* IN */ ++ int index; ++ /* IN or OUT */ ++ int pirq; ++ /* IN */ ++ int bus; ++ /* IN */ ++ int devfn; ++ /* IN */ ++ int entry_nr; ++ /* IN */ ++ uint64_t table_base; ++}; ++ ++#define PHYSDEVOP_unmap_pirq 14 ++struct physdev_unmap_pirq { ++ domid_t domid; ++ /* IN */ ++ int pirq; ++}; ++ ++#define PHYSDEVOP_manage_pci_add 15 ++#define PHYSDEVOP_manage_pci_remove 16 ++struct physdev_manage_pci { ++ /* IN */ ++ uint8_t bus; ++ uint8_t devfn; ++}; ++ ++#define PHYSDEVOP_restore_msi 19 ++struct physdev_restore_msi { ++ /* IN */ ++ uint8_t bus; ++ uint8_t devfn; ++}; ++ ++#define PHYSDEVOP_manage_pci_add_ext 20 ++struct physdev_manage_pci_ext { ++ /* IN */ ++ uint8_t bus; ++ uint8_t devfn; ++ unsigned is_extfn; ++ unsigned is_virtfn; ++ struct { ++ uint8_t bus; ++ uint8_t devfn; ++ } physfn; ++}; ++ + /* + * Argument to physdev_op_compat() hypercall. Superceded by new physdev_op() + * hypercall since 0x00030202. +@@ -121,6 +192,16 @@ struct physdev_op { + } u; + }; + ++#define PHYSDEVOP_setup_gsi 21 ++struct physdev_setup_gsi { ++ int gsi; ++ /* IN */ ++ uint8_t triggering; ++ /* IN */ ++ uint8_t polarity; ++ /* IN */ ++}; ++ + /* + * Notify that some PIRQ-bound event channels have been unmasked. + * ** This command is obsolete since interface version 0x00030202 and is ** +diff --git a/include/xen/interface/platform.h b/include/xen/interface/platform.h +new file mode 100644 +index 0000000..17ae622 +--- /dev/null ++++ b/include/xen/interface/platform.h +@@ -0,0 +1,381 @@ ++/****************************************************************************** ++ * platform.h ++ * ++ * Hardware platform operations. Intended for use by domain-0 kernel. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. 
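[A condensed sketch of what a dom0 MSI path might do with PHYSDEVOP_map_pirq defined above, modelled on this patch's Xen PCI code; the wrapper name is invented.

#include <linux/pci.h>
#include <asm/xen/hypercall.h>
#include <xen/interface/xen.h>
#include <xen/interface/physdev.h>

static int example_map_msi_pirq(struct pci_dev *dev)
{
	struct physdev_map_pirq map_irq = {
		.domid = DOMID_SELF,
		.type = MAP_PIRQ_TYPE_MSI,
		.index = -1,		/* let Xen pick */
		.pirq = -1,		/* let Xen pick */
		.bus = dev->bus->number,
		.devfn = dev->devfn,
		.entry_nr = 1,
	};
	int rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);

	/* On success Xen fills in the allocated pirq. */
	return rc ? rc : map_irq.pirq;
}
]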
++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ * ++ * Copyright (c) 2002-2006, K Fraser ++ */ ++ ++#ifndef __XEN_PUBLIC_PLATFORM_H__ ++#define __XEN_PUBLIC_PLATFORM_H__ ++ ++#include "xen.h" ++ ++#define XENPF_INTERFACE_VERSION 0x03000001 ++ ++/* ++ * Set clock such that it would read <secs,nsecs> after 00:00:00 UTC, ++ * 1 January, 1970 if the current system time was <system_time>. ++ */ ++#define XENPF_settime 17 ++struct xenpf_settime { ++ /* IN variables. */ ++ uint32_t secs; ++ uint32_t nsecs; ++ uint64_t system_time; ++}; ++typedef struct xenpf_settime xenpf_settime_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_settime_t); ++ ++/* ++ * Request memory range (@mfn, @mfn+@nr_mfns-1) to have type @type. ++ * On x86, @type is an architecture-defined MTRR memory type. ++ * On success, returns the MTRR that was used (@reg) and a handle that can ++ * be passed to XENPF_DEL_MEMTYPE to accurately tear down the new setting. ++ * (x86-specific). ++ */ ++#define XENPF_add_memtype 31 ++struct xenpf_add_memtype { ++ /* IN variables. */ ++ unsigned long mfn; ++ uint64_t nr_mfns; ++ uint32_t type; ++ /* OUT variables. */ ++ uint32_t handle; ++ uint32_t reg; ++}; ++typedef struct xenpf_add_memtype xenpf_add_memtype_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_add_memtype_t); ++ ++/* ++ * Tear down an existing memory-range type. If @handle is remembered then it ++ * should be passed in to accurately tear down the correct setting (in case ++ * of overlapping memory regions with differing types). If it is not known ++ * then @handle should be set to zero. In all cases @reg must be set. ++ * (x86-specific). ++ */ ++#define XENPF_del_memtype 32 ++struct xenpf_del_memtype { ++ /* IN variables. */ ++ uint32_t handle; ++ uint32_t reg; ++}; ++typedef struct xenpf_del_memtype xenpf_del_memtype_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_del_memtype_t); ++ ++/* Read current type of an MTRR (x86-specific). */ ++#define XENPF_read_memtype 33 ++struct xenpf_read_memtype { ++ /* IN variables. */ ++ uint32_t reg; ++ /* OUT variables. */ ++ unsigned long mfn; ++ uint64_t nr_mfns; ++ uint32_t type; ++}; ++typedef struct xenpf_read_memtype xenpf_read_memtype_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_read_memtype_t); ++ ++#define XENPF_microcode_update 35 ++struct xenpf_microcode_update { ++ /* IN variables. */ ++ GUEST_HANDLE(void) data; /* Pointer to microcode data */ ++ uint32_t length; /* Length of microcode data. */ ++}; ++typedef struct xenpf_microcode_update xenpf_microcode_update_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_microcode_update_t); ++ ++#define XENPF_platform_quirk 39 ++#define QUIRK_NOIRQBALANCING 1 /* Do not restrict IO-APIC RTE targets */ ++#define QUIRK_IOAPIC_BAD_REGSEL 2 /* IO-APIC REGSEL forgets its value */ ++#define QUIRK_IOAPIC_GOOD_REGSEL 3 /* IO-APIC REGSEL behaves properly */ ++struct xenpf_platform_quirk { ++ /* IN variables. 
*/ ++ uint32_t quirk_id; ++}; ++typedef struct xenpf_platform_quirk xenpf_platform_quirk_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_platform_quirk_t); ++ ++#define XENPF_firmware_info 50 ++#define XEN_FW_DISK_INFO 1 /* from int 13 AH=08/41/48 */ ++#define XEN_FW_DISK_MBR_SIGNATURE 2 /* from MBR offset 0x1b8 */ ++#define XEN_FW_VBEDDC_INFO 3 /* from int 10 AX=4f15 */ ++struct xenpf_firmware_info { ++ /* IN variables. */ ++ uint32_t type; ++ uint32_t index; ++ /* OUT variables. */ ++ union { ++ struct { ++ /* Int13, Fn48: Check Extensions Present. */ ++ uint8_t device; /* %dl: bios device number */ ++ uint8_t version; /* %ah: major version */ ++ uint16_t interface_support; /* %cx: support bitmap */ ++ /* Int13, Fn08: Legacy Get Device Parameters. */ ++ uint16_t legacy_max_cylinder; /* %cl[7:6]:%ch: max cyl # */ ++ uint8_t legacy_max_head; /* %dh: max head # */ ++ uint8_t legacy_sectors_per_track; /* %cl[5:0]: max sector # */ ++ /* Int13, Fn41: Get Device Parameters (as filled into %ds:%esi). */ ++ /* NB. First uint16_t of buffer must be set to buffer size. */ ++ GUEST_HANDLE(void) edd_params; ++ } disk_info; /* XEN_FW_DISK_INFO */ ++ struct { ++ uint8_t device; /* bios device number */ ++ uint32_t mbr_signature; /* offset 0x1b8 in mbr */ ++ } disk_mbr_signature; /* XEN_FW_DISK_MBR_SIGNATURE */ ++ struct { ++ /* Int10, AX=4F15: Get EDID info. */ ++ uint8_t capabilities; ++ uint8_t edid_transfer_time; ++ /* must refer to 128-byte buffer */ ++ GUEST_HANDLE(uchar) edid; ++ } vbeddc_info; /* XEN_FW_VBEDDC_INFO */ ++ } u; ++}; ++typedef struct xenpf_firmware_info xenpf_firmware_info_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_firmware_info_t); ++ ++#define XENPF_enter_acpi_sleep 51 ++struct xenpf_enter_acpi_sleep { ++ /* IN variables */ ++ uint16_t pm1a_cnt_val; /* PM1a control value. */ ++ uint16_t pm1b_cnt_val; /* PM1b control value. */ ++ uint32_t sleep_state; /* Which state to enter (Sn). */ ++ uint32_t flags; /* Must be zero. */ ++}; ++typedef struct xenpf_enter_acpi_sleep xenpf_enter_acpi_sleep_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_enter_acpi_sleep_t); ++ ++#define XENPF_change_freq 52 ++struct xenpf_change_freq { ++ /* IN variables */ ++ uint32_t flags; /* Must be zero. */ ++ uint32_t cpu; /* Physical cpu. */ ++ uint64_t freq; /* New frequency (Hz). */ ++}; ++typedef struct xenpf_change_freq xenpf_change_freq_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_change_freq_t); ++ ++/* ++ * Get idle times (nanoseconds since boot) for physical CPUs specified in the ++ * @cpumap_bitmap with range [0..@cpumap_nr_cpus-1]. The @idletime array is ++ * indexed by CPU number; only entries with the corresponding @cpumap_bitmap ++ * bit set are written to. On return, @cpumap_bitmap is modified so that any ++ * non-existent CPUs are cleared. Such CPUs have their @idletime array entry ++ * cleared. ++ */ ++#define XENPF_getidletime 53 ++struct xenpf_getidletime { ++ /* IN/OUT variables */ ++ /* IN: CPUs to interrogate; OUT: subset of IN which are present */ ++ GUEST_HANDLE(uchar) cpumap_bitmap; ++ /* IN variables */ ++ /* Size of cpumap bitmap. */ ++ uint32_t cpumap_nr_cpus; ++ /* Must be indexable for every cpu in cpumap_bitmap. */ ++ GUEST_HANDLE(uint64_t) idletime; ++ /* OUT variables */ ++ /* System time when the idletime snapshots were taken. 
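[A sketch of the dom0 S3 handoff that XENPF_enter_acpi_sleep above enables, roughly what acpi_notify_hypervisor_state() (declared in xen/acpi.h earlier in this patch) boils down to. HYPERVISOR_dom0_op() is assumed here to be the platform-op hypercall wrapper this patch provides, and the function name is invented.

#include <xen/interface/platform.h>

static int example_enter_acpi_sleep(u8 sleep_state, u16 pm1a, u16 pm1b)
{
	struct xen_platform_op op = {
		.cmd = XENPF_enter_acpi_sleep,
		.interface_version = XENPF_INTERFACE_VERSION,
		.u.enter_acpi_sleep = {
			.pm1a_cnt_val = pm1a,
			.pm1b_cnt_val = pm1b,
			.sleep_state = sleep_state,
			/* flags must be zero per the definition above */
		},
	};
	/* Xen, not the dom0 kernel, performs the final port writes for S3. */
	return HYPERVISOR_dom0_op(&op);
}
]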
*/ ++ uint64_t now; ++}; ++typedef struct xenpf_getidletime xenpf_getidletime_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_getidletime_t); ++ ++#define XENPF_set_processor_pminfo 54 ++ ++/* ability bits */ ++#define XEN_PROCESSOR_PM_CX 1 ++#define XEN_PROCESSOR_PM_PX 2 ++#define XEN_PROCESSOR_PM_TX 4 ++ ++/* cmd type */ ++#define XEN_PM_CX 0 ++#define XEN_PM_PX 1 ++#define XEN_PM_TX 2 ++ ++/* Px sub info type */ ++#define XEN_PX_PCT 1 ++#define XEN_PX_PSS 2 ++#define XEN_PX_PPC 4 ++#define XEN_PX_PSD 8 ++ ++struct xen_power_register { ++ uint32_t space_id; ++ uint32_t bit_width; ++ uint32_t bit_offset; ++ uint32_t access_size; ++ uint64_t address; ++}; ++ ++struct xen_processor_csd { ++ uint32_t domain; /* domain number of one dependent group */ ++ uint32_t coord_type; /* coordination type */ ++ uint32_t num; /* number of processors in same domain */ ++}; ++typedef struct xen_processor_csd xen_processor_csd_t; ++DEFINE_GUEST_HANDLE_STRUCT(xen_processor_csd); ++ ++struct xen_processor_cx { ++ struct xen_power_register reg; /* GAS for Cx trigger register */ ++ uint8_t type; /* cstate value, c0: 0, c1: 1, ... */ ++ uint32_t latency; /* worst latency (ms) to enter/exit this cstate */ ++ uint32_t power; /* average power consumption(mW) */ ++ uint32_t dpcnt; /* number of dependency entries */ ++ GUEST_HANDLE(xen_processor_csd) dp; /* NULL if no dependency */ ++}; ++typedef struct xen_processor_cx xen_processor_cx_t; ++DEFINE_GUEST_HANDLE_STRUCT(xen_processor_cx); ++ ++struct xen_processor_flags { ++ uint32_t bm_control:1; ++ uint32_t bm_check:1; ++ uint32_t has_cst:1; ++ uint32_t power_setup_done:1; ++ uint32_t bm_rld_set:1; ++}; ++ ++struct xen_processor_power { ++ uint32_t count; /* number of C state entries in array below */ ++ struct xen_processor_flags flags; /* global flags of this processor */ ++ GUEST_HANDLE(xen_processor_cx) states; /* supported c states */ ++}; ++ ++struct xen_pct_register { ++ uint8_t descriptor; ++ uint16_t length; ++ uint8_t space_id; ++ uint8_t bit_width; ++ uint8_t bit_offset; ++ uint8_t reserved; ++ uint64_t address; ++}; ++ ++struct xen_processor_px { ++ uint64_t core_frequency; /* megahertz */ ++ uint64_t power; /* milliWatts */ ++ uint64_t transition_latency; /* microseconds */ ++ uint64_t bus_master_latency; /* microseconds */ ++ uint64_t control; /* control value */ ++ uint64_t status; /* success indicator */ ++}; ++typedef struct xen_processor_px xen_processor_px_t; ++DEFINE_GUEST_HANDLE_STRUCT(xen_processor_px); ++ ++struct xen_psd_package { ++ uint64_t num_entries; ++ uint64_t revision; ++ uint64_t domain; ++ uint64_t coord_type; ++ uint64_t num_processors; ++}; ++ ++struct xen_processor_performance { ++ uint32_t flags; /* flag for Px sub info type */ ++ uint32_t platform_limit; /* Platform limitation on freq usage */ ++ struct xen_pct_register control_register; ++ struct xen_pct_register status_register; ++ uint32_t state_count; /* total available performance states */ ++ GUEST_HANDLE(xen_processor_px) states; ++ struct xen_psd_package domain_info; ++ uint32_t shared_type; /* coordination type of this processor */ ++}; ++typedef struct xen_processor_performance xen_processor_performance_t; ++DEFINE_GUEST_HANDLE_STRUCT(xen_processor_performance); ++ ++struct xenpf_set_processor_pminfo { ++ /* IN variables */ ++ uint32_t id; /* ACPI CPU ID */ ++ uint32_t type; /* {XEN_PM_CX, XEN_PM_PX} */ ++ union { ++ struct xen_processor_power power;/* Cx: _CST/_CSD */ ++ struct xen_processor_performance perf; /* Px: _PPC/_PCT/_PSS/_PSD */ ++ }; ++}; ++typedef struct 
xenpf_set_processor_pminfo xenpf_set_processor_pminfo_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_set_processor_pminfo); ++ ++#define XENPF_get_cpuinfo 55 ++struct xenpf_pcpuinfo { ++ /* IN */ ++ uint32_t xen_cpuid; ++ /* OUT */ ++ /* The maxium cpu_id that is present */ ++ uint32_t max_present; ++#define XEN_PCPU_FLAGS_ONLINE 1 ++ /* Correponding xen_cpuid is not present*/ ++#define XEN_PCPU_FLAGS_INVALID 2 ++ uint32_t flags; ++ uint32_t apic_id; ++ uint32_t acpi_id; ++}; ++typedef struct xenpf_pcpuinfo xenpf_pcpuinfo_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_pcpuinfo_t); ++ ++#define XENPF_cpu_online 56 ++#define XENPF_cpu_offline 57 ++struct xenpf_cpu_ol { ++ uint32_t cpuid; ++}; ++typedef struct xenpf_cpu_ol xenpf_cpu_ol_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_cpu_ol_t); ++ ++#define XENPF_cpu_hotadd 58 ++struct xenpf_cpu_hotadd { ++ uint32_t apic_id; ++ uint32_t acpi_id; ++ uint32_t pxm; ++}; ++ ++ ++#define XENPF_mem_hotadd 59 ++struct xenpf_mem_hotadd { ++ uint64_t spfn; ++ uint64_t epfn; ++ uint32_t pxm; ++ uint32_t flags; ++}; ++ ++struct xen_platform_op { ++ uint32_t cmd; ++ uint32_t interface_version; /* XENPF_INTERFACE_VERSION */ ++ union { ++ struct xenpf_settime settime; ++ struct xenpf_add_memtype add_memtype; ++ struct xenpf_del_memtype del_memtype; ++ struct xenpf_read_memtype read_memtype; ++ struct xenpf_microcode_update microcode; ++ struct xenpf_platform_quirk platform_quirk; ++ struct xenpf_firmware_info firmware_info; ++ struct xenpf_enter_acpi_sleep enter_acpi_sleep; ++ struct xenpf_change_freq change_freq; ++ struct xenpf_getidletime getidletime; ++ struct xenpf_set_processor_pminfo set_pminfo; ++ struct xenpf_pcpuinfo pcpu_info; ++ struct xenpf_cpu_ol cpu_ol; ++ struct xenpf_cpu_hotadd cpu_add; ++ struct xenpf_mem_hotadd mem_add; ++ uint8_t pad[128]; ++ } u; ++}; ++typedef struct xen_platform_op xen_platform_op_t; ++DEFINE_GUEST_HANDLE_STRUCT(xen_platform_op_t); ++ ++#endif /* __XEN_PUBLIC_PLATFORM_H__ */ +diff --git a/include/xen/interface/sched.h b/include/xen/interface/sched.h +index 5fec575..dd55dac 100644 +--- a/include/xen/interface/sched.h ++++ b/include/xen/interface/sched.h +@@ -65,6 +65,39 @@ struct sched_poll { + DEFINE_GUEST_HANDLE_STRUCT(sched_poll); + + /* ++ * Declare a shutdown for another domain. The main use of this function is ++ * in interpreting shutdown requests and reasons for fully-virtualized ++ * domains. A para-virtualized domain may use SCHEDOP_shutdown directly. ++ * @arg == pointer to sched_remote_shutdown structure. ++ */ ++#define SCHEDOP_remote_shutdown 4 ++struct sched_remote_shutdown { ++ domid_t domain_id; /* Remote domain ID */ ++ unsigned int reason; /* SHUTDOWN_xxx reason */ ++}; ++ ++/* ++ * Latch a shutdown code, so that when the domain later shuts down it ++ * reports this code to the control tools. ++ * @arg == as for SCHEDOP_shutdown. ++ */ ++#define SCHEDOP_shutdown_code 5 ++ ++/* ++ * Setup, poke and destroy a domain watchdog timer. ++ * @arg == pointer to sched_watchdog structure. ++ * With id == 0, setup a domain watchdog timer to cause domain shutdown ++ * after timeout, returns watchdog id. ++ * With id != 0 and timeout == 0, destroy domain watchdog timer. ++ * With id != 0 and timeout != 0, poke watchdog timer and set new timeout. ++ */ ++#define SCHEDOP_watchdog 6 ++struct sched_watchdog { ++ uint32_t id; /* watchdog ID */ ++ uint32_t timeout; /* timeout */ ++}; ++ ++/* + * Reason codes for SCHEDOP_shutdown. These may be interpreted by control + * software to determine the appropriate action. 
For the most part, Xen does + not care about the shutdown code. +@@ -73,5 +106,6 @@ DEFINE_GUEST_HANDLE_STRUCT(sched_poll); + #define SHUTDOWN_reboot 1 /* Clean up, kill, and then restart. */ + #define SHUTDOWN_suspend 2 /* Clean up, save suspend info, kill. */ + #define SHUTDOWN_crash 3 /* Tell controller we've crashed. */ ++#define SHUTDOWN_watchdog 4 /* Restart because watchdog time expired. */ + + #endif /* __XEN_PUBLIC_SCHED_H__ */ +diff --git a/include/xen/interface/xen-mca.h b/include/xen/interface/xen-mca.h +new file mode 100644 +index 0000000..f31fdab +--- /dev/null ++++ b/include/xen/interface/xen-mca.h +@@ -0,0 +1,429 @@ ++/****************************************************************************** ++ * arch-x86/mca.h ++ * ++ * Contributed by Advanced Micro Devices, Inc. ++ * Author: Christoph Egger <Christoph.Egger@amd.com> ++ * ++ * Guest OS machine check interface to x86 Xen. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ */ ++ ++/* Full MCA functionality has the following use cases from the guest side: ++ * ++ * Must-haves: ++ * 1. Dom0 and DomU register machine check trap callback handlers ++ * (already done via "set_trap_table" hypercall) ++ * 2. Dom0 registers machine check event callback handler ++ * (doable via EVTCHNOP_bind_virq) ++ * 3. Dom0 and DomU fetch machine check data ++ * 4. Dom0 wants Xen to notify a DomU ++ * 5. Dom0 gets DomU ID from physical address ++ * 6. Dom0 wants Xen to kill DomU (already done for "xm destroy") ++ * ++ * Nice-to-haves: ++ * 7. Dom0 wants Xen to deactivate a physical CPU ++ * This is better done as a separate task (physical CPU hotplug), ++ * and the hypercall(s) should be sysctls ++ * 8. Page migration proposed from Xen NUMA work, where Dom0 can tell Xen to ++ * move a DomU (or Dom0 itself) away from a malicious page ++ * producing correctable errors. ++ * 9. offlining physical page: ++ * Xen frees and never reuses a certain physical page. ++ * 10. Test facility: Allow Dom0 to write values into machine check MSRs ++ * and tell Xen to trigger a machine check ++ */ ++ ++#ifndef __XEN_PUBLIC_ARCH_X86_MCA_H__ ++#define __XEN_PUBLIC_ARCH_X86_MCA_H__ ++ ++/* Hypercall */ ++#define __HYPERVISOR_mca __HYPERVISOR_arch_0 ++ ++/* ++ * The xen-unstable repo has interface version 0x03000001; our interface ++ * is incompatible with that and any future minor revisions, so we ++ * choose a different version number range that is numerically less ++ * than that used in xen-unstable. 
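[Dom0 consumes the machine-check telemetry defined below by binding VIRQ_MCA (VIRQ_ARCH_0, defined just after this) through the events API extended earlier in this patch. A minimal sketch, with the handler body elided and names invented:

#include <linux/interrupt.h>
#include <xen/events.h>

static irqreturn_t example_mca_interrupt(int irq, void *dev_id)
{
	/* Fetch the error log here via the __HYPERVISOR_mca hypercall. */
	return IRQ_HANDLED;
}

static int example_bind_mca_virq(void)
{
	/* Returns the bound IRQ on success, or a negative errno. */
	return bind_virq_to_irqhandler(VIRQ_MCA, 0, example_mca_interrupt,
				       0, "mca", NULL);
}
]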
++ */ ++#define XEN_MCA_INTERFACE_VERSION 0x01ecc003 ++ ++/* IN: Dom0 calls hypercall to retrieve nonurgent error log entry */ ++#define XEN_MC_NONURGENT 0x0001 ++/* IN: Dom0/DomU calls hypercall to retrieve urgent error log entry */ ++#define XEN_MC_URGENT 0x0002 ++/* IN: Dom0 acknowledges previosly-fetched error log entry */ ++#define XEN_MC_ACK 0x0004 ++ ++/* OUT: All is ok */ ++#define XEN_MC_OK 0x0 ++/* OUT: Domain could not fetch data. */ ++#define XEN_MC_FETCHFAILED 0x1 ++/* OUT: There was no machine check data to fetch. */ ++#define XEN_MC_NODATA 0x2 ++/* OUT: Between notification time and this hypercall an other ++ * (most likely) correctable error happened. The fetched data, ++ * does not match the original machine check data. */ ++#define XEN_MC_NOMATCH 0x4 ++ ++/* OUT: DomU did not register MC NMI handler. Try something else. */ ++#define XEN_MC_CANNOTHANDLE 0x8 ++/* OUT: Notifying DomU failed. Retry later or try something else. */ ++#define XEN_MC_NOTDELIVERED 0x10 ++/* Note, XEN_MC_CANNOTHANDLE and XEN_MC_NOTDELIVERED are mutually exclusive. */ ++ ++ ++#ifndef __ASSEMBLY__ ++ ++#define VIRQ_MCA VIRQ_ARCH_0 /* G. (DOM0) Machine Check Architecture */ ++ ++/* ++ * Machine Check Architecure: ++ * structs are read-only and used to report all kinds of ++ * correctable and uncorrectable errors detected by the HW. ++ * Dom0 and DomU: register a handler to get notified. ++ * Dom0 only: Correctable errors are reported via VIRQ_MCA ++ */ ++#define MC_TYPE_GLOBAL 0 ++#define MC_TYPE_BANK 1 ++#define MC_TYPE_EXTENDED 2 ++#define MC_TYPE_RECOVERY 3 ++ ++struct mcinfo_common { ++ uint16_t type; /* structure type */ ++ uint16_t size; /* size of this struct in bytes */ ++}; ++ ++ ++#define MC_FLAG_CORRECTABLE (1 << 0) ++#define MC_FLAG_UNCORRECTABLE (1 << 1) ++#define MC_FLAG_RECOVERABLE (1 << 2) ++#define MC_FLAG_POLLED (1 << 3) ++#define MC_FLAG_RESET (1 << 4) ++#define MC_FLAG_CMCI (1 << 5) ++#define MC_FLAG_MCE (1 << 6) ++/* contains global x86 mc information */ ++struct mcinfo_global { ++ struct mcinfo_common common; ++ ++ /* running domain at the time in error (most likely ++ * the impacted one) */ ++ uint16_t mc_domid; ++ uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */ ++ uint32_t mc_socketid; /* physical socket of the physical core */ ++ uint16_t mc_coreid; /* physical impacted core */ ++ uint16_t mc_core_threadid; /* core thread of physical core */ ++ uint32_t mc_apicid; ++ uint32_t mc_flags; ++ uint64_t mc_gstatus; /* global status */ ++}; ++ ++/* contains bank local x86 mc information */ ++struct mcinfo_bank { ++ struct mcinfo_common common; ++ ++ uint16_t mc_bank; /* bank nr */ ++ uint16_t mc_domid; /* Usecase 5: domain referenced by mc_addr on ++ * privileged pv-ops dom and if mc_addr is valid. ++ * Never valid on DomU. */ ++ uint64_t mc_status; /* bank status */ ++ uint64_t mc_addr; /* bank address, only valid ++ * if addr bit is set in mc_status */ ++ uint64_t mc_misc; ++ uint64_t mc_ctrl2; ++ uint64_t mc_tsc; ++}; ++ ++ ++struct mcinfo_msr { ++ uint64_t reg; /* MSR */ ++ uint64_t value; /* MSR value */ ++}; ++ ++/* contains mc information from other ++ * or additional mc MSRs */ ++struct mcinfo_extended { ++ struct mcinfo_common common; ++ ++ /* You can fill up to five registers. ++ * If you need more, then use this structure ++ * multiple times. */ ++ ++ uint32_t mc_msrs; /* Number of msr with valid values. 
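++	 * Only the first mc_msrs entries of the mc_msr[] array below are
++	 * meaningful. A consumer sketch (the loop and handler names here
++	 * are ours, not part of this interface):
++	 *
++	 *	for (i = 0; i < ext->mc_msrs; i++)
++	 *		handle_msr(ext->mc_msr[i].reg, ext->mc_msr[i].value);
++	 *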
*/ ++ /* ++ * Currently Intel extended MSR (32/64) include all gp registers ++ * and E(R)FLAGS, E(R)IP, E(R)MISC, up to 11/19 of them might be ++ * useful at present. So expand this array to 16/32 to leave room. ++ */ ++ struct mcinfo_msr mc_msr[sizeof(void *) * 4]; ++}; ++ ++/* Recovery Action flags. Giving recovery result information to DOM0 */ ++ ++/* Xen takes successful recovery action, the error is recovered */ ++#define REC_ACTION_RECOVERED (0x1 << 0) ++/* No action is performed by XEN */ ++#define REC_ACTION_NONE (0x1 << 1) ++/* It's possible DOM0 might take action ownership in some case */ ++#define REC_ACTION_NEED_RESET (0x1 << 2) ++ ++/* Different Recovery Action types, if the action is performed successfully, ++ * REC_ACTION_RECOVERED flag will be returned. ++ */ ++ ++/* Page Offline Action */ ++#define MC_ACTION_PAGE_OFFLINE (0x1 << 0) ++/* CPU offline Action */ ++#define MC_ACTION_CPU_OFFLINE (0x1 << 1) ++/* L3 cache disable Action */ ++#define MC_ACTION_CACHE_SHRINK (0x1 << 2) ++ ++/* Below interface used between XEN/DOM0 for passing XEN's recovery action ++ * information to DOM0. ++ * usage Senario: After offlining broken page, XEN might pass its page offline ++ * recovery action result to DOM0. DOM0 will save the information in ++ * non-volatile memory for further proactive actions, such as offlining the ++ * easy broken page earlier when doing next reboot. ++*/ ++struct page_offline_action { ++ /* Params for passing the offlined page number to DOM0 */ ++ uint64_t mfn; ++ uint64_t status; ++}; ++ ++struct cpu_offline_action { ++ /* Params for passing the identity of the offlined CPU to DOM0 */ ++ uint32_t mc_socketid; ++ uint16_t mc_coreid; ++ uint16_t mc_core_threadid; ++}; ++ ++#define MAX_UNION_SIZE 16 ++struct mcinfo_recovery { ++ struct mcinfo_common common; ++ uint16_t mc_bank; /* bank nr */ ++ /* Recovery Action Flags defined above such as REC_ACTION_DONE */ ++ uint8_t action_flags; ++ /* Recovery Action types defined above such as MC_ACTION_PAGE_OFFLINE */ ++ uint8_t action_types; ++ /* In future if more than one recovery action permitted per error bank, ++ * a mcinfo_recovery data array will be returned ++ */ ++ union { ++ struct page_offline_action page_retire; ++ struct cpu_offline_action cpu_offline; ++ uint8_t pad[MAX_UNION_SIZE]; ++ } action_info; ++}; ++ ++ ++#define MCINFO_HYPERCALLSIZE 1024 ++#define MCINFO_MAXSIZE 768 ++ ++struct mc_info { ++ /* Number of mcinfo_* entries in mi_data */ ++ uint32_t mi_nentries; ++ uint32_t _pad0; ++ uint64_t mi_data[(MCINFO_MAXSIZE - 1) / 8]; ++}; ++typedef struct mc_info mc_info_t; ++DEFINE_GUEST_HANDLE_STRUCT(mc_info); ++ ++#define __MC_MSR_ARRAYSIZE 8 ++#define __MC_NMSRS 1 ++#define MC_NCAPS 7 /* 7 CPU feature flag words */ ++#define MC_CAPS_STD_EDX 0 /* cpuid level 0x00000001 (%edx) */ ++#define MC_CAPS_AMD_EDX 1 /* cpuid level 0x80000001 (%edx) */ ++#define MC_CAPS_TM 2 /* cpuid level 0x80860001 (TransMeta) */ ++#define MC_CAPS_LINUX 3 /* Linux-defined */ ++#define MC_CAPS_STD_ECX 4 /* cpuid level 0x00000001 (%ecx) */ ++#define MC_CAPS_VIA 5 /* cpuid level 0xc0000001 */ ++#define MC_CAPS_AMD_ECX 6 /* cpuid level 0x80000001 (%ecx) */ ++ ++struct mcinfo_logical_cpu { ++ uint32_t mc_cpunr; ++ uint32_t mc_chipid; ++ uint16_t mc_coreid; ++ uint16_t mc_threadid; ++ uint32_t mc_apicid; ++ uint32_t mc_clusterid; ++ uint32_t mc_ncores; ++ uint32_t mc_ncores_active; ++ uint32_t mc_nthreads; ++ int32_t mc_cpuid_level; ++ uint32_t mc_family; ++ uint32_t mc_vendor; ++ uint32_t mc_model; ++ uint32_t mc_step; ++ char 
mc_vendorid[16];
++	char mc_brandid[64];
++	uint32_t mc_cpu_caps[MC_NCAPS];
++	uint32_t mc_cache_size;
++	uint32_t mc_cache_alignment;
++	int32_t mc_nmsrvals;
++	struct mcinfo_msr mc_msrvalues[__MC_MSR_ARRAYSIZE];
++};
++typedef struct mcinfo_logical_cpu mcinfo_logical_cpu_t;
++DEFINE_GUEST_HANDLE_STRUCT(mcinfo_logical_cpu);
++
++
++/*
++ * OS's should use these instead of writing their own lookup function
++ * each with its own bugs and drawbacks.
++ * We use macros instead of static inline functions to allow guests
++ * to include this header in assembly files (*.S).
++ */
++/* Prototype:
++ *    uint32_t x86_mcinfo_nentries(struct mc_info *mi);
++ */
++#define x86_mcinfo_nentries(_mi) \
++    ((_mi)->mi_nentries)
++/* Prototype:
++ *    struct mcinfo_common *x86_mcinfo_first(struct mc_info *mi);
++ */
++#define x86_mcinfo_first(_mi) \
++    ((struct mcinfo_common *)(_mi)->mi_data)
++/* Prototype:
++ *    struct mcinfo_common *x86_mcinfo_next(struct mcinfo_common *mic);
++ */
++#define x86_mcinfo_next(_mic) \
++    ((struct mcinfo_common *)((uint8_t *)(_mic) + (_mic)->size))
++
++/* Prototype:
++ *    void x86_mcinfo_lookup(void *ret, struct mc_info *mi, uint16_t type);
++ */
++
++static inline void x86_mcinfo_lookup
++	(struct mcinfo_common **ret, struct mc_info *mi, uint16_t type)
++{
++	uint32_t found = 0, i;
++	struct mcinfo_common *mic;
++
++	*ret = NULL;
++	if (!mi)
++		return;
++	mic = x86_mcinfo_first(mi);
++
++	for (i = 0; i < x86_mcinfo_nentries(mi); i++) {
++		if (mic->type == type) {
++			found = 1;
++			break;
++		}
++		mic = x86_mcinfo_next(mic);
++	}
++
++	*ret = found ? mic : NULL;
++}
++/* Usecase 1
++ * Register machine check trap callback handler
++ *    (already done via "set_trap_table" hypercall)
++ */
++
++/* Usecase 2
++ * Dom0 registers machine check event callback handler
++ * done by EVTCHNOP_bind_virq
++ */
++
++/* Usecase 3
++ * Fetch machine check data from hypervisor.
++ * Note, this hypercall is special, because both Dom0 and DomU must use this.
++ */
++#define XEN_MC_fetch 1
++struct xen_mc_fetch {
++	/* IN/OUT variables.
++	 * IN: XEN_MC_NONURGENT, XEN_MC_URGENT,
++	 *     XEN_MC_ACK if ack'ing an earlier fetch
++	 * OUT: XEN_MC_OK, XEN_MC_FETCHFAILED,
++	 *      XEN_MC_NODATA, XEN_MC_NOMATCH
++	 */
++	uint32_t flags;
++	uint32_t _pad0;
++	/* OUT: id for ack, IN: id we are ack'ing */
++	uint64_t fetch_id;
++
++	/* OUT variables. */
++	GUEST_HANDLE(mc_info) data;
++};
++typedef struct xen_mc_fetch xen_mc_fetch_t;
++DEFINE_GUEST_HANDLE_STRUCT(xen_mc_fetch);
++
++
++/* Usecase 4
++ * This tells the hypervisor to notify a DomU about the machine check error
++ */
++#define XEN_MC_notifydomain 2
++struct xen_mc_notifydomain {
++	/* IN variables. */
++	uint16_t mc_domid;/* The unprivileged domain to notify. */
++	uint16_t mc_vcpuid;/* The vcpu in mc_domid to notify.
++	 * Usually echo'd value from the fetch hypercall. */
++
++	/* IN/OUT variables. */
++	uint32_t flags;
++
++/* OUT: XEN_MC_OK, XEN_MC_CANNOTHANDLE, XEN_MC_NOTDELIVERED, XEN_MC_NOMATCH */
++};
++typedef struct xen_mc_notifydomain xen_mc_notifydomain_t;
++DEFINE_GUEST_HANDLE_STRUCT(xen_mc_notifydomain);
++
++#define XEN_MC_physcpuinfo 3
++struct xen_mc_physcpuinfo {
++	/* IN/OUT */
++	uint32_t ncpus;
++	uint32_t _pad0;
++	/* OUT */
++	GUEST_HANDLE(mcinfo_logical_cpu) info;
++};
++
++#define XEN_MC_msrinject 4
++#define MC_MSRINJ_MAXMSRS 8
++struct xen_mc_msrinject {
++	/* IN */
++	uint32_t mcinj_cpunr;/* target processor id */
++	uint32_t mcinj_flags;/* see MC_MSRINJ_F_* below */
++	uint32_t mcinj_count;/* 0 ..
count-1 in array are valid */ ++ uint32_t _pad0; ++ struct mcinfo_msr mcinj_msr[MC_MSRINJ_MAXMSRS]; ++}; ++ ++/* Flags for mcinj_flags above; bits 16-31 are reserved */ ++#define MC_MSRINJ_F_INTERPOSE 0x1 ++ ++#define XEN_MC_mceinject 5 ++struct xen_mc_mceinject { ++ unsigned int mceinj_cpunr; /* target processor id */ ++}; ++ ++struct xen_mc { ++ uint32_t cmd; ++ uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */ ++ union { ++ struct xen_mc_fetch mc_fetch; ++ struct xen_mc_notifydomain mc_notifydomain; ++ struct xen_mc_physcpuinfo mc_physcpuinfo; ++ struct xen_mc_msrinject mc_msrinject; ++ struct xen_mc_mceinject mc_mceinject; ++ } u; ++}; ++typedef struct xen_mc xen_mc_t; ++DEFINE_GUEST_HANDLE_STRUCT(xen_mc); ++ ++#endif /* __ASSEMBLY__ */ ++ ++#endif /* __XEN_PUBLIC_ARCH_X86_MCA_H__ */ +diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h +index 2befa3e..9ffaee0 100644 +--- a/include/xen/interface/xen.h ++++ b/include/xen/interface/xen.h +@@ -79,6 +79,7 @@ + #define VIRQ_CONSOLE 2 /* (DOM0) Bytes received on emergency console. */ + #define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */ + #define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */ ++#define VIRQ_PCPU_STATE 9 /* (DOM0) PCPU state changed */ + + /* Architecture-specific VIRQ definitions. */ + #define VIRQ_ARCH_0 16 +@@ -184,6 +185,8 @@ + #define MMUEXT_NEW_USER_BASEPTR 15 + + #ifndef __ASSEMBLY__ ++#include <linux/types.h> ++ + struct mmuext_op { + unsigned int cmd; + union { +@@ -449,9 +452,49 @@ struct start_info { + int8_t cmd_line[MAX_GUEST_CMDLINE]; + }; + ++struct dom0_vga_console_info { ++ uint8_t video_type; /* DOM0_VGA_CONSOLE_??? */ ++#define XEN_VGATYPE_TEXT_MODE_3 0x03 ++#define XEN_VGATYPE_VESA_LFB 0x23 ++ ++ union { ++ struct { ++ /* Font height, in pixels. */ ++ uint16_t font_height; ++ /* Cursor location (column, row). */ ++ uint16_t cursor_x, cursor_y; ++ /* Number of rows and columns (dimensions in characters). */ ++ uint16_t rows, columns; ++ } text_mode_3; ++ ++ struct { ++ /* Width and height, in pixels. */ ++ uint16_t width, height; ++ /* Bytes per scan line. */ ++ uint16_t bytes_per_line; ++ /* Bits per pixel. */ ++ uint16_t bits_per_pixel; ++ /* LFB physical address, and size (in units of 64kB). */ ++ uint32_t lfb_base; ++ uint32_t lfb_size; ++ /* RGB mask offsets and sizes, as defined by VBE 1.2+ */ ++ uint8_t red_pos, red_size; ++ uint8_t green_pos, green_size; ++ uint8_t blue_pos, blue_size; ++ uint8_t rsvd_pos, rsvd_size; ++ ++ /* VESA capabilities (offset 0xa, VESA command 0x4f00). */ ++ uint32_t gbl_caps; ++ /* Mode attributes (offset 0x0, VESA command 0x4f01). */ ++ uint16_t mode_attrs; ++ } vesa_lfb; ++ } u; ++}; ++ + /* These flags are passed in the 'flags' field of start_info_t. */ + #define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ + #define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ ++#define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */ + + typedef uint64_t cpumap_t; + +@@ -461,6 +504,8 @@ typedef uint8_t xen_domain_handle_t[16]; + #define __mk_unsigned_long(x) x ## UL + #define mk_unsigned_long(x) __mk_unsigned_long(x) + ++DEFINE_GUEST_HANDLE(uint64_t); ++ + #else /* __ASSEMBLY__ */ + + /* In assembly code we cannot use C numeric constant suffixes. 
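+  * For example, mk_unsigned_long(1) yields 1UL via the ## UL paste in the
+  * C branch above; in this branch it must produce a bare 1, which
+  * assemblers accept.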
*/ +diff --git a/include/xen/page.h b/include/xen/page.h +index eaf85fa..0be36b9 100644 +--- a/include/xen/page.h ++++ b/include/xen/page.h +@@ -1 +1,8 @@ ++#ifndef _XEN_PAGE_H ++#define _XEN_PAGE_H ++ + #include <asm/xen/page.h> ++ ++extern phys_addr_t xen_extra_mem_start, xen_extra_mem_size; ++ ++#endif /* _XEN_PAGE_H */ +diff --git a/include/xen/pcpu.h b/include/xen/pcpu.h +new file mode 100644 +index 0000000..7e8f9d1 +--- /dev/null ++++ b/include/xen/pcpu.h +@@ -0,0 +1,32 @@ ++#ifndef _XEN_PCPU_H ++#define _XEN_PCPU_H ++ ++#include <xen/interface/platform.h> ++#include <linux/sysdev.h> ++ ++extern int xen_pcpu_hotplug(int type, uint32_t apic_id); ++#define XEN_PCPU_ONLINE 0x01 ++#define XEN_PCPU_OFFLINE 0x02 ++#define XEN_PCPU_ADD 0x04 ++#define XEN_PCPU_REMOVE 0x08 ++ ++struct pcpu { ++ struct list_head pcpu_list; ++ struct sys_device sysdev; ++ uint32_t xen_id; ++ uint32_t apic_id; ++ uint32_t acpi_id; ++ uint32_t flags; ++}; ++ ++static inline int xen_pcpu_online(uint32_t flags) ++{ ++ return !!(flags & XEN_PCPU_FLAGS_ONLINE); ++} ++ ++extern int register_xen_pcpu_notifier(struct notifier_block *nb); ++ ++extern void unregister_xen_pcpu_notifier(struct notifier_block *nb); ++ ++extern int xen_pcpu_index(uint32_t acpi_id, int is_acpiid); ++#endif +diff --git a/include/xen/platform_pci.h b/include/xen/platform_pci.h +new file mode 100644 +index 0000000..a785a3b +--- /dev/null ++++ b/include/xen/platform_pci.h +@@ -0,0 +1,53 @@ ++#ifndef _XEN_PLATFORM_PCI_H ++#define _XEN_PLATFORM_PCI_H ++ ++#define XEN_IOPORT_MAGIC_VAL 0x49d2 ++#define XEN_IOPORT_LINUX_PRODNUM 0x0003 ++#define XEN_IOPORT_LINUX_DRVVER 0x0001 ++ ++#define XEN_IOPORT_BASE 0x10 ++ ++#define XEN_IOPORT_PLATFLAGS (XEN_IOPORT_BASE + 0) /* 1 byte access (R/W) */ ++#define XEN_IOPORT_MAGIC (XEN_IOPORT_BASE + 0) /* 2 byte access (R) */ ++#define XEN_IOPORT_UNPLUG (XEN_IOPORT_BASE + 0) /* 2 byte access (W) */ ++#define XEN_IOPORT_DRVVER (XEN_IOPORT_BASE + 0) /* 4 byte access (W) */ ++ ++#define XEN_IOPORT_SYSLOG (XEN_IOPORT_BASE + 2) /* 1 byte access (W) */ ++#define XEN_IOPORT_PROTOVER (XEN_IOPORT_BASE + 2) /* 1 byte access (R) */ ++#define XEN_IOPORT_PRODNUM (XEN_IOPORT_BASE + 2) /* 2 byte access (W) */ ++ ++#define XEN_UNPLUG_ALL_IDE_DISKS (1<<0) ++#define XEN_UNPLUG_ALL_NICS (1<<1) ++#define XEN_UNPLUG_AUX_IDE_DISKS (1<<2) ++#define XEN_UNPLUG_ALL (XEN_UNPLUG_ALL_IDE_DISKS|\ ++ XEN_UNPLUG_ALL_NICS|\ ++ XEN_UNPLUG_AUX_IDE_DISKS) ++ ++#define XEN_UNPLUG_UNNECESSARY (1<<16) ++#define XEN_UNPLUG_NEVER (1<<17) ++ ++static inline int xen_must_unplug_nics(void) { ++#if (defined(CONFIG_XEN_NETDEV_FRONTEND) || \ ++ defined(CONFIG_XEN_NETDEV_FRONTEND_MODULE)) && \ ++ (defined(CONFIG_XEN_PLATFORM_PCI) || \ ++ defined(CONFIG_XEN_PLATFORM_PCI_MODULE)) ++ return 1; ++#else ++ return 0; ++#endif ++} ++ ++static inline int xen_must_unplug_disks(void) { ++#if (defined(CONFIG_XEN_BLKDEV_FRONTEND) || \ ++ defined(CONFIG_XEN_BLKDEV_FRONTEND_MODULE)) && \ ++ (defined(CONFIG_XEN_PLATFORM_PCI) || \ ++ defined(CONFIG_XEN_PLATFORM_PCI_MODULE)) ++ return 1; ++#else ++ return 0; ++#endif ++} ++ ++extern int xen_platform_pci_unplug; ++ ++#endif /* _XEN_PLATFORM_PCI_H */ +diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h +new file mode 100644 +index 0000000..b42cdfd +--- /dev/null ++++ b/include/xen/privcmd.h +@@ -0,0 +1,80 @@ ++/****************************************************************************** ++ * privcmd.h ++ * ++ * Interface to /proc/xen/privcmd. 
++ * ++ * Copyright (c) 2003-2005, K A Fraser ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#ifndef __LINUX_PUBLIC_PRIVCMD_H__ ++#define __LINUX_PUBLIC_PRIVCMD_H__ ++ ++#include <linux/types.h> ++ ++typedef unsigned long xen_pfn_t; ++ ++#ifndef __user ++#define __user ++#endif ++ ++struct privcmd_hypercall { ++ __u64 op; ++ __u64 arg[5]; ++}; ++ ++struct privcmd_mmap_entry { ++ __u64 va; ++ __u64 mfn; ++ __u64 npages; ++}; ++ ++struct privcmd_mmap { ++ int num; ++ domid_t dom; /* target domain */ ++ struct privcmd_mmap_entry __user *entry; ++}; ++ ++struct privcmd_mmapbatch { ++ int num; /* number of pages to populate */ ++ domid_t dom; /* target domain */ ++ __u64 addr; /* virtual address */ ++ xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ ++}; ++ ++/* ++ * @cmd: IOCTL_PRIVCMD_HYPERCALL ++ * @arg: &privcmd_hypercall_t ++ * Return: Value returned from execution of the specified hypercall. 
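++ *
++ * A hedged user-space sketch (it assumes a kernel exposing
++ * /proc/xen/privcmd; error handling is elided, and __HYPERVISOR_xen_version
++ * and XENVER_version come from the Xen public headers, not this file):
++ *
++ *	struct privcmd_hypercall call = {
++ *		.op = __HYPERVISOR_xen_version,
++ *		.arg = { XENVER_version, 0 },
++ *	};
++ *	int fd = open("/proc/xen/privcmd", O_RDWR);
++ *	long ver = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call);
++ *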
++ */ ++#define IOCTL_PRIVCMD_HYPERCALL \ ++ _IOC(_IOC_NONE, 'P', 0, sizeof(struct privcmd_hypercall)) ++#define IOCTL_PRIVCMD_MMAP \ ++ _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap)) ++#define IOCTL_PRIVCMD_MMAPBATCH \ ++ _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch)) ++ ++#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ +diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h +index 883a21b..7058f8a 100644 +--- a/include/xen/xen-ops.h ++++ b/include/xen/xen-ops.h +@@ -7,6 +7,7 @@ DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); + + void xen_pre_suspend(void); + void xen_post_suspend(int suspend_cancelled); ++void xen_hvm_post_suspend(int suspend_cancelled); + + void xen_mm_pin_all(void); + void xen_mm_unpin_all(void); +@@ -14,4 +15,16 @@ void xen_mm_unpin_all(void); + void xen_timer_resume(void); + void xen_arch_resume(void); + ++int xen_remap_domain_mfn_range(struct vm_area_struct *vma, ++ unsigned long addr, ++ unsigned long mfn, int nr, ++ pgprot_t prot, unsigned domid); ++ ++extern unsigned long *xen_contiguous_bitmap; ++int xen_create_contiguous_region(unsigned long vstart, unsigned int order, ++ unsigned int address_bits); ++ ++void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order); ++int xen_setup_shutdown_event(void); ++ + #endif /* INCLUDE_XEN_OPS_H */ +diff --git a/include/xen/xen.h b/include/xen/xen.h +new file mode 100644 +index 0000000..77604ed +--- /dev/null ++++ b/include/xen/xen.h +@@ -0,0 +1,34 @@ ++#ifndef _XEN_XEN_H ++#define _XEN_XEN_H ++ ++enum xen_domain_type { ++ XEN_NATIVE, /* running on bare hardware */ ++ XEN_PV_DOMAIN, /* running in a PV domain */ ++ XEN_HVM_DOMAIN, /* running in a Xen hvm domain */ ++}; ++ ++#ifdef CONFIG_XEN ++extern enum xen_domain_type xen_domain_type; ++extern void xen_hvm_guest_init(void); ++#else ++#define xen_domain_type XEN_NATIVE ++#define xen_hvm_guest_init() do { } while (0) ++#endif ++ ++#define xen_domain() (xen_domain_type != XEN_NATIVE) ++#define xen_pv_domain() (xen_domain() && \ ++ xen_domain_type == XEN_PV_DOMAIN) ++#define xen_hvm_domain() (xen_domain() && \ ++ xen_domain_type == XEN_HVM_DOMAIN) ++ ++#ifdef CONFIG_XEN_DOM0 ++#include <xen/interface/xen.h> ++#include <asm/xen/hypervisor.h> ++ ++#define xen_initial_domain() (xen_pv_domain() && \ ++ xen_start_info->flags & SIF_INITDOMAIN) ++#else /* !CONFIG_XEN_DOM0 */ ++#define xen_initial_domain() (0) ++#endif /* CONFIG_XEN_DOM0 */ ++ ++#endif /* _XEN_XEN_H */ +diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h +index b9763ba..542ca7c 100644 +--- a/include/xen/xenbus.h ++++ b/include/xen/xenbus.h +@@ -93,7 +93,7 @@ struct xenbus_driver { + int (*remove)(struct xenbus_device *dev); + int (*suspend)(struct xenbus_device *dev, pm_message_t state); + int (*resume)(struct xenbus_device *dev); +- int (*uevent)(struct xenbus_device *, char **, int, char *, int); ++ int (*uevent)(struct xenbus_device *, struct kobj_uevent_env *); + struct device_driver driver; + int (*read_otherend_details)(struct xenbus_device *dev); + int (*is_ready)(struct xenbus_device *dev); +diff --git a/lib/Makefile b/lib/Makefile +index 452f188..001e918 100644 +--- a/lib/Makefile ++++ b/lib/Makefile +@@ -77,7 +77,8 @@ obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o + obj-$(CONFIG_SMP) += percpu_counter.o + obj-$(CONFIG_AUDIT_GENERIC) += audit.o + +-obj-$(CONFIG_SWIOTLB) += swiotlb.o ++obj-$(CONFIG_SWIOTLB) += swiotlb-core.o swiotlb.o ++obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o + obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o + obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o + 
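The hunk above splits the generic bounce-buffer engine out into swiotlb-core.c so that the Xen-aware consumer below can reuse it. A minimal sketch of how a boot path might choose between the two initializers -- the function names come from this patch, but the call site shown here is an assumption, not part of it:

	/* sketch: pick the Xen-aware SWIOTLB on pv guests (call site assumed) */
	void __init pci_swiotlb_detect_sketch(void)
	{
		if (xen_pv_domain())
			xen_swiotlb_init(1);	/* make bounce pages machine-contiguous */
		else
			swiotlb_init(1);	/* plain low-memory bounce buffers */
	}
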
+diff --git a/lib/swiotlb-core.c b/lib/swiotlb-core.c +new file mode 100644 +index 0000000..a17c89e +--- /dev/null ++++ b/lib/swiotlb-core.c +@@ -0,0 +1,572 @@ ++/* ++ * Dynamic DMA mapping support. ++ * ++ * This implementation is a fallback for platforms that do not support ++ * I/O TLBs (aka DMA address translation hardware). ++ * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com> ++ * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com> ++ * Copyright (C) 2000, 2003 Hewlett-Packard Co ++ * David Mosberger-Tang <davidm@hpl.hp.com> ++ * ++ * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API. ++ * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid ++ * unnecessary i-cache flushing. ++ * 04/07/.. ak Better overflow handling. Assorted fixes. ++ * 05/09/10 linville Add support for syncing ranges, support syncing for ++ * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup. ++ * 08/12/11 beckyb Add highmem support ++ */ ++ ++#include <linux/cache.h> ++#include <linux/dma-mapping.h> ++#include <linux/mm.h> ++#include <linux/module.h> ++#include <linux/spinlock.h> ++#include <linux/string.h> ++#include <linux/swiotlb.h> ++#include <linux/pfn.h> ++#include <linux/types.h> ++#include <linux/ctype.h> ++#include <linux/highmem.h> ++ ++#include <linux/io.h> ++#include <asm/dma.h> ++#include <linux/scatterlist.h> ++ ++#include <linux/init.h> ++#include <linux/bootmem.h> ++#include <linux/iommu-helper.h> ++ ++#define OFFSET(val, align) ((unsigned long) ((val) & ((align) - 1))) ++ ++#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) ++ ++/* ++ * Minimum IO TLB size to bother booting with. Systems with mainly ++ * 64bit capable cards will only lightly use the swiotlb. If we can't ++ * allocate a contiguous 1MB, we're probably in trouble anyway. ++ */ ++#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) ++ ++int swiotlb_force; ++ ++/* ++ * Used to do a quick range check in do_unmap_single and ++ * do_sync_single_*, to see if the memory was in fact allocated by this ++ * API. ++ */ ++char *io_tlb_start, *io_tlb_end; ++ ++/* ++ * The number of IO TLB blocks (in groups of 64) betweeen io_tlb_start and ++ * io_tlb_end. This is command line adjustable via setup_io_tlb_npages. ++ */ ++unsigned long io_tlb_nslabs; ++ ++/* ++ * When the IOMMU overflows we return a fallback buffer. This sets the size. ++ */ ++unsigned long io_tlb_overflow = 32*1024; ++ ++void *io_tlb_overflow_buffer; ++ ++/* ++ * This is a free list describing the number of free entries available from ++ * each index ++ */ ++static unsigned int *io_tlb_list; ++static unsigned int io_tlb_index; ++ ++/* ++ * We need to save away the original address corresponding to a mapped entry ++ * for the sync operations. 
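++ * The bookkeeping is purely index based: for a bounce address inside
++ * the aperture,
++ *
++ *	index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
++ *
++ * and io_tlb_orig_addr[index] is the original physical address to
++ * bounce to or from (see do_unmap_single() and do_sync_single()).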
++ */ ++static phys_addr_t *io_tlb_orig_addr; ++ ++/* ++ * Protect the above data structures in the map and unmap calls ++ */ ++static DEFINE_SPINLOCK(io_tlb_lock); ++ ++static int late_alloc; ++ ++static int __init ++setup_io_tlb_npages(char *str) ++{ ++ int get_value(const char *token, char *str, char **endp) ++ { ++ ssize_t len; ++ int val = 0; ++ ++ len = strlen(token); ++ if (!strncmp(str, token, len)) { ++ str += len; ++ if (*str == '=') ++ ++str; ++ if (*str != '\0') ++ val = simple_strtoul(str, endp, 0); ++ } ++ *endp = str; ++ return val; ++ } ++ ++ int val; ++ ++ while (*str) { ++ /* The old syntax */ ++ if (isdigit(*str)) { ++ io_tlb_nslabs = simple_strtoul(str, &str, 0); ++ /* avoid tail segment of size < IO_TLB_SEGSIZE */ ++ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); ++ } ++ if (!strncmp(str, "force", 5)) ++ swiotlb_force = 1; ++ /* The new syntax: swiotlb=nslabs=16384,overflow=32768,force */ ++ val = get_value("nslabs", str, &str); ++ if (val) ++ io_tlb_nslabs = ALIGN(val, IO_TLB_SEGSIZE); ++ ++ val = get_value("overflow", str, &str); ++ if (val) ++ io_tlb_overflow = val; ++ str = strpbrk(str, ","); ++ if (!str) ++ break; ++ str++; /* skip ',' */ ++ } ++ return 1; ++} ++__setup("swiotlb=", setup_io_tlb_npages); ++ ++void swiotlb_print_info(void) ++{ ++ unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; ++ phys_addr_t pstart, pend; ++ ++ pstart = virt_to_phys(io_tlb_start); ++ pend = virt_to_phys(io_tlb_end); ++ ++ printk(KERN_INFO "DMA: Placing %luMB software IO TLB between %p - %p\n", ++ bytes >> 20, io_tlb_start, io_tlb_end); ++ printk(KERN_INFO "DMA: software IO TLB at phys %#llx - %#llx\n", ++ (unsigned long long)pstart, ++ (unsigned long long)pend); ++} ++ ++/* ++ * Statically reserve bounce buffer space and initialize bounce buffer data ++ * structures for the software IO TLB used to implement the DMA API. ++ */ ++void __init ++swiotlb_init_early(size_t default_size, int verbose) ++{ ++ unsigned long i, bytes; ++ ++ if (!io_tlb_nslabs) { ++ io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); ++ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); ++ } ++ ++ bytes = io_tlb_nslabs << IO_TLB_SHIFT; ++ ++ /* ++ * Get IO TLB memory from the low pages ++ */ ++ io_tlb_start = alloc_bootmem_low_pages(bytes); ++ if (!io_tlb_start) ++ panic("DMA: Cannot allocate SWIOTLB buffer"); ++ io_tlb_end = io_tlb_start + bytes; ++ ++ /* ++ * Allocate and initialize the free list array. This array is used ++ * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE ++ * between io_tlb_start and io_tlb_end. ++ */ ++ io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int)); ++ for (i = 0; i < io_tlb_nslabs; i++) ++ io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); ++ io_tlb_index = 0; ++ io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t)); ++ ++ /* ++ * Get the overflow emergency buffer ++ */ ++ io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow); ++ if (!io_tlb_overflow_buffer) ++ panic("DMA: Cannot allocate SWIOTLB overflow buffer!\n"); ++ if (verbose) ++ swiotlb_print_info(); ++} ++ ++void __init ++swiotlb_init(int verbose) ++{ ++ swiotlb_init_early(64 * (1<<20), verbose); /* default to 64MB */ ++} ++ ++/* ++ * Systems with larger DMA zones (those that don't support ISA) can ++ * initialize the swiotlb later using the slab allocator if needed. ++ * This should be just like above, but with some error catching. 
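++ * It returns -ENOMEM on failure (restoring io_tlb_nslabs), so a caller
++ * sketch -- ours, not part of this patch -- is simply:
++ *
++ *	if (swiotlb_init_late(64 * (1 << 20)))
++ *		printk(KERN_WARNING "DMA: no software IO TLB\n");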
++ */ ++int ++swiotlb_init_late(size_t default_size) ++{ ++ unsigned long i, bytes, req_nslabs = io_tlb_nslabs; ++ unsigned int order; ++ ++ if (!io_tlb_nslabs) { ++ io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); ++ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); ++ } ++ ++ /* ++ * Get IO TLB memory from the low pages ++ */ ++ order = get_order(io_tlb_nslabs << IO_TLB_SHIFT); ++ io_tlb_nslabs = SLABS_PER_PAGE << order; ++ bytes = io_tlb_nslabs << IO_TLB_SHIFT; ++ ++ while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { ++ io_tlb_start = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, ++ order); ++ if (io_tlb_start) ++ break; ++ order--; ++ } ++ ++ if (!io_tlb_start) ++ goto cleanup1; ++ ++ if (order != get_order(bytes)) { ++ printk(KERN_WARNING "DMA: Warning: only able to allocate %ld MB" ++ " for software IO TLB\n", (PAGE_SIZE << order) >> 20); ++ io_tlb_nslabs = SLABS_PER_PAGE << order; ++ bytes = io_tlb_nslabs << IO_TLB_SHIFT; ++ } ++ io_tlb_end = io_tlb_start + bytes; ++ memset(io_tlb_start, 0, bytes); ++ ++ /* ++ * Allocate and initialize the free list array. This array is used ++ * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE ++ * between io_tlb_start and io_tlb_end. ++ */ ++ io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, ++ get_order(io_tlb_nslabs * sizeof(int))); ++ if (!io_tlb_list) ++ goto cleanup2; ++ ++ for (i = 0; i < io_tlb_nslabs; i++) ++ io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); ++ io_tlb_index = 0; ++ ++ io_tlb_orig_addr = (phys_addr_t *) __get_free_pages(GFP_KERNEL, ++ get_order(io_tlb_nslabs * sizeof(phys_addr_t))); ++ if (!io_tlb_orig_addr) ++ goto cleanup3; ++ ++ memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(phys_addr_t)); ++ ++ /* ++ * Get the overflow emergency buffer ++ */ ++ io_tlb_overflow_buffer = (void *)__get_free_pages(GFP_DMA, ++ get_order(io_tlb_overflow)); ++ if (!io_tlb_overflow_buffer) ++ goto cleanup4; ++ ++ swiotlb_print_info(); ++ ++ late_alloc = 1; ++ ++ return 0; ++ ++cleanup4: ++ free_pages((unsigned long)io_tlb_orig_addr, ++ get_order(io_tlb_nslabs * sizeof(phys_addr_t))); ++ io_tlb_orig_addr = NULL; ++cleanup3: ++ free_pages((unsigned long)io_tlb_list, ++ get_order(io_tlb_nslabs * sizeof(int))); ++ io_tlb_list = NULL; ++cleanup2: ++ io_tlb_end = NULL; ++ free_pages((unsigned long)io_tlb_start, order); ++ io_tlb_start = NULL; ++cleanup1: ++ io_tlb_nslabs = req_nslabs; ++ return -ENOMEM; ++} ++ ++void __init swiotlb_free(void) ++{ ++ if (!io_tlb_overflow_buffer) ++ return; ++ ++ if (late_alloc) { ++ free_pages((unsigned long)io_tlb_overflow_buffer, ++ get_order(io_tlb_overflow)); ++ free_pages((unsigned long)io_tlb_orig_addr, ++ get_order(io_tlb_nslabs * sizeof(phys_addr_t))); ++ free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * ++ sizeof(int))); ++ free_pages((unsigned long)io_tlb_start, ++ get_order(io_tlb_nslabs << IO_TLB_SHIFT)); ++ } else { ++ free_bootmem_late(__pa(io_tlb_overflow_buffer), ++ io_tlb_overflow); ++ free_bootmem_late(__pa(io_tlb_orig_addr), ++ io_tlb_nslabs * sizeof(phys_addr_t)); ++ free_bootmem_late(__pa(io_tlb_list), ++ io_tlb_nslabs * sizeof(int)); ++ free_bootmem_late(__pa(io_tlb_start), ++ io_tlb_nslabs << IO_TLB_SHIFT); ++ } ++} ++ ++int is_swiotlb_buffer(phys_addr_t paddr) ++{ ++ return paddr >= virt_to_phys(io_tlb_start) && ++ paddr < virt_to_phys(io_tlb_end); ++} ++ ++/* ++ * Bounce: copy the swiotlb buffer back to the original dma location ++ */ ++void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, ++ enum 
dma_data_direction dir) ++{ ++ unsigned long pfn = PFN_DOWN(phys); ++ ++ if (PageHighMem(pfn_to_page(pfn))) { ++ /* The buffer does not have a mapping. Map it in and copy */ ++ unsigned int offset = phys & ~PAGE_MASK; ++ char *buffer; ++ unsigned int sz = 0; ++ unsigned long flags; ++ ++ while (size) { ++ sz = min_t(size_t, PAGE_SIZE - offset, size); ++ ++ local_irq_save(flags); ++ buffer = kmap_atomic(pfn_to_page(pfn), ++ KM_BOUNCE_READ); ++ if (dir == DMA_TO_DEVICE) ++ memcpy(dma_addr, buffer + offset, sz); ++ else ++ memcpy(buffer + offset, dma_addr, sz); ++ kunmap_atomic(buffer, KM_BOUNCE_READ); ++ local_irq_restore(flags); ++ ++ size -= sz; ++ pfn++; ++ dma_addr += sz; ++ offset = 0; ++ } ++ } else { ++ if (dir == DMA_TO_DEVICE) ++ memcpy(dma_addr, phys_to_virt(phys), size); ++ else ++ memcpy(phys_to_virt(phys), dma_addr, size); ++ } ++} ++ ++/* ++ * Allocates bounce buffer and returns its kernel virtual address. ++ */ ++void * ++do_map_single(struct device *hwdev, phys_addr_t phys, ++ unsigned long start_dma_addr, size_t size, int dir) ++{ ++ unsigned long flags; ++ char *dma_addr; ++ unsigned int nslots, stride, index, wrap; ++ int i; ++ unsigned long mask; ++ unsigned long offset_slots; ++ unsigned long max_slots; ++ ++ mask = dma_get_seg_boundary(hwdev); ++ start_dma_addr = start_dma_addr & mask; ++ offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; ++ ++ /* ++ * Carefully handle integer overflow which can occur when mask == ~0UL. ++ */ ++ max_slots = mask + 1 ++ ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT ++ : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); ++ ++ /* ++ * For mappings greater than a page, we limit the stride (and ++ * hence alignment) to a page size. ++ */ ++ nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; ++ if (size > PAGE_SIZE) ++ stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); ++ else ++ stride = 1; ++ ++ BUG_ON(!nslots); ++ ++ /* ++ * Find suitable number of IO TLB entries size that will fit this ++ * request and allocate a buffer from that IO TLB pool. ++ */ ++ spin_lock_irqsave(&io_tlb_lock, flags); ++ index = ALIGN(io_tlb_index, stride); ++ if (index >= io_tlb_nslabs) ++ index = 0; ++ wrap = index; ++ ++ do { ++ while (iommu_is_span_boundary(index, nslots, offset_slots, ++ max_slots)) { ++ index += stride; ++ if (index >= io_tlb_nslabs) ++ index = 0; ++ if (index == wrap) ++ goto not_found; ++ } ++ ++ /* ++ * If we find a slot that indicates we have 'nslots' number of ++ * contiguous buffers, we allocate the buffers from that slot ++ * and mark the entries as '0' indicating unavailable. ++ */ ++ if (io_tlb_list[index] >= nslots) { ++ int count = 0; ++ ++ for (i = index; i < (int) (index + nslots); i++) ++ io_tlb_list[i] = 0; ++ for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) ++ != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) ++ io_tlb_list[i] = ++count; ++ dma_addr = io_tlb_start + (index << IO_TLB_SHIFT); ++ ++ /* ++ * Update the indices to avoid searching in the next ++ * round. ++ */ ++ io_tlb_index = ((index + nslots) < io_tlb_nslabs ++ ? (index + nslots) : 0); ++ ++ goto found; ++ } ++ index += stride; ++ if (index >= io_tlb_nslabs) ++ index = 0; ++ } while (index != wrap); ++ ++not_found: ++ spin_unlock_irqrestore(&io_tlb_lock, flags); ++ return NULL; ++found: ++ spin_unlock_irqrestore(&io_tlb_lock, flags); ++ ++ /* ++ * Save away the mapping from the original address to the DMA address. ++ * This is needed when we sync the memory. Then we sync the buffer if ++ * needed. 
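++	 * One physical address is recorded per IO_TLB_SHIFT-sized slot so
++	 * that a later partial sync can locate its exact source. Note that
++	 * the copy just below only runs for DMA_TO_DEVICE and
++	 * DMA_BIDIRECTIONAL; DMA_FROM_DEVICE data is copied back in
++	 * do_unmap_single()/do_sync_single() instead.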
++ */ ++ for (i = 0; i < nslots; i++) ++ io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT); ++ if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) ++ swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE); ++ ++ return dma_addr; ++} ++ ++/* ++ * dma_addr is the kernel virtual address of the bounce buffer to unmap. ++ */ ++void ++do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) ++{ ++ unsigned long flags; ++ int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; ++ int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; ++ phys_addr_t phys = io_tlb_orig_addr[index]; ++ ++ /* ++ * First, sync the memory before unmapping the entry ++ */ ++ if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) ++ swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE); ++ ++ /* ++ * Return the buffer to the free list by setting the corresponding ++ * entries to indicate the number of contigous entries available. ++ * While returning the entries to the free list, we merge the entries ++ * with slots below and above the pool being returned. ++ */ ++ spin_lock_irqsave(&io_tlb_lock, flags); ++ { ++ count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? ++ io_tlb_list[index + nslots] : 0); ++ /* ++ * Step 1: return the slots to the free list, merging the ++ * slots with superceeding slots ++ */ ++ for (i = index + nslots - 1; i >= index; i--) ++ io_tlb_list[i] = ++count; ++ /* ++ * Step 2: merge the returned slots with the preceding slots, ++ * if available (non zero) ++ */ ++ for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != ++ IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) ++ io_tlb_list[i] = ++count; ++ } ++ spin_unlock_irqrestore(&io_tlb_lock, flags); ++} ++ ++void ++do_sync_single(struct device *hwdev, char *dma_addr, size_t size, ++ int dir, int target) ++{ ++ int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; ++ phys_addr_t phys = io_tlb_orig_addr[index]; ++ ++ phys += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1)); ++ ++ switch (target) { ++ case SYNC_FOR_CPU: ++ if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) ++ swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE); ++ else ++ BUG_ON(dir != DMA_TO_DEVICE); ++ break; ++ case SYNC_FOR_DEVICE: ++ if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) ++ swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE); ++ else ++ BUG_ON(dir != DMA_FROM_DEVICE); ++ break; ++ default: ++ BUG(); ++ } ++} ++void ++swiotlb_full(struct device *dev, size_t size, int dir, int do_panic) ++{ ++ /* ++ * Ran out of IOMMU space for this operation. This is very bad. ++ * Unfortunately the drivers cannot handle this operation properly. ++ * unless they check for dma_mapping_error (most don't) ++ * When the mapping is small enough return a static buffer to limit ++ * the damage, or panic when the transfer is too big. ++ */ ++ dev_err(dev, "DMA: Out of SW-IOMMU space for %zu bytes.", size); ++ ++ if (size <= io_tlb_overflow || !do_panic) ++ return; ++ ++ if (dir == DMA_BIDIRECTIONAL) ++ panic("DMA: Random memory could be DMA accessed\n"); ++ if (dir == DMA_FROM_DEVICE) ++ panic("DMA: Random memory could be DMA written\n"); ++ if (dir == DMA_TO_DEVICE) ++ panic("DMA: Random memory could be DMA read\n"); ++} +diff --git a/lib/swiotlb-xen.c b/lib/swiotlb-xen.c +new file mode 100644 +index 0000000..bee577f +--- /dev/null ++++ b/lib/swiotlb-xen.c +@@ -0,0 +1,504 @@ ++/* An software based IOMMU that utilizes the swiotlb-core fuctionality. 
* It can function on Xen when there are PCI devices present. */
++
++
++#include <linux/dma-mapping.h>
++#include <linux/io.h>
++#include <asm/dma.h>
++#include <linux/scatterlist.h>
++#include <xen/interface/xen.h>
++#include <xen/grant_table.h>
++
++#include <asm/xen/page.h>
++#include <xen/page.h>
++#include <xen/xen-ops.h>
++
++static dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
++{
++	return phys_to_machine(XPADDR(paddr)).maddr;
++}
++
++static phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
++{
++	return machine_to_phys(XMADDR(baddr)).paddr;
++}
++
++static dma_addr_t xen_virt_to_bus(void *address)
++{
++	return xen_phys_to_bus(virt_to_phys(address));
++}
++
++static int check_pages_physically_contiguous(unsigned long pfn,
++					     unsigned int offset,
++					     size_t length)
++{
++	unsigned long next_mfn;
++	int i;
++	int nr_pages;
++
++	next_mfn = pfn_to_mfn(pfn);
++	nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
++
++	for (i = 1; i < nr_pages; i++) {
++		if (pfn_to_mfn(++pfn) != ++next_mfn)
++			return 0;
++	}
++	return 1;
++}
++
++static int range_straddles_page_boundary(phys_addr_t p, size_t size)
++{
++	unsigned long pfn = PFN_DOWN(p);
++	unsigned int offset = p & ~PAGE_MASK;
++
++	if (offset + size <= PAGE_SIZE)
++		return 0;
++	if (check_pages_physically_contiguous(pfn, offset, size))
++		return 0;
++	return 1;
++}
++
++
++bool xen_dma_capable(struct device *dev, dma_addr_t dev_addr,
++		     phys_addr_t phys, size_t size)
++{
++	int rc = 0;
++
++	rc = dma_capable(dev, dev_addr, size) &&
++	     !range_straddles_page_boundary(phys, size);
++	return rc;
++}
++
++static int is_xen_swiotlb_buffer(dma_addr_t dma_addr)
++{
++	unsigned long mfn = PFN_DOWN(dma_addr);
++	unsigned long pfn = mfn_to_local_pfn(mfn);
++
++	/* If the address is outside our domain, it CAN have the same virtual
++	 * address as another address in our domain. Hence only check address
++	 * within our domain. */
++	if (pfn_valid(pfn))
++		return is_swiotlb_buffer(PFN_PHYS(pfn));
++
++	return 0;
++}
++void *
++xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
++			   dma_addr_t *dma_handle, gfp_t flags)
++{
++	void *ret;
++	int order = get_order(size);
++	u64 dma_mask = DMA_BIT_MASK(32);
++	unsigned long vstart;
++
++	/*
++	 * Ignore region specifiers - the kernel's ideas of
++	 * pseudo-phys memory layout has nothing to do with the
++	 * machine physical layout. We can't allocate highmem
++	 * because we can't return a pointer to it.
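++	 *
++	 * The returned *dma_handle is a machine address: after the
++	 * allocation below, the region is exchanged with Xen via
++	 * xen_create_contiguous_region() so it is machine-contiguous
++	 * and fits under the device's DMA mask. Driver-side sketch
++	 * (variable names ours):
++	 *
++	 *	dma_addr_t dma;
++	 *	void *buf = xen_swiotlb_alloc_coherent(dev, len, &dma,
++	 *					       GFP_KERNEL);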
++ */ ++ flags &= ~(__GFP_DMA | __GFP_HIGHMEM); ++ ++ if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret)) ++ return ret; ++ ++ vstart = __get_free_pages(flags, order); ++ ret = (void *)vstart; ++ ++ if (hwdev && hwdev->coherent_dma_mask) ++ dma_mask = dma_alloc_coherent_mask(hwdev, flags); ++ ++ if (ret) { ++ if (xen_create_contiguous_region(vstart, order, ++ fls64(dma_mask)) != 0) { ++ free_pages(vstart, order); ++ return NULL; ++ } ++ memset(ret, 0, size); ++ *dma_handle = virt_to_machine(ret).maddr; ++ } ++ return ret; ++} ++EXPORT_SYMBOL(xen_swiotlb_alloc_coherent); ++ ++void ++xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, ++ dma_addr_t dev_addr) ++{ ++ int order = get_order(size); ++ ++ if (dma_release_from_coherent(hwdev, order, vaddr)) ++ return; ++ ++ xen_destroy_contiguous_region((unsigned long)vaddr, order); ++ free_pages((unsigned long)vaddr, order); ++} ++EXPORT_SYMBOL(xen_swiotlb_free_coherent); ++ ++ ++static int max_dma_bits = 32; ++ ++static int ++xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) ++{ ++ int i, rc; ++ int dma_bits; ++ ++ printk(KERN_INFO "xen_swiotlb_fixup: buf=%p size=%zu\n", ++ buf, size); ++ ++ dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; ++ ++ i = 0; ++ do { ++ int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE); ++ ++ do { ++ rc = xen_create_contiguous_region( ++ (unsigned long)buf + (i << IO_TLB_SHIFT), ++ get_order(slabs << IO_TLB_SHIFT), ++ dma_bits); ++ } while (rc && dma_bits++ < max_dma_bits); ++ if (rc) ++ return rc; ++ ++ i += slabs; ++ } while(i < nslabs); ++ return 0; ++} ++ ++void __init xen_swiotlb_init(int verbose) ++{ ++ int rc = 0; ++ ++ swiotlb_init_early(64 * (1<<20), verbose); ++ ++ if ((rc = xen_swiotlb_fixup(io_tlb_start, ++ io_tlb_nslabs << IO_TLB_SHIFT, ++ io_tlb_nslabs))) ++ goto error; ++ ++ if ((rc = xen_swiotlb_fixup(io_tlb_overflow_buffer, ++ io_tlb_overflow, ++ io_tlb_overflow >> IO_TLB_SHIFT))) ++ goto error; ++ ++ return; ++error: ++ panic("DMA(%d): Failed to exchange pages allocated for DMA with Xen! "\ ++ "We either don't have the permission or you do not have enough"\ ++ "free memory under 4GB!\n", rc); ++} ++ ++/* ++ * Map a single buffer of the indicated size for DMA in streaming mode. The ++ * physical address to use is returned. ++ * ++ * Once the device is given the dma address, the device owns this memory until ++ * either xen_swiotlb_unmap_page or xen_swiotlb_dma_sync_single is performed. ++ */ ++dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir, ++ struct dma_attrs *attrs) ++{ ++ unsigned long start_dma_addr; ++ phys_addr_t phys = page_to_phys(page) + offset; ++ dma_addr_t dev_addr = xen_phys_to_bus(phys); ++ void *map; ++ ++ BUG_ON(dir == DMA_NONE); ++ /* ++ * If the address happens to be in the device's DMA window, ++ * we can safely return the device addr and not worry about bounce ++ * buffering it. ++ */ ++ if (dma_capable(dev, dev_addr, size) && ++ !range_straddles_page_boundary(phys, size) && !swiotlb_force) ++ return dev_addr; ++ ++ /* ++ * Oh well, have to allocate and map a bounce buffer. 
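++	 * If the aperture is exhausted, do_map_single() returns NULL and
++	 * we fall back to the emergency io_tlb_overflow_buffer below;
++	 * callers detect that case with xen_swiotlb_dma_mapping_error(),
++	 * e.g. (sketch):
++	 *
++	 *	if (xen_swiotlb_dma_mapping_error(dev, dev_addr))
++	 *		goto give_up;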
++ */ ++ start_dma_addr = xen_virt_to_bus(io_tlb_start); ++ map = do_map_single(dev, phys, start_dma_addr, size, dir); ++ if (!map) { ++ swiotlb_full(dev, size, dir, 1); ++ map = io_tlb_overflow_buffer; ++ } ++ ++ dev_addr = xen_virt_to_bus(map); ++ ++ /* ++ * Ensure that the address returned is DMA'ble ++ */ ++ if (!dma_capable(dev, dev_addr, size)) ++ panic("DMA: xen_swiotlb_map_single: bounce buffer is not " \ ++ "DMA'ble\n"); ++ return dev_addr; ++} ++EXPORT_SYMBOL_GPL(xen_swiotlb_map_page); ++ ++/* ++ * Unmap a single streaming mode DMA translation. The dma_addr and size must ++ * match what was provided for in a previous xen_swiotlb_map_page call. All ++ * other usages are undefined. ++ * ++ * After this call, reads by the cpu to the buffer are guaranteed to see ++ * whatever the device wrote there. ++ */ ++static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, ++ size_t size, int dir) ++{ ++ phys_addr_t paddr = xen_bus_to_phys(dev_addr); ++ ++ BUG_ON(dir == DMA_NONE); ++ ++ /* NOTE: We use dev_addr here, not paddr! */ ++ if (is_xen_swiotlb_buffer(dev_addr)) { ++ do_unmap_single(hwdev, phys_to_virt(paddr), size, dir); ++ return; ++ } ++ ++ if (dir != DMA_FROM_DEVICE) ++ return; ++ ++ /* ++ * phys_to_virt doesn't work with hihgmem page but we could ++ * call dma_mark_clean() with hihgmem page here. However, we ++ * are fine since dma_mark_clean() is null on POWERPC. We can ++ * make dma_mark_clean() take a physical address if necessary. ++ */ ++ dma_mark_clean(phys_to_virt(paddr), size); ++} ++ ++void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, ++ size_t size, enum dma_data_direction dir, ++ struct dma_attrs *attrs) ++{ ++ unmap_single(hwdev, dev_addr, size, dir); ++} ++EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page); ++ ++/* ++ * Make physical memory consistent for a single streaming mode DMA translation ++ * after a transfer. ++ * ++ * If you perform a xen_swiotlb_map_page() but wish to interrogate the buffer ++ * using the cpu, yet do not wish to teardown the dma mapping, you must ++ * call this function before doing so. At the next point you give the dma ++ * address back to the card, you must first perform a ++ * xen_swiotlb_dma_sync_for_device, and then the device again owns the buffer ++ */ ++static void ++xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, ++ size_t size, int dir, int target) ++{ ++ phys_addr_t paddr = xen_bus_to_phys(dev_addr); ++ ++ BUG_ON(dir == DMA_NONE); ++ ++ if (is_xen_swiotlb_buffer(dev_addr)) { ++ do_sync_single(hwdev, phys_to_virt(paddr), size, dir, target); ++ return; ++ } ++ ++ if (dir != DMA_FROM_DEVICE) ++ return; ++ ++ dma_mark_clean(phys_to_virt(paddr), size); ++} ++ ++void ++xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, ++ size_t size, enum dma_data_direction dir) ++{ ++ xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU); ++} ++EXPORT_SYMBOL(xen_swiotlb_sync_single_for_cpu); ++ ++void ++xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, ++ size_t size, enum dma_data_direction dir) ++{ ++ xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE); ++} ++EXPORT_SYMBOL(xen_swiotlb_sync_single_for_device); ++ ++/* ++ * Same as above, but for a sub-range of the mapping. 
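++ * The offset is relative to the dev_addr returned by
++ * xen_swiotlb_map_page(); e.g. to pull in just a header the device has
++ * written (a sketch, with our variable names):
++ *
++ *	xen_swiotlb_sync_single_range_for_cpu(dev, dma, 0, hdr_len,
++ *					      DMA_FROM_DEVICE);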
++ */ ++static void ++xen_swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr, ++ unsigned long offset, size_t size, ++ int dir, int target) ++{ ++ xen_swiotlb_sync_single(hwdev, dev_addr + offset, size, dir, target); ++} ++ ++void ++xen_swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir) ++{ ++ xen_swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir, ++ SYNC_FOR_CPU); ++} ++EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_range_for_cpu); ++ ++void ++xen_swiotlb_sync_single_range_for_device(struct device *hwdev, ++ dma_addr_t dev_addr, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir) ++{ ++ xen_swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir, ++ SYNC_FOR_DEVICE); ++} ++EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_range_for_device); ++ ++/* ++ * Map a set of buffers described by scatterlist in streaming mode for DMA. ++ * This is the scatter-gather version of the above xen_swiotlb_map_page ++ * interface. Here the scatter gather list elements are each tagged with the ++ * appropriate dma address and length. They are obtained via ++ * sg_dma_{address,length}(SG). ++ * ++ * NOTE: An implementation may be able to use a smaller number of ++ * DMA address/length pairs than there are SG table elements. ++ * (for example via virtual mapping capabilities) ++ * The routine returns the number of addr/length pairs actually ++ * used, at most nents. ++ * ++ * Device ownership issues as mentioned above for xen_swiotlb_map_page are the ++ * same here. ++ */ ++int ++xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, ++ int nelems, enum dma_data_direction dir, ++ struct dma_attrs *attrs) ++{ ++ unsigned long start_dma_addr; ++ struct scatterlist *sg; ++ int i; ++ BUG_ON(dir == DMA_NONE); ++ ++ start_dma_addr = xen_virt_to_bus(io_tlb_start); ++ for_each_sg(sgl, sg, nelems, i) { ++ phys_addr_t paddr = sg_phys(sg); ++ dma_addr_t dev_addr = xen_phys_to_bus(paddr); ++ ++ if (swiotlb_force || ++ !dma_capable(hwdev, dev_addr, sg->length) || ++ range_straddles_page_boundary(paddr, sg->length)) { ++ void *map = do_map_single(hwdev, sg_phys(sg), ++ start_dma_addr, ++ sg->length, dir); ++ if (!map) { ++ /* Don't panic here, we expect map_sg users ++ to do proper error handling. */ ++ swiotlb_full(hwdev, sg->length, dir, 0); ++ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, ++ attrs); ++ sgl[0].dma_length = 0; ++ return 0; ++ } ++ sg->dma_address = xen_virt_to_bus(map); ++ } else ++ sg->dma_address = dev_addr; ++ sg->dma_length = sg->length; ++ } ++ return nelems; ++} ++EXPORT_SYMBOL(xen_swiotlb_map_sg_attrs); ++ ++int ++xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, ++ int dir) ++{ ++ return xen_swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL); ++} ++EXPORT_SYMBOL(xen_swiotlb_map_sg); ++ ++/* ++ * Unmap a set of streaming mode DMA translations. Again, cpu read rules ++ * concerning calls here are the same as for xen_swiotlb_unmap_page() above. 
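++ * As with the DMA API generally, pass the nelems that was handed to
++ * xen_swiotlb_map_sg_attrs(), not the count it returned, e.g. (sketch):
++ *
++ *	xen_swiotlb_unmap_sg_attrs(dev, sgl, nents, DMA_FROM_DEVICE, NULL);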
++ */ ++void ++xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, ++ int nelems, enum dma_data_direction dir, ++ struct dma_attrs *attrs) ++{ ++ struct scatterlist *sg; ++ int i; ++ ++ BUG_ON(dir == DMA_NONE); ++ ++ for_each_sg(sgl, sg, nelems, i) ++ unmap_single(hwdev, sg->dma_address, sg->dma_length, dir); ++ ++} ++EXPORT_SYMBOL(xen_swiotlb_unmap_sg_attrs); ++ ++void ++xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, ++ int dir) ++{ ++ return xen_swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL); ++} ++EXPORT_SYMBOL(xen_swiotlb_unmap_sg); ++ ++/* ++ * Make physical memory consistent for a set of streaming mode DMA translations ++ * after a transfer. ++ * ++ * The same as xen_swiotlb_sync_single_* but for a scatter-gather list, ++ * same rules and usage. ++ */ ++static void ++xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, ++ int nelems, int dir, int target) ++{ ++ struct scatterlist *sg; ++ int i; ++ ++ for_each_sg(sgl, sg, nelems, i) ++ xen_swiotlb_sync_single(hwdev, sg->dma_address, ++ sg->dma_length, dir, target); ++} ++ ++void ++xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, ++ int nelems, enum dma_data_direction dir) ++{ ++ xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU); ++} ++EXPORT_SYMBOL(xen_swiotlb_sync_sg_for_cpu); ++ ++void ++xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, ++ int nelems, enum dma_data_direction dir) ++{ ++ xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); ++} ++EXPORT_SYMBOL(xen_swiotlb_sync_sg_for_device); ++ ++int ++xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) ++{ ++ return (dma_addr == xen_virt_to_bus(io_tlb_overflow_buffer)); ++} ++EXPORT_SYMBOL(xen_swiotlb_dma_mapping_error); ++ ++/* ++ * Return whether the given device DMA address mask can be supported ++ * properly. For example, if your device can only drive the low 24-bits ++ * during bus mastering, then you would pass 0x00ffffff as the mask to ++ * this function. ++ */ ++int ++xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) ++{ ++ return xen_virt_to_bus(io_tlb_end - 1) <= mask; ++} ++EXPORT_SYMBOL(xen_swiotlb_dma_supported); +diff --git a/lib/swiotlb.c b/lib/swiotlb.c +index ac25cd2..f6bbcd1 100644 +--- a/lib/swiotlb.c ++++ b/lib/swiotlb.c +@@ -1,118 +1,11 @@ +-/* +- * Dynamic DMA mapping support. +- * +- * This implementation is a fallback for platforms that do not support +- * I/O TLBs (aka DMA address translation hardware). +- * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com> +- * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com> +- * Copyright (C) 2000, 2003 Hewlett-Packard Co +- * David Mosberger-Tang <davidm@hpl.hp.com> +- * +- * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API. +- * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid +- * unnecessary i-cache flushing. +- * 04/07/.. ak Better overflow handling. Assorted fixes. +- * 05/09/10 linville Add support for syncing ranges, support syncing for +- * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup. 
+- * 08/12/11 beckyb Add highmem support +- */ + +-#include <linux/cache.h> + #include <linux/dma-mapping.h> +-#include <linux/mm.h> + #include <linux/module.h> +-#include <linux/spinlock.h> +-#include <linux/string.h> + #include <linux/swiotlb.h> +-#include <linux/pfn.h> +-#include <linux/types.h> +-#include <linux/ctype.h> +-#include <linux/highmem.h> + +-#include <asm/io.h> +-#include <asm/dma.h> + #include <asm/scatterlist.h> +- +-#include <linux/init.h> +-#include <linux/bootmem.h> + #include <linux/iommu-helper.h> + +-#define OFFSET(val,align) ((unsigned long) \ +- ( (val) & ( (align) - 1))) +- +-#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) +- +-/* +- * Minimum IO TLB size to bother booting with. Systems with mainly +- * 64bit capable cards will only lightly use the swiotlb. If we can't +- * allocate a contiguous 1MB, we're probably in trouble anyway. +- */ +-#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) +- +-/* +- * Enumeration for sync targets +- */ +-enum dma_sync_target { +- SYNC_FOR_CPU = 0, +- SYNC_FOR_DEVICE = 1, +-}; +- +-int swiotlb_force; +- +-/* +- * Used to do a quick range check in unmap_single and +- * sync_single_*, to see if the memory was in fact allocated by this +- * API. +- */ +-static char *io_tlb_start, *io_tlb_end; +- +-/* +- * The number of IO TLB blocks (in groups of 64) betweeen io_tlb_start and +- * io_tlb_end. This is command line adjustable via setup_io_tlb_npages. +- */ +-static unsigned long io_tlb_nslabs; +- +-/* +- * When the IOMMU overflows we return a fallback buffer. This sets the size. +- */ +-static unsigned long io_tlb_overflow = 32*1024; +- +-void *io_tlb_overflow_buffer; +- +-/* +- * This is a free list describing the number of free entries available from +- * each index +- */ +-static unsigned int *io_tlb_list; +-static unsigned int io_tlb_index; +- +-/* +- * We need to save away the original address corresponding to a mapped entry +- * for the sync operations. +- */ +-static phys_addr_t *io_tlb_orig_addr; +- +-/* +- * Protect the above data structures in the map and unmap calls +- */ +-static DEFINE_SPINLOCK(io_tlb_lock); +- +-static int __init +-setup_io_tlb_npages(char *str) +-{ +- if (isdigit(*str)) { +- io_tlb_nslabs = simple_strtoul(str, &str, 0); +- /* avoid tail segment of size < IO_TLB_SEGSIZE */ +- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); +- } +- if (*str == ',') +- ++str; +- if (!strcmp(str, "force")) +- swiotlb_force = 1; +- return 1; +-} +-__setup("swiotlb=", setup_io_tlb_npages); +-/* make io_tlb_overflow tunable too? */ + + /* Note that this doesn't work with highmem page */ + static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, +@@ -120,390 +13,6 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, + { + return phys_to_dma(hwdev, virt_to_phys(address)); + } +- +-static void swiotlb_print_info(unsigned long bytes) +-{ +- phys_addr_t pstart, pend; +- +- pstart = virt_to_phys(io_tlb_start); +- pend = virt_to_phys(io_tlb_end); +- +- printk(KERN_INFO "Placing %luMB software IO TLB between %p - %p\n", +- bytes >> 20, io_tlb_start, io_tlb_end); +- printk(KERN_INFO "software IO TLB at phys %#llx - %#llx\n", +- (unsigned long long)pstart, +- (unsigned long long)pend); +-} +- +-/* +- * Statically reserve bounce buffer space and initialize bounce buffer data +- * structures for the software IO TLB used to implement the DMA API. 
+- */ +-void __init +-swiotlb_init_with_default_size(size_t default_size) +-{ +- unsigned long i, bytes; +- +- if (!io_tlb_nslabs) { +- io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); +- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); +- } +- +- bytes = io_tlb_nslabs << IO_TLB_SHIFT; +- +- /* +- * Get IO TLB memory from the low pages +- */ +- io_tlb_start = alloc_bootmem_low_pages(bytes); +- if (!io_tlb_start) +- panic("Cannot allocate SWIOTLB buffer"); +- io_tlb_end = io_tlb_start + bytes; +- +- /* +- * Allocate and initialize the free list array. This array is used +- * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE +- * between io_tlb_start and io_tlb_end. +- */ +- io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int)); +- for (i = 0; i < io_tlb_nslabs; i++) +- io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); +- io_tlb_index = 0; +- io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t)); +- +- /* +- * Get the overflow emergency buffer +- */ +- io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow); +- if (!io_tlb_overflow_buffer) +- panic("Cannot allocate SWIOTLB overflow buffer!\n"); +- +- swiotlb_print_info(bytes); +-} +- +-void __init +-swiotlb_init(void) +-{ +- swiotlb_init_with_default_size(64 * (1<<20)); /* default to 64MB */ +-} +- +-/* +- * Systems with larger DMA zones (those that don't support ISA) can +- * initialize the swiotlb later using the slab allocator if needed. +- * This should be just like above, but with some error catching. +- */ +-int +-swiotlb_late_init_with_default_size(size_t default_size) +-{ +- unsigned long i, bytes, req_nslabs = io_tlb_nslabs; +- unsigned int order; +- +- if (!io_tlb_nslabs) { +- io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); +- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); +- } +- +- /* +- * Get IO TLB memory from the low pages +- */ +- order = get_order(io_tlb_nslabs << IO_TLB_SHIFT); +- io_tlb_nslabs = SLABS_PER_PAGE << order; +- bytes = io_tlb_nslabs << IO_TLB_SHIFT; +- +- while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { +- io_tlb_start = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, +- order); +- if (io_tlb_start) +- break; +- order--; +- } +- +- if (!io_tlb_start) +- goto cleanup1; +- +- if (order != get_order(bytes)) { +- printk(KERN_WARNING "Warning: only able to allocate %ld MB " +- "for software IO TLB\n", (PAGE_SIZE << order) >> 20); +- io_tlb_nslabs = SLABS_PER_PAGE << order; +- bytes = io_tlb_nslabs << IO_TLB_SHIFT; +- } +- io_tlb_end = io_tlb_start + bytes; +- memset(io_tlb_start, 0, bytes); +- +- /* +- * Allocate and initialize the free list array. This array is used +- * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE +- * between io_tlb_start and io_tlb_end. 
+- */ +- io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, +- get_order(io_tlb_nslabs * sizeof(int))); +- if (!io_tlb_list) +- goto cleanup2; +- +- for (i = 0; i < io_tlb_nslabs; i++) +- io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); +- io_tlb_index = 0; +- +- io_tlb_orig_addr = (phys_addr_t *) +- __get_free_pages(GFP_KERNEL, +- get_order(io_tlb_nslabs * +- sizeof(phys_addr_t))); +- if (!io_tlb_orig_addr) +- goto cleanup3; +- +- memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(phys_addr_t)); +- +- /* +- * Get the overflow emergency buffer +- */ +- io_tlb_overflow_buffer = (void *)__get_free_pages(GFP_DMA, +- get_order(io_tlb_overflow)); +- if (!io_tlb_overflow_buffer) +- goto cleanup4; +- +- swiotlb_print_info(bytes); +- +- return 0; +- +-cleanup4: +- free_pages((unsigned long)io_tlb_orig_addr, +- get_order(io_tlb_nslabs * sizeof(phys_addr_t))); +- io_tlb_orig_addr = NULL; +-cleanup3: +- free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * +- sizeof(int))); +- io_tlb_list = NULL; +-cleanup2: +- io_tlb_end = NULL; +- free_pages((unsigned long)io_tlb_start, order); +- io_tlb_start = NULL; +-cleanup1: +- io_tlb_nslabs = req_nslabs; +- return -ENOMEM; +-} +- +-static int is_swiotlb_buffer(phys_addr_t paddr) +-{ +- return paddr >= virt_to_phys(io_tlb_start) && +- paddr < virt_to_phys(io_tlb_end); +-} +- +-/* +- * Bounce: copy the swiotlb buffer back to the original dma location +- */ +-static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, +- enum dma_data_direction dir) +-{ +- unsigned long pfn = PFN_DOWN(phys); +- +- if (PageHighMem(pfn_to_page(pfn))) { +- /* The buffer does not have a mapping. Map it in and copy */ +- unsigned int offset = phys & ~PAGE_MASK; +- char *buffer; +- unsigned int sz = 0; +- unsigned long flags; +- +- while (size) { +- sz = min_t(size_t, PAGE_SIZE - offset, size); +- +- local_irq_save(flags); +- buffer = kmap_atomic(pfn_to_page(pfn), +- KM_BOUNCE_READ); +- if (dir == DMA_TO_DEVICE) +- memcpy(dma_addr, buffer + offset, sz); +- else +- memcpy(buffer + offset, dma_addr, sz); +- kunmap_atomic(buffer, KM_BOUNCE_READ); +- local_irq_restore(flags); +- +- size -= sz; +- pfn++; +- dma_addr += sz; +- offset = 0; +- } +- } else { +- if (dir == DMA_TO_DEVICE) +- memcpy(dma_addr, phys_to_virt(phys), size); +- else +- memcpy(phys_to_virt(phys), dma_addr, size); +- } +-} +- +-/* +- * Allocates bounce buffer and returns its kernel virtual address. +- */ +-static void * +-map_single(struct device *hwdev, phys_addr_t phys, size_t size, int dir) +-{ +- unsigned long flags; +- char *dma_addr; +- unsigned int nslots, stride, index, wrap; +- int i; +- unsigned long start_dma_addr; +- unsigned long mask; +- unsigned long offset_slots; +- unsigned long max_slots; +- +- mask = dma_get_seg_boundary(hwdev); +- start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start) & mask; +- +- offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; +- +- /* +- * Carefully handle integer overflow which can occur when mask == ~0UL. +- */ +- max_slots = mask + 1 +- ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT +- : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); +- +- /* +- * For mappings greater than a page, we limit the stride (and +- * hence alignment) to a page size. 
+- */ +- nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; +- if (size > PAGE_SIZE) +- stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); +- else +- stride = 1; +- +- BUG_ON(!nslots); +- +- /* +- * Find suitable number of IO TLB entries size that will fit this +- * request and allocate a buffer from that IO TLB pool. +- */ +- spin_lock_irqsave(&io_tlb_lock, flags); +- index = ALIGN(io_tlb_index, stride); +- if (index >= io_tlb_nslabs) +- index = 0; +- wrap = index; +- +- do { +- while (iommu_is_span_boundary(index, nslots, offset_slots, +- max_slots)) { +- index += stride; +- if (index >= io_tlb_nslabs) +- index = 0; +- if (index == wrap) +- goto not_found; +- } +- +- /* +- * If we find a slot that indicates we have 'nslots' number of +- * contiguous buffers, we allocate the buffers from that slot +- * and mark the entries as '0' indicating unavailable. +- */ +- if (io_tlb_list[index] >= nslots) { +- int count = 0; +- +- for (i = index; i < (int) (index + nslots); i++) +- io_tlb_list[i] = 0; +- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) +- io_tlb_list[i] = ++count; +- dma_addr = io_tlb_start + (index << IO_TLB_SHIFT); +- +- /* +- * Update the indices to avoid searching in the next +- * round. +- */ +- io_tlb_index = ((index + nslots) < io_tlb_nslabs +- ? (index + nslots) : 0); +- +- goto found; +- } +- index += stride; +- if (index >= io_tlb_nslabs) +- index = 0; +- } while (index != wrap); +- +-not_found: +- spin_unlock_irqrestore(&io_tlb_lock, flags); +- return NULL; +-found: +- spin_unlock_irqrestore(&io_tlb_lock, flags); +- +- /* +- * Save away the mapping from the original address to the DMA address. +- * This is needed when we sync the memory. Then we sync the buffer if +- * needed. +- */ +- for (i = 0; i < nslots; i++) +- io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT); +- if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) +- swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE); +- +- return dma_addr; +-} +- +-/* +- * dma_addr is the kernel virtual address of the bounce buffer to unmap. +- */ +-static void +-do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) +-{ +- unsigned long flags; +- int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; +- int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; +- phys_addr_t phys = io_tlb_orig_addr[index]; +- +- /* +- * First, sync the memory before unmapping the entry +- */ +- if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) +- swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE); +- +- /* +- * Return the buffer to the free list by setting the corresponding +- * entries to indicate the number of contigous entries available. +- * While returning the entries to the free list, we merge the entries +- * with slots below and above the pool being returned. +- */ +- spin_lock_irqsave(&io_tlb_lock, flags); +- { +- count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? 
+- io_tlb_list[index + nslots] : 0); +- /* +- * Step 1: return the slots to the free list, merging the +- * slots with superceeding slots +- */ +- for (i = index + nslots - 1; i >= index; i--) +- io_tlb_list[i] = ++count; +- /* +- * Step 2: merge the returned slots with the preceding slots, +- * if available (non zero) +- */ +- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) +- io_tlb_list[i] = ++count; +- } +- spin_unlock_irqrestore(&io_tlb_lock, flags); +-} +- +-static void +-sync_single(struct device *hwdev, char *dma_addr, size_t size, +- int dir, int target) +-{ +- int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; +- phys_addr_t phys = io_tlb_orig_addr[index]; +- +- phys += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1)); +- +- switch (target) { +- case SYNC_FOR_CPU: +- if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) +- swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE); +- else +- BUG_ON(dir != DMA_TO_DEVICE); +- break; +- case SYNC_FOR_DEVICE: +- if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) +- swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE); +- else +- BUG_ON(dir != DMA_FROM_DEVICE); +- break; +- default: +- BUG(); +- } +-} +- + void * + swiotlb_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags) +@@ -512,12 +21,13 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, + void *ret; + int order = get_order(size); + u64 dma_mask = DMA_BIT_MASK(32); ++ unsigned long start_dma_addr; + + if (hwdev && hwdev->coherent_dma_mask) + dma_mask = hwdev->coherent_dma_mask; + + ret = (void *)__get_free_pages(flags, order); +- if (ret && swiotlb_virt_to_bus(hwdev, ret) + size > dma_mask) { ++ if (ret && swiotlb_virt_to_bus(hwdev, ret) + size - 1 > dma_mask) { + /* + * The allocated memory isn't reachable by the device. + */ +@@ -527,10 +37,12 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, + if (!ret) { + /* + * We are either out of memory or the device can't DMA +- * to GFP_DMA memory; fall back on map_single(), which ++ * to GFP_DMA memory; fall back on do_map_single(), which + * will grab memory from the lowest available address range. 
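++	 * (The IO-TLB pool itself is set aside in low memory at boot, so a
++	 * bounce buffer is reachable even with a 32-bit coherent mask.)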
+ */ +- ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE); ++ start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start); ++ ret = do_map_single(hwdev, 0, start_dma_addr, size, ++ DMA_FROM_DEVICE); + if (!ret) + return NULL; + } +@@ -539,12 +51,13 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, + dev_addr = swiotlb_virt_to_bus(hwdev, ret); + + /* Confirm address can be DMA'd by device */ +- if (dev_addr + size > dma_mask) { +- printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n", ++ if (dev_addr + size - 1 > dma_mask) { ++ dev_err(hwdev, "DMA: hwdev DMA mask = 0x%016Lx, " \ ++ "dev_addr = 0x%016Lx\n", + (unsigned long long)dma_mask, + (unsigned long long)dev_addr); + +- /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ ++ /* DMA_TO_DEVICE to avoid memcpy in do_unmap_single */ + do_unmap_single(hwdev, ret, size, DMA_TO_DEVICE); + return NULL; + } +@@ -563,35 +76,11 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, + if (!is_swiotlb_buffer(paddr)) + free_pages((unsigned long)vaddr, get_order(size)); + else +- /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ ++ /* DMA_TO_DEVICE to avoid memcpy in do_unmap_single */ + do_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE); + } + EXPORT_SYMBOL(swiotlb_free_coherent); + +-static void +-swiotlb_full(struct device *dev, size_t size, int dir, int do_panic) +-{ +- /* +- * Ran out of IOMMU space for this operation. This is very bad. +- * Unfortunately the drivers cannot handle this operation properly. +- * unless they check for dma_mapping_error (most don't) +- * When the mapping is small enough return a static buffer to limit +- * the damage, or panic when the transfer is too big. +- */ +- printk(KERN_ERR "DMA: Out of SW-IOMMU space for %zu bytes at " +- "device %s\n", size, dev ? dev_name(dev) : "?"); +- +- if (size <= io_tlb_overflow || !do_panic) +- return; +- +- if (dir == DMA_BIDIRECTIONAL) +- panic("DMA: Random memory could be DMA accessed\n"); +- if (dir == DMA_FROM_DEVICE) +- panic("DMA: Random memory could be DMA written\n"); +- if (dir == DMA_TO_DEVICE) +- panic("DMA: Random memory could be DMA read\n"); +-} +- + /* + * Map a single buffer of the indicated size for DMA in streaming mode. The + * physical address to use is returned. +@@ -604,6 +93,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, + enum dma_data_direction dir, + struct dma_attrs *attrs) + { ++ unsigned long start_dma_addr; + phys_addr_t phys = page_to_phys(page) + offset; + dma_addr_t dev_addr = phys_to_dma(dev, phys); + void *map; +@@ -620,7 +110,8 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, + /* + * Oh well, have to allocate and map a bounce buffer. 
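++	 * do_map_single() carves contiguous IO-TLB slots out of the bounce
++	 * pool and, for transfers toward the device, copies the data in first.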
+ */ +- map = map_single(dev, phys, size, dir); ++ start_dma_addr = swiotlb_virt_to_bus(dev, io_tlb_start); ++ map = do_map_single(dev, phys, start_dma_addr, size, dir); + if (!map) { + swiotlb_full(dev, size, dir, 1); + map = io_tlb_overflow_buffer; +@@ -632,7 +123,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, + * Ensure that the address returned is DMA'ble + */ + if (!dma_capable(dev, dev_addr, size)) +- panic("map_single: bounce buffer is not DMA'ble"); ++ panic("DMA: swiotlb_map_single: bounce buffer is not DMA'ble"); + + return dev_addr; + } +@@ -697,7 +188,7 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, + BUG_ON(dir == DMA_NONE); + + if (is_swiotlb_buffer(paddr)) { +- sync_single(hwdev, phys_to_virt(paddr), size, dir, target); ++ do_sync_single(hwdev, phys_to_virt(paddr), size, dir, target); + return; + } + +@@ -774,19 +265,22 @@ int + swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, + enum dma_data_direction dir, struct dma_attrs *attrs) + { ++ unsigned long start_dma_addr; + struct scatterlist *sg; + int i; + + BUG_ON(dir == DMA_NONE); + ++ start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start); + for_each_sg(sgl, sg, nelems, i) { + phys_addr_t paddr = sg_phys(sg); + dma_addr_t dev_addr = phys_to_dma(hwdev, paddr); + + if (swiotlb_force || + !dma_capable(hwdev, dev_addr, sg->length)) { +- void *map = map_single(hwdev, sg_phys(sg), +- sg->length, dir); ++ void *map = do_map_single(hwdev, sg_phys(sg), ++ start_dma_addr, ++ sg->length, dir); + if (!map) { + /* Don't panic here, we expect map_sg users + to do proper error handling. */ +@@ -819,7 +313,8 @@ EXPORT_SYMBOL(swiotlb_map_sg); + */ + void + swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, +- int nelems, enum dma_data_direction dir, struct dma_attrs *attrs) ++ int nelems, enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + struct scatterlist *sg; + int i; +diff --git a/mm/bootmem.c b/mm/bootmem.c +index 555d5d2..d1dc23c 100644 +--- a/mm/bootmem.c ++++ b/mm/bootmem.c +@@ -143,6 +143,30 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) + return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); + } + ++/* ++ * free_bootmem_late - free bootmem pages directly to page allocator ++ * @addr: starting address of the range ++ * @size: size of the range in bytes ++ * ++ * This is only useful when the bootmem allocator has already been torn ++ * down, but we are still initializing the system. Pages are given directly ++ * to the page allocator, no bootmem metadata is updated because it is gone. ++ */ ++void __init free_bootmem_late(unsigned long addr, unsigned long size) ++{ ++ unsigned long cursor, end; ++ ++ kmemleak_free_part(__va(addr), size); ++ ++ cursor = PFN_UP(addr); ++ end = PFN_DOWN(addr + size); ++ ++ for (; cursor < end; cursor++) { ++ __free_pages_bootmem(pfn_to_page(cursor), 0); ++ totalram_pages++; ++ } ++} ++ + static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) + { + int aligned; +diff --git a/mm/memory.c b/mm/memory.c +index 53c1da0..c8741df 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -553,6 +553,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + if (is_zero_pfn(pfn)) + return NULL; + check_pfn: ++ ++#if defined(CONFIG_XEN) && defined(CONFIG_X86) ++ /* XEN: Covers user-space grant mappings (even of local pages). 
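++	 * A grant-mapped page must not be treated as an ordinary struct page,
++	 * so tell the caller there is no normal page to operate on.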
*/ ++ if (unlikely(vma->vm_flags & VM_FOREIGN)) ++ return NULL; ++#endif ++ + if (unlikely(pfn > highest_memmap_pfn)) { + print_bad_pte(vma, addr, pte, NULL); + return NULL; +@@ -839,8 +846,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, + page->index > details->last_index)) + continue; + } +- ptent = ptep_get_and_clear_full(mm, addr, pte, +- tlb->fullmm); ++ if (unlikely(vma->vm_ops && vma->vm_ops->zap_pte)) ++ ptent = vma->vm_ops->zap_pte(vma, addr, pte, ++ tlb->fullmm); ++ else ++ ptent = ptep_get_and_clear_full(mm, addr, pte, ++ tlb->fullmm); + tlb_remove_tlb_entry(tlb, pte, addr); + if (unlikely(!page)) + continue; +@@ -1100,6 +1111,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, + tlb_finish_mmu(tlb, address, end); + return end; + } ++EXPORT_SYMBOL_GPL(zap_page_range); + + /** + * zap_vma_ptes - remove ptes mapping the vma +@@ -1306,6 +1318,29 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + continue; + } + ++#ifdef CONFIG_XEN ++ if (vma && (vma->vm_flags & VM_FOREIGN)) { ++ struct vm_foreign_map *foreign_map = ++ vma->vm_private_data; ++ struct page **map = foreign_map->map; ++ int offset = (start - vma->vm_start) >> PAGE_SHIFT; ++ if (map[offset] != NULL) { ++ if (pages) { ++ struct page *page = map[offset]; ++ ++ pages[i] = page; ++ get_page(page); ++ } ++ if (vmas) ++ vmas[i] = vma; ++ i++; ++ start += PAGE_SIZE; ++ nr_pages--; ++ continue; ++ } ++ } ++#endif ++ + if (!vma || + (vma->vm_flags & (VM_IO | VM_PFNMAP)) || + !(vm_flags & vma->vm_flags)) +@@ -1781,6 +1816,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + ++#ifdef CONFIG_XEN ++ vma->vm_mm->context.has_foreign_mappings = 1; ++#endif ++ + err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); + if (err) { + /* +@@ -1896,11 +1935,10 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, + { + pgd_t *pgd; + unsigned long next; +- unsigned long start = addr, end = addr + size; ++ unsigned long end = addr + size; + int err; + + BUG_ON(addr >= end); +- mmu_notifier_invalidate_range_start(mm, start, end); + pgd = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, end); +@@ -1908,7 +1946,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, + if (err) + break; + } while (pgd++, addr = next, addr != end); +- mmu_notifier_invalidate_range_end(mm, start, end); ++ + return err; + } + EXPORT_SYMBOL_GPL(apply_to_page_range); +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 902e5fc..101715c 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -594,6 +594,13 @@ static void __free_pages_ok(struct page *page, unsigned int order) + if (bad) + return; + ++#ifdef CONFIG_XEN ++ if (PageForeign(page)) { ++ PageForeignDestructor(page, order); ++ return; ++ } ++#endif ++ + if (!PageHighMem(page)) { + debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); + debug_check_no_obj_freed(page_address(page), +@@ -1088,6 +1095,13 @@ static void free_hot_cold_page(struct page *page, int cold) + + kmemcheck_free_shadow(page, 0); + ++#ifdef CONFIG_XEN ++ if (PageForeign(page)) { ++ PageForeignDestructor(page, 0); ++ return; ++ } ++#endif ++ + if (PageAnon(page)) + page->mapping = NULL; + if (free_pages_check(page)) +diff --git a/mm/vmalloc.c b/mm/vmalloc.c +index 680dcbb..4f701c2 100644 +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -31,6 +31,7 @@ + #include <asm/tlbflush.h> + #include <asm/shmparam.h> + ++bool vmap_lazy_unmap __read_mostly = 
true; + + /*** Page table manipulation functions ***/ + +@@ -502,6 +503,9 @@ static unsigned long lazy_max_pages(void) + { + unsigned int log; + ++ if (!vmap_lazy_unmap) ++ return 0; ++ + log = fls(num_online_cpus()); + + return log * (32UL * 1024 * 1024 / PAGE_SIZE); +@@ -570,8 +574,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, + } + rcu_read_unlock(); + +- if (nr) ++ if (nr) { + atomic_sub(nr, &vmap_lazy_nr); ++ } + + if (nr || force_flush) + flush_tlb_kernel_range(*start, *end); +diff --git a/net/core/ethtool.c b/net/core/ethtool.c +index abbe8fa..e661dd7 100644 +--- a/net/core/ethtool.c ++++ b/net/core/ethtool.c +@@ -179,14 +179,24 @@ static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) + struct ethtool_drvinfo info; + const struct ethtool_ops *ops = dev->ethtool_ops; + +- if (!ops->get_drvinfo) +- return -EOPNOTSUPP; +- + memset(&info, 0, sizeof(info)); + info.cmd = ETHTOOL_GDRVINFO; +- ops->get_drvinfo(dev, &info); ++ if (ops && ops->get_drvinfo) { ++ ops->get_drvinfo(dev, &info); ++ } else if (dev->dev.parent && dev->dev.parent->driver) { ++ strlcpy(info.bus_info, dev_name(dev->dev.parent), ++ sizeof(info.bus_info)); ++ strlcpy(info.driver, dev->dev.parent->driver->name, ++ sizeof(info.driver)); ++ } else { ++ return -EOPNOTSUPP; ++ } + +- if (ops->get_sset_count) { ++ /* ++ * this method of obtaining string set info is deprecated; ++ * Use ETHTOOL_GSSET_INFO instead. ++ */ ++ if (ops && ops->get_sset_count) { + int rc; + + rc = ops->get_sset_count(dev, ETH_SS_TEST); +@@ -201,14 +211,14 @@ static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) + } else { + /* code path for obsolete hooks */ + +- if (ops->self_test_count) ++ if (ops && ops->self_test_count) + info.testinfo_len = ops->self_test_count(dev); +- if (ops->get_stats_count) ++ if (ops && ops->get_stats_count) + info.n_stats = ops->get_stats_count(dev); + } +- if (ops->get_regs_len) ++ if (ops && ops->get_regs_len) + info.regdump_len = ops->get_regs_len(dev); +- if (ops->get_eeprom_len) ++ if (ops && ops->get_eeprom_len) + info.eedump_len = ops->get_eeprom_len(dev); + + if (copy_to_user(useraddr, &info, sizeof(info))) +@@ -945,12 +955,19 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) + if (!dev || !netif_device_present(dev)) + return -ENODEV; + +- if (!dev->ethtool_ops) +- return -EOPNOTSUPP; +- +- if (copy_from_user(ðcmd, useraddr, sizeof (ethcmd))) ++ if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) + return -EFAULT; + ++ if (!dev->ethtool_ops) { ++ /* ETHTOOL_GDRVINFO does not require any driver support. ++ * It is also unprivileged and does not change anything, ++ * so we can take a shortcut to it. 
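++		 * (ethtool_get_drvinfo() itself falls back to the parent
++		 * device's driver name and bus id when ops->get_drvinfo is
++		 * missing.)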
*/ ++ if (ethcmd == ETHTOOL_GDRVINFO) ++ return ethtool_get_drvinfo(dev, useraddr); ++ else ++ return -EOPNOTSUPP; ++ } ++ + /* Allow some commands to be done by anyone */ + switch(ethcmd) { + case ETHTOOL_GDRVINFO: +diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c +index d4fd895..4ab8c97 100644 +--- a/net/core/rtnetlink.c ++++ b/net/core/rtnetlink.c +@@ -35,6 +35,7 @@ + #include <linux/security.h> + #include <linux/mutex.h> + #include <linux/if_addr.h> ++#include <linux/pci.h> + + #include <asm/uaccess.h> + #include <asm/system.h> +@@ -582,6 +583,22 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a, + a->tx_compressed = b->tx_compressed; + }; + ++/* All VF info */ ++static inline int rtnl_vfinfo_size(const struct net_device *dev) ++{ ++ if (dev->dev.parent && dev_is_pci(dev->dev.parent)) { ++ ++ int num_vfs = dev_num_vf(dev->dev.parent); ++ size_t size = nlmsg_total_size(sizeof(struct nlattr)); ++ size += nlmsg_total_size(num_vfs * sizeof(struct nlattr)); ++ size += num_vfs * (sizeof(struct ifla_vf_mac) + ++ sizeof(struct ifla_vf_vlan) + ++ sizeof(struct ifla_vf_tx_rate)); ++ return size; ++ } else ++ return 0; ++} ++ + static inline size_t if_nlmsg_size(const struct net_device *dev) + { + return NLMSG_ALIGN(sizeof(struct ifinfomsg)) +@@ -599,6 +616,8 @@ static inline size_t if_nlmsg_size(const struct net_device *dev) + + nla_total_size(4) /* IFLA_MASTER */ + + nla_total_size(1) /* IFLA_OPERSTATE */ + + nla_total_size(1) /* IFLA_LINKMODE */ ++ + nla_total_size(4) /* IFLA_NUM_VF */ ++ + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */ + + rtnl_link_get_size(dev); /* IFLA_LINKINFO */ + } + +@@ -667,6 +686,40 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, + stats = dev_get_stats(dev); + copy_rtnl_link_stats(nla_data(attr), stats); + ++ if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) { ++ int i; ++ ++ struct nlattr *vfinfo, *vf; ++ int num_vfs = dev_num_vf(dev->dev.parent); ++ ++ NLA_PUT_U32(skb, IFLA_NUM_VF, num_vfs); ++ vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); ++ if (!vfinfo) ++ goto nla_put_failure; ++ for (i = 0; i < num_vfs; i++) { ++ struct ifla_vf_info ivi; ++ struct ifla_vf_mac vf_mac; ++ struct ifla_vf_vlan vf_vlan; ++ struct ifla_vf_tx_rate vf_tx_rate; ++ if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi)) ++ break; ++ vf_mac.vf = vf_vlan.vf = vf_tx_rate.vf = ivi.vf; ++ memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); ++ vf_vlan.vlan = ivi.vlan; ++ vf_vlan.qos = ivi.qos; ++ vf_tx_rate.rate = ivi.tx_rate; ++ vf = nla_nest_start(skb, IFLA_VF_INFO); ++ if (!vf) { ++ nla_nest_cancel(skb, vfinfo); ++ goto nla_put_failure; ++ } ++ NLA_PUT(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac); ++ NLA_PUT(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan); ++ NLA_PUT(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate); ++ nla_nest_end(skb, vf); ++ } ++ nla_nest_end(skb, vfinfo); ++ } + if (dev->rtnl_link_ops) { + if (rtnl_link_fill(skb, dev) < 0) + goto nla_put_failure; +@@ -716,6 +769,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { + [IFLA_LINKINFO] = { .type = NLA_NESTED }, + [IFLA_NET_NS_PID] = { .type = NLA_U32 }, + [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, ++ [IFLA_VFINFO_LIST] = {. 
type = NLA_NESTED }, + }; + + static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { +@@ -723,6 +777,33 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { + [IFLA_INFO_DATA] = { .type = NLA_NESTED }, + }; + ++static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = { ++ [IFLA_VF_INFO] = { .type = NLA_NESTED }, ++}; ++ ++static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { ++ [IFLA_VF_MAC] = { .type = NLA_BINARY, ++ .len = sizeof(struct ifla_vf_mac) }, ++ [IFLA_VF_VLAN] = { .type = NLA_BINARY, ++ .len = sizeof(struct ifla_vf_vlan) }, ++ [IFLA_VF_TX_RATE] = { .type = NLA_BINARY, ++ .len = sizeof(struct ifla_vf_tx_rate) }, ++}; ++ ++struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) ++{ ++ struct net *net; ++ /* Examine the link attributes and figure out which ++ * network namespace we are talking about. ++ */ ++ if (tb[IFLA_NET_NS_PID]) ++ net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); ++ else ++ net = get_net(src_net); ++ return net; ++} ++EXPORT_SYMBOL(rtnl_link_get_net); ++ + static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) + { + if (dev) { +@@ -738,6 +819,52 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) + return 0; + } + ++static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) ++{ ++ int rem, err = -EINVAL; ++ struct nlattr *vf; ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ nla_for_each_nested(vf, attr, rem) { ++ switch (nla_type(vf)) { ++ case IFLA_VF_MAC: { ++ struct ifla_vf_mac *ivm; ++ ivm = nla_data(vf); ++ err = -EOPNOTSUPP; ++ if (ops->ndo_set_vf_mac) ++ err = ops->ndo_set_vf_mac(dev, ivm->vf, ++ ivm->mac); ++ break; ++ } ++ case IFLA_VF_VLAN: { ++ struct ifla_vf_vlan *ivv; ++ ivv = nla_data(vf); ++ err = -EOPNOTSUPP; ++ if (ops->ndo_set_vf_vlan) ++ err = ops->ndo_set_vf_vlan(dev, ivv->vf, ++ ivv->vlan, ++ ivv->qos); ++ break; ++ } ++ case IFLA_VF_TX_RATE: { ++ struct ifla_vf_tx_rate *ivt; ++ ivt = nla_data(vf); ++ err = -EOPNOTSUPP; ++ if (ops->ndo_set_vf_tx_rate) ++ err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, ++ ivt->rate); ++ break; ++ } ++ default: ++ err = -EINVAL; ++ break; ++ } ++ if (err) ++ break; ++ } ++ return err; ++} ++ + static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, + struct nlattr **tb, char *ifname, int modified) + { +@@ -875,6 +1002,18 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, + write_unlock_bh(&dev_base_lock); + } + ++ if (tb[IFLA_VFINFO_LIST]) { ++ struct nlattr *attr; ++ int rem; ++ nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) { ++ if (nla_type(attr) != IFLA_VF_INFO) ++ goto errout; ++ err = do_setvfinfo(dev, attr); ++ if (err < 0) ++ goto errout; ++ modified = 1; ++ } ++ } + err = 0; + + errout: +diff --git a/net/sched/Kconfig b/net/sched/Kconfig +index 929218a..956cd0a 100644 +--- a/net/sched/Kconfig ++++ b/net/sched/Kconfig +@@ -215,6 +215,26 @@ config NET_SCH_INGRESS + To compile this code as a module, choose M here: the + module will be called sch_ingress. + ++config NET_SCH_PLUG ++ tristate "Plug network traffic until release" ++ ---help--- ++ Say Y here if you are using this kernel for Xen dom0 and ++ want to protect Xen guests with Remus. ++ ++ This queueing discipline is controlled by netlink. When it receives an ++ enqueue command it inserts a plug into the outbound queue that causes ++ following packets to enqueue until a dequeue command arrives over ++ netlink, releasing packets up to the plug for delivery. 
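++
++	  In a Remus deployment the queue is plugged when a checkpoint starts
++	  and unplugged once the backup acknowledges it, so buffered packets
++	  only leave the primary for states the backup can reproduce.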
++ ++ Its intention is to support speculative execution by allowing generated ++ network traffic to be rolled back. It is used to provide network ++ protection for the Remus high availability project. ++ ++ If unsure, say N. ++ ++ To compile this code as a module, choose M here: the ++ module will be called sch_plug. ++ + comment "Classification" + + config NET_CLS +diff --git a/net/sched/Makefile b/net/sched/Makefile +index f14e71b..61ef5f7 100644 +--- a/net/sched/Makefile ++++ b/net/sched/Makefile +@@ -31,6 +31,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o + obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o + obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o + obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o ++obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o + obj-$(CONFIG_NET_CLS_U32) += cls_u32.o + obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o + obj-$(CONFIG_NET_CLS_FW) += cls_fw.o +diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c +new file mode 100644 +index 0000000..86c3ee1 +--- /dev/null ++++ b/net/sched/sch_plug.c +@@ -0,0 +1,156 @@ ++/* ++ * sch_plug.c Queue traffic until an explicit release command ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ * ++ * The operation of the buffer is as follows: ++ * When a checkpoint begins, a plug is inserted into the ++ * network queue by a netlink request (it operates by storing ++ * a pointer to the next packet which arrives and blocking dequeue ++ * when that packet is at the head of the queue). ++ * When a checkpoint completes (the backup acknowledges receipt), ++ * currently-queued packets are released. ++ * So it supports two operations, plug and unplug. ++ */ ++ ++#include <linux/module.h> ++#include <linux/types.h> ++#include <linux/kernel.h> ++#include <linux/errno.h> ++#include <linux/netdevice.h> ++#include <linux/skbuff.h> ++#include <net/pkt_sched.h> ++ ++#define FIFO_BUF (10*1024*1024) ++ ++#define TCQ_PLUG 0 ++#define TCQ_UNPLUG 1 ++ ++struct plug_sched_data { ++ /* ++ * This packet is the first packet which should not be ++ * delivered. If it is NULL, plug_enqueue will set it to the ++ * next packet it sees. ++ */ ++ struct sk_buff *stop; ++}; ++ ++struct tc_plug_qopt { ++ /* 0: reset stop packet pointer ++ * 1: dequeue to stop pointer */ ++ int action; ++}; ++ ++static int skb_remove_foreign_references(struct sk_buff *skb) ++{ ++ return !skb_linearize(skb); ++} ++ ++static int plug_enqueue(struct sk_buff *skb, struct Qdisc* sch) ++{ ++ struct plug_sched_data *q = qdisc_priv(sch); ++ ++ if (likely(sch->qstats.backlog + skb->len <= FIFO_BUF)) { ++ if (!q->stop) ++ q->stop = skb; ++ ++ if (!skb_remove_foreign_references(skb)) { ++ printk(KERN_DEBUG "error removing foreign ref\n"); ++ return qdisc_reshape_fail(skb, sch); ++ } ++ ++ return qdisc_enqueue_tail(skb, sch); ++ } ++ printk(KERN_WARNING "queue reported full: %d,%d\n", ++ sch->qstats.backlog, skb->len); ++ ++ return qdisc_reshape_fail(skb, sch); ++} ++ ++/* dequeue doesn't actually dequeue until the release command is ++ * received. 
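++ * Until then it only peeks: once the head of the queue reaches the
++ * saved stop pointer, the qdisc throttles itself instead.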
*/ ++static struct sk_buff *plug_dequeue(struct Qdisc* sch) ++{ ++ struct plug_sched_data *q = qdisc_priv(sch); ++ struct sk_buff *peek; ++ ++ if (sch->flags & TCQ_F_THROTTLED) ++ return NULL; ++ ++ peek = (struct sk_buff *)((sch->q).next); ++ ++ /* this pointer comparison may be shady */ ++ if (peek == q->stop) { ++ /* ++ * This is the tail of the last round. Release it and ++ * block the queue ++ */ ++ sch->flags |= TCQ_F_THROTTLED; ++ return NULL; ++ } ++ ++ return qdisc_dequeue_head(sch); ++} ++ ++static int plug_init(struct Qdisc *sch, struct nlattr *opt) ++{ ++ sch->flags |= TCQ_F_THROTTLED; ++ ++ return 0; ++} ++ ++/* ++ * receives two messages: ++ * 0: checkpoint queue (set stop to next packet) ++ * 1: dequeue until stop ++ */ ++static int plug_change(struct Qdisc *sch, struct nlattr *opt) ++{ ++ struct plug_sched_data *q = qdisc_priv(sch); ++ struct tc_plug_qopt *msg; ++ ++ if (!opt || nla_len(opt) < sizeof(*msg)) ++ return -EINVAL; ++ ++ msg = nla_data(opt); ++ ++ if (msg->action == TCQ_PLUG) { ++ /* reset stop */ ++ q->stop = NULL; ++ } else if (msg->action == TCQ_UNPLUG) { ++ /* dequeue */ ++ sch->flags &= ~TCQ_F_THROTTLED; ++ netif_schedule_queue(sch->dev_queue); ++ } else { ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++struct Qdisc_ops plug_qdisc_ops = { ++ .id = "plug", ++ .priv_size = sizeof(struct plug_sched_data), ++ .enqueue = plug_enqueue, ++ .dequeue = plug_dequeue, ++ .peek = qdisc_peek_head, ++ .init = plug_init, ++ .change = plug_change, ++ .owner = THIS_MODULE, ++}; ++ ++static int __init plug_module_init(void) ++{ ++ return register_qdisc(&plug_qdisc_ops); ++} ++ ++static void __exit plug_module_exit(void) ++{ ++ unregister_qdisc(&plug_qdisc_ops); ++} ++module_init(plug_module_init) ++module_exit(plug_module_exit) ++MODULE_LICENSE("GPL"); |
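
The plug/unplug protocol implemented above is small enough to model without the skb and netlink machinery: a plug clears the stop pointer so that the next packet to arrive becomes the barrier, dequeue releases packets until it reaches the barrier and then throttles itself, and unplug lifts the throttle. Below is a toy user-space sketch of that state machine; the integer queue, packet values and helper names (enqueue, dequeue, plug, unplug) are inventions of this illustration, not part of the patch:

/* Toy model (not kernel code) of sch_plug's epoch logic: "stop" marks the
 * first packet of the epoch still being checkpointed, and dequeue() only
 * hands out packets from epochs that have already been released.
 */
#include <stdio.h>

#define QLEN 64

static int q[QLEN];
static int head, tail;       /* consumer / producer cursors */
static int stop = -1;        /* first packet NOT to deliver (q->stop) */
static int throttled = 1;    /* matches plug_init(): starts plugged */

static void enqueue(int pkt) /* plug_enqueue(), minus the byte budget */
{
    if (stop < 0)
        stop = tail;         /* first arrival after a plug is the barrier */
    q[tail++ % QLEN] = pkt;
}

static int dequeue(void)     /* plug_dequeue(); -1 plays the role of NULL */
{
    if (throttled || head == tail)
        return -1;
    if (head == stop) {      /* reached the barrier: block the queue */
        throttled = 1;
        return -1;
    }
    return q[head++ % QLEN];
}

static void plug(void)   { stop = -1; }     /* TCQ_PLUG */
static void unplug(void) { throttled = 0; } /* TCQ_UNPLUG */

int main(void)
{
    enqueue(1);
    enqueue(2);              /* epoch A is held: the qdisc starts plugged */
    plug();                  /* checkpoint: epoch B begins with the next packet */
    enqueue(3);
    unplug();                /* the backup acknowledged epoch A */
    for (int p; (p = dequeue()) != -1; )
        printf("delivered %d\n", p);  /* prints 1 and 2; 3 stays queued */
    return 0;
}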
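For the lib/swiotlb.c hunks further up, the free-list bookkeeping that do_map_single() and do_unmap_single() rely on can likewise be sketched in isolation: list[i] counts the contiguous free slots starting at slot i, capped at the segment size so no bounce buffer straddles an IO_TLB_SEGSIZE boundary. The model below keeps only the run search and the two merge steps; the alignment stride, boundary mask and wrap-around of the real allocator are omitted, and NSLABS, SEGSIZE, alloc_slots() and free_slots() are arbitrary stand-ins:

/* Standalone model (not kernel code) of the IO-TLB free-list scheme:
 * list[i] holds the number of contiguous free slots starting at i,
 * capped at SEGSIZE so no allocation crosses a segment boundary.
 */
#include <stdio.h>

#define NSLABS  16  /* stands in for io_tlb_nslabs */
#define SEGSIZE  8  /* stands in for IO_TLB_SEGSIZE */
#define OFFSET(i) ((i) & (SEGSIZE - 1))

static unsigned int list[NSLABS];

static void init(void)
{
    for (int i = 0; i < NSLABS; i++)
        list[i] = SEGSIZE - OFFSET(i);
}

/* The search half of do_map_single(), without stride and wrap-around. */
static int alloc_slots(int nslots)
{
    for (int index = 0; index + nslots <= NSLABS; index++) {
        if (list[index] >= nslots) {
            int count = 0;

            for (int i = index; i < index + nslots; i++)
                list[i] = 0;              /* mark busy */
            /* shorten the free run leading into the allocation */
            for (int i = index - 1;
                 i >= 0 && OFFSET(i) != SEGSIZE - 1 && list[i]; i--)
                list[i] = ++count;
            return index;
        }
    }
    return -1;                            /* pool exhausted */
}

/* The two merge steps of do_unmap_single(). */
static void free_slots(int index, int nslots)
{
    int seg_end = (index / SEGSIZE + 1) * SEGSIZE;
    int count = index + nslots < seg_end ? list[index + nslots] : 0;

    /* step 1: free our slots, absorbing the run that follows */
    for (int i = index + nslots - 1; i >= index; i--)
        list[i] = ++count;
    /* step 2: extend the free run that precedes the freed range */
    for (int i = index - 1;
         i >= 0 && OFFSET(i) != SEGSIZE - 1 && list[i]; i--)
        list[i] = ++count;
}

int main(void)
{
    init();
    int a = alloc_slots(3), b = alloc_slots(2);
    printf("a=%d b=%d\n", a, b);          /* a=0 b=3 */
    free_slots(a, 3);
    free_slots(b, 2);
    for (int i = 0; i < NSLABS; i++)
        printf("%u ", list[i]);           /* back to 8 7 6 5 4 3 2 1 ... */
    printf("\n");
    return 0;
}

Freeing in either order restores the initial pattern, which is the invariant the kernel's merge loops maintain.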