6 files changed, 43180 insertions, 153 deletions
diff --git a/main/xf86-video-intel/APKBUILD b/main/xf86-video-intel/APKBUILD
index 183bc3a433..154a763c6f 100644
--- a/main/xf86-video-intel/APKBUILD
+++ b/main/xf86-video-intel/APKBUILD
@@ -1,36 +1,32 @@
 # Maintainer: Natanael Copa <ncopa@alpinelinux.org>
 pkgname=xf86-video-intel
-pkgver=2.99.917
-pkgrel=4
+pkgver=2.99.917_git20170325
+pkgrel=0
 pkgdesc="X.Org driver for Intel cards"
 url="http://xorg.freedesktop.org/"
 arch="x86 x86_64"
-license="custom"
+license="MIT"
 subpackages="$pkgname-doc"
 depends="mesa-dri-intel"
 makedepends="xorg-server-dev libxi-dev fontsproto randrproto
 	videoproto renderproto glproto xineramaproto libdrm-dev xf86driproto
-	mesa-dev libxvmc-dev xcb-util-dev eudev-dev"
+	mesa-dev libxvmc-dev xcb-util-dev eudev-dev
+	util-macros autoconf automake libtool"
 
-source="http://xorg.freedesktop.org/releases/individual/driver/$pkgname-$pkgver.tar.bz2
-	gcc5-workaround.patch
-	xf86-video-intel-2.99.917-libdrm-kernel-4_0-crash.patch
-	O_CLOEXEC.patch
-	xorg-1.18.patch
+_ver=${pkgver%_git*}  # pkgver with its _git<date> suffix stripped, i.e. the tarball's release version
+source="http://xorg.freedesktop.org/releases/individual/driver/$pkgname-$_ver.tar.bz2
+	git.patch
 	"
 
-_builddir="$srcdir"/$pkgname-$pkgver
+builddir="$srcdir"/$pkgname-$_ver  # presumably the tarball unpacks under its release version, without the _git suffix
 prepare() {
-	cd "$_builddir"
-	for i in $source; do
-		case $i in
-		*.patch) msg $i; patch -p1 -i "$srcdir"/$i || return 1;;
-		esac
-	done
+	cd "$builddir"
+	default_prepare  # abuild's stock prepare step; presumably applies git.patch in place of the loop removed above
+	autoreconf -vif  # regenerate the build system: git.patch edits Makefile.am and adds benchmarks/Makefile.am
 }
 
 build() {
-	cd "$srcdir"/$pkgname-$pkgver
+	cd "$builddir"
 	export LDFLAGS="$LDFLAGS -Wl,-z,lazy"
 	./configure \
 		--build=$CBUILD \
@@ -38,30 +34,18 @@ build() {
 		--prefix=/usr \
 		--enable-xvmc \
 		--disable-selective-werror \
+		--with-default-dri=3 \
 		|| return 1
 	make || return 1
 }
 
 package() {
-	cd "$srcdir"/$pkgname-$pkgver
+	cd "$builddir"  # the same tree prepare() and build() work in
 	make DESTDIR="$pkgdir" install || return 1
 	install -Dm644 COPYING "$pkgdir"/usr/share/licenses/$pkgname/COPYING
 
 	# http://bugs.alpinelinux.org/issues/3312
 	chmod o-x "$pkgdir"/usr/libexec/xf86-video-intel-backlight-helper
 }
-md5sums="fa196a66e52c0c624fe5d350af7a5e7b  xf86-video-intel-2.99.917.tar.bz2
-2e9c5ee749f0a255d2b10ce18b3512fa  gcc5-workaround.patch
-2fa815b66eb6896b3962074731b0b4bb  xf86-video-intel-2.99.917-libdrm-kernel-4_0-crash.patch
-d5c410d504c58aa641658a19e4950ea5  O_CLOEXEC.patch
-d64095af23cf26c3559bba6f739e371e  xorg-1.18.patch"
-sha256sums="00b781eea055582820a123c47b62411bdf6aabf4f03dc0568faec55faf9667c9  xf86-video-intel-2.99.917.tar.bz2
-55367cd8dbe58d1097e2cf6cee11895acadc1a5ef527b8d39361e3975a6943e5  gcc5-workaround.patch
-54298cb4a59016be0451e3ea72b2c2c6b2a97cb9ec2c8f45d62c12447d14b361  xf86-video-intel-2.99.917-libdrm-kernel-4_0-crash.patch
-79f6c0bf8eb56d631857a0064e6c4ba1582acfb12c467f29c211e4fc4e628b98  O_CLOEXEC.patch
-f45ad7f6d8a78f282faf283057cc7fa82d2b1c99979dba563c1a734a25e4fb7a  xorg-1.18.patch"
 sha512sums="cbf4d46ad1ad5e5587c0f1f620ff534ef0645270517b60056b9f03e83d8216e2f456de46352a06c37c0c46963cc4ed20b71b815b20ec1bf680ff046e535f580f  xf86-video-intel-2.99.917.tar.bz2
-b208508d229f53f18cf3aa8de2c3637964d8b22f8a615fc4759a2bb58cbe9db4dca7a79129a7b59fd138980c90bdcaf1aec142e1f13954c4cf25a817a2125998  gcc5-workaround.patch
-003fc22a9446cdfcb8d51cbface096187f93a0c54b024ee34b160ca41a491c35e8b387caabc3c3f6411b93663c5119f48dc2adae0d76878723c02483306972ac  xf86-video-intel-2.99.917-libdrm-kernel-4_0-crash.patch
-1054d8e4f314b061209d74d05037abefec64ab0c4a1efcf82e512ea8db9022c56cf7891ca4ed08af88f560e986ea0e726144f5bde11212e938cc741c40da5348  O_CLOEXEC.patch
-f9c22684d50e4bd567efcb38f93b40b3bad2d56d8c7ae96bf768064dffbff1f9147e40ae415a7bbef61d8f3140ef6746a372059cea11861bc61b4b94acfa91a4  xorg-1.18.patch"
+0fe4e455dcbc4ae6622dca483ef3ddc765c43009fdb0fef82bdaa835a737796a6caf8afa9c6630919f43c977a6f736770c3779f04d8c823da4fc9cee17d16f19  git.patch"
diff --git a/main/xf86-video-intel/O_CLOEXEC.patch b/main/xf86-video-intel/O_CLOEXEC.patch
deleted file mode 100644
index 9dbe933588..0000000000
--- a/main/xf86-video-intel/O_CLOEXEC.patch
+++ /dev/null
@@ -1,10 +0,0 @@
---- ./src/sna/kgem.c.orig
-+++ ./src/sna/kgem.c
-@@ -37,6 +37,7 @@
- #include <sys/mman.h>
- #include <time.h>
- #include <errno.h>
-+#define __USE_GNU
- #include <fcntl.h>
- 
- #include <xf86drm.h>
diff --git a/main/xf86-video-intel/gcc5-workaround.patch b/main/xf86-video-intel/gcc5-workaround.patch
deleted file mode 100644
index dd832546e4..0000000000
--- a/main/xf86-video-intel/gcc5-workaround.patch
+++ /dev/null
@@ -1,22 +0,0 @@
---- ./src/sna/compiler.h.orig
-+++ ./src/sna/compiler.h
-@@ -65,16 +65,14 @@
- #define avx2 __attribute__((target("avx2,sse4.2,sse2,fpmath=sse")))
- #endif
- 
--#if HAS_GCC(4, 6) && defined(__OPTIMIZE__)
-+#if HAS_GCC(4, 6) && !HAS_GCC(5,0) && defined(__OPTIMIZE__)
- #define fast __attribute__((optimize("Ofast")))
- #else
- #define fast
- #endif
- 
--#if HAS_GCC(4, 6) && defined(__OPTIMIZE__)
--#define fast_memcpy __attribute__((optimize("Ofast"))) __attribute__((target("inline-all-stringops")))
--#elif HAS_GCC(4, 5) && defined(__OPTIMIZE__)
--#define fast_memcpy __attribute__((target("inline-all-stringops")))
-+#if HAS_GCC(4, 5) && defined(__OPTIMIZE__)
-+#define fast_memcpy fast __attribute__((target("inline-all-stringops")))
- #else
- #define fast_memcpy
- #endif
diff --git a/main/xf86-video-intel/git.patch b/main/xf86-video-intel/git.patch
new file mode 100644
index 0000000000..20084425a4
--- /dev/null
+++ b/main/xf86-video-intel/git.patch
@@ -0,0 +1,43164 @@
+diff --git a/Makefile.am b/Makefile.am
+index 418fdc92..de5fbe12 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -18,14 +18,16 @@
+ #  IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ #  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+-ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4
++#Having problems passing through user flags as libtool complains
++#ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4
++ACLOCAL_AMFLAGS = -I m4
+ 
+ SUBDIRS = man libobj xvmc src tools
+ 
+ MAINTAINERCLEANFILES = ChangeLog INSTALL
+ 
+ if HAVE_X11
+-SUBDIRS += test
++SUBDIRS += test benchmarks
+ endif
+ 
+ .PHONY: ChangeLog INSTALL
+diff --git a/NEWS b/NEWS
+index 604b9cce..0e200332 100644
+--- a/NEWS
++++ b/NEWS
+@@ -21,7 +21,7 @@ should make one more snapshot before an imminent release.
+    Before kernel 3.19, O_NONBLOCK support is broken and so we must avoid
+    reading if we are not expecting an event.
+ 
+- * Backwards compatibilty fix for fake triple buffering with PRIME and
++ * Backwards compatibility fix for fake triple buffering with PRIME and
+    Xorg-1.15
+    https://bugs.freedesktop.org/show_bug.cgi?id=85144#c12
+ 
+@@ -51,7 +51,7 @@ should make one more snapshot before an imminent release.
+ Snapshot 2.99.916 (2014-09-08)
+ ==============================
+ Quick update for MST in UXA - we need to hook up the RandR outputs for
+-dynamicaly added connectors.
++dynamically added connectors.
+ 
+ 
+ Snapshot 2.99.915 (2014-09-08)
+@@ -503,7 +503,7 @@ release.
+    backlight property is queried whilst the connector is disabled
+    https://bugs.freedesktop.org/show_bug.cgi?id=70406
+ 
+- * Pad GETCONNECTOR ioctl for compatability between 32/64-bit userspace
++ * Pad GETCONNECTOR ioctl for compatibility between 32/64-bit userspace
+    and kernel
+ 
+  * Handle long glyph runs correctly
+@@ -523,7 +523,7 @@ snapshot beforehand to push out the bug fixes from the last week.
+ 
+  * Fix video output using sprites when changing the image size
+ 
+- * Apply more restrictive tile constaints for 915g class devices
++ * Apply more restrictive tile constraints for 915g class devices
+    https://bugs.launchpad.net/ubuntu/+source/xserver-xorg-video-intel/+bug/1232546
+ 
+  * Ensure all overlapping rectangles are drawn for XRenderFillRectangles
+@@ -1132,7 +1132,7 @@ operation.
+  * Explicitly prevent ring-switching for synchronized rendering to
+    scanouts (for vsync).
+ 
+- * Clip dirty region to slave pixmaps (otherwise UDL is nigh unusuable)
++ * Clip dirty region to slave pixmaps (otherwise UDL is nigh unusable)
+    https://bugs.freedesktop.org/show_bug.cgi?id=59539
+ 
+ 
+@@ -1226,7 +1226,7 @@ Release 2.20.15 (2012-12-03)
+ ============================
+ And lo, enabling more of the common acceleration paths for gen4 revealed
+ another lurking bug - something is wrong with how we prepare Y-tiling
+-surfaces for rendering. For the time being, we can surreptiously disable
++surfaces for rendering. For the time being, we can surreptitiously disable
+ them for gen4 and avoid hitting GPU hangs.
+ 
+  * Avoid clobbering the render state after failing to convert the
+@@ -1515,7 +1515,7 @@ Release 2.20.5 (2012-08-26)
+ Another silly bug found, another small bugfix release. The goal was for
+ the driver to bind to all Intel devices supported by the kernel.
+ Unfortunately we were too successful and started claiming Pouslbo,
+-Medfield and Cedarview devices which are still encumbered by propietary
++Medfield and Cedarview devices which are still encumbered by proprietary
+ IP and not supported by this driver.
+ 
+ Bugs fixed since 2.20.4:
+diff --git a/README b/README
+index cf4d88d8..348983b4 100644
+--- a/README
++++ b/README
+@@ -15,9 +15,9 @@ Intel graphics chipsets including:
+ 	G/Q33,G/Q35,G41,G/Q43,G/GM/Q45
+ 	PineView-M (Atom N400 series)
+ 	PineView-D (Atom D400/D500 series)
+-	Intel(R) HD Graphics: 2000-6000,
+-	Intel(R) Iris(TM) Graphics: 5100/6100, and
+-	Intel(R) Iris(TM) Pro Graphics: 5200/6200/P6300.
++	Intel(R) HD Graphics,
++	Intel(R) Iris(TM) Graphics,
++	Intel(R) Iris(TM) Pro Graphics.
+ 
+ Where to get more information about the driver
+ ----------------------------------------------
+diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore
+new file mode 100644
+index 00000000..301c0129
+--- /dev/null
++++ b/benchmarks/.gitignore
+@@ -0,0 +1,2 @@
++dri2-swap
++dri3-swap
+diff --git a/benchmarks/Makefile.am b/benchmarks/Makefile.am
+new file mode 100644
+index 00000000..4976e8a3
+--- /dev/null
++++ b/benchmarks/Makefile.am
+@@ -0,0 +1,14 @@
++AM_CFLAGS = @CWARNFLAGS@ $(X11_CFLAGS) $(DRM_CFLAGS)
++LDADD = $(X11_LIBS) $(DRM_LIBS) $(CLOCK_GETTIME_LIBS)
++
++check_PROGRAMS =
++
++if DRI2
++check_PROGRAMS += dri2-swap
++endif
++
++if DRI3
++check_PROGRAMS += dri3-swap
++AM_CFLAGS += $(X11_DRI3_CFLAGS)
++LDADD += $(X11_DRI3_LIBS)
++endif
+diff --git a/benchmarks/dri2-swap.c b/benchmarks/dri2-swap.c
+new file mode 100644
+index 00000000..3d9d30aa
+--- /dev/null
++++ b/benchmarks/dri2-swap.c
+@@ -0,0 +1,588 @@
++/*
++ * Copyright (c) 2015 Intel Corporation
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ *
++ */
++
++#ifdef HAVE_CONFIG_H
++#include "config.h"
++#endif
++
++#include <X11/Xlib.h>
++#include <X11/Xatom.h>
++#include <X11/Xlib-xcb.h>
++#include <X11/Xutil.h>
++#include <X11/Xlibint.h>
++#include <X11/extensions/dpms.h>
++#include <X11/extensions/randr.h>
++#include <X11/extensions/Xcomposite.h>
++#include <X11/extensions/Xdamage.h>
++#include <X11/extensions/Xrandr.h>
++#include <xcb/xcb.h>
++#include <xcb/dri2.h>
++#include <xf86drm.h>
++
++#include <stdio.h>
++#include <string.h>
++#include <fcntl.h>
++#include <unistd.h>
++#include <assert.h>
++#include <errno.h>
++#include <setjmp.h>
++#include <signal.h>
++
++#include <X11/Xlibint.h>
++#include <X11/extensions/Xext.h>
++#include <X11/extensions/extutil.h>
++#include <X11/extensions/dri2proto.h>
++#include <X11/extensions/dri2tokens.h>
++#include <X11/extensions/Xfixes.h>
++
++static char dri2ExtensionName[] = DRI2_NAME; /* name handed to the Xext find/check macros below */
++static XExtensionInfo *dri2Info; /* referenced by both XEXT_GENERATE_* helpers */
++static XEXT_GENERATE_CLOSE_DISPLAY (DRI2CloseDisplay, dri2Info)
++/* The hooks are defined further down; declared here so the table can reference them. */
++static Bool
++DRI2WireToEvent(Display *dpy, XEvent *event, xEvent *wire);
++static Status
++DRI2EventToWire(Display *dpy, XEvent *event, xEvent *wire);
++static int
++DRI2Error(Display *display, xError *err, XExtCodes *codes, int *ret_code);
++
++static /* const */ XExtensionHooks dri2ExtensionHooks = {
++  NULL,                   /* create_gc */
++  NULL,                   /* copy_gc */
++  NULL,                   /* flush_gc */
++  NULL,                   /* free_gc */
++  NULL,                   /* create_font */
++  NULL,                   /* free_font */
++  DRI2CloseDisplay,       /* close_display */
++  DRI2WireToEvent,        /* wire_to_event */
++  DRI2EventToWire,        /* event_to_wire */
++  DRI2Error,              /* error */
++  NULL,                   /* error_string */
++};
++/* DRI2FindDisplay() is what every request wrapper in this file calls first. */
++static XEXT_GENERATE_FIND_DISPLAY (DRI2FindDisplay,
++                                   dri2Info,
++                                   dri2ExtensionName,
++                                   &dri2ExtensionHooks,
++                                   0, NULL)
++
++static Bool /* wire_to_event hook: no DRI2 event is converted, every path returns False */
++DRI2WireToEvent(Display *dpy, XEvent *event, xEvent *wire)
++{
++   XExtDisplayInfo *info = DRI2FindDisplay(dpy);
++
++   XextCheckExtension(dpy, info, dri2ExtensionName, False);
++
++   switch ((wire->u.u.type & 0x7f) - info->codes->first_event) { /* low 7 bits of the type, relative to first_event */
++#ifdef X_DRI2SwapBuffers
++   case DRI2_BufferSwapComplete:
++      return False;
++#endif
++#ifdef DRI2_InvalidateBuffers
++   case DRI2_InvalidateBuffers:
++      return False;
++#endif
++   default:
++      /* client doesn't support server event */
++      break;
++   }
++
++   return False;
++}
++
++/* event_to_wire hook.  We don't actually support this: it doesn't make sense
++ * for clients to send each other DRI2 events, so nothing is written to *wire.
++ */
++static Status
++DRI2EventToWire(Display *dpy, XEvent *event, xEvent *wire)
++{
++   XExtDisplayInfo *info = DRI2FindDisplay(dpy);
++
++   XextCheckExtension(dpy, info, dri2ExtensionName, False);
++
++   switch (event->type) {
++   default:
++      /* client doesn't support server event */
++      break;
++   }
++
++   return Success; /* NOTE(review): Success is 0 in X.h, the same value as the False above -- confirm intended */
++}
++
++static int /* error hook: True for the DRI2 errors this client expects and ignores */
++DRI2Error(Display *display, xError *err, XExtCodes *codes, int *ret_code)
++{
++	/* Errors raised by core requests or other extensions are not ours. */
++	if (err->majorCode != codes->major_opcode)
++		return False;
++
++	switch (err->minorCode) {
++	case X_DRI2CopyRegion:
++	case X_DRI2DestroyDrawable:
++		/* DestroyDrawable: if the X drawable was destroyed before the
++		 * GLX drawable, the DRI2 drawable is already gone by the time
++		 * we ask to destroy it.  CopyRegion gets the same leniency. */
++		return err->errorCode == BadDrawable;
++
++	case X_DRI2Connect:
++		/* If the server is non-local DRI2Connect will raise BadRequest.
++		 * Swallow it so DRI2Connect can signal this in its return code. */
++		if (err->errorCode != BadRequest)
++			return False;
++		*ret_code = False;
++		return True;
++
++	default:
++		return False;
++	}
++}
++
++static Bool /* Report the DRI2 event and error bases; False when the display lacks DRI2. */
++DRI2QueryExtension(Display * dpy, int *eventBase, int *errorBase)
++{
++	XExtDisplayInfo *ext = DRI2FindDisplay(dpy);
++
++	/* Both outputs are left untouched when the extension is missing. */
++	if (!XextHasExtension(ext))
++		return False;
++
++	*eventBase = ext->codes->first_event;
++	*errorBase = ext->codes->first_error;
++	return True;
++}
++
++static Bool /* Query the DRI2 driver and device names for window; False on any failure. */
++DRI2Connect(Display * dpy, XID window, char **driverName, char **deviceName)
++{ /* On True, *driverName and *deviceName are NUL-terminated Xmalloc'ed strings. */
++	XExtDisplayInfo *info = DRI2FindDisplay(dpy);
++	xDRI2ConnectReply rep;
++	xDRI2ConnectReq *req;
++
++	XextCheckExtension(dpy, info, dri2ExtensionName, False);
++
++	LockDisplay(dpy);
++	GetReq(DRI2Connect, req);
++	req->reqType = info->codes->major_opcode;
++	req->dri2ReqType = X_DRI2Connect;
++	req->window = window;
++	req->driverType = DRI2DriverDRI;
++	if (!_XReply(dpy, (xReply *) & rep, 0, xFalse)) {
++		UnlockDisplay(dpy);
++		SyncHandle();
++		return False;
++	}
++
++	if (rep.driverNameLength == 0 && rep.deviceNameLength == 0) { /* the server named neither */
++		UnlockDisplay(dpy);
++		SyncHandle();
++		return False;
++	}
++
++	*driverName = Xmalloc(rep.driverNameLength + 1); /* +1 for the terminator written below */
++	if (*driverName == NULL) {
++		_XEatData(dpy, /* skip both names, each rounded up to a multiple of 4 bytes */
++			  ((rep.driverNameLength + 3) & ~3) +
++			  ((rep.deviceNameLength + 3) & ~3));
++		UnlockDisplay(dpy);
++		SyncHandle();
++		return False;
++	}
++	_XReadPad(dpy, *driverName, rep.driverNameLength); /* presumably also consumes the pad bytes -- confirm */
++	(*driverName)[rep.driverNameLength] = '\0';
++
++	*deviceName = Xmalloc(rep.deviceNameLength + 1);
++	if (*deviceName == NULL) {
++		Xfree(*driverName); /* NOTE(review): *driverName is left dangling for the caller on this path */
++		_XEatData(dpy, ((rep.deviceNameLength + 3) & ~3)); /* only the device name is still unread */
++		UnlockDisplay(dpy);
++		SyncHandle();
++		return False;
++	}
++	_XReadPad(dpy, *deviceName, rep.deviceNameLength);
++	(*deviceName)[rep.deviceNameLength] = '\0';
++
++	UnlockDisplay(dpy);
++	SyncHandle();
++
++	return True;
++}
++
++static Bool /* Send magic to the server for window; False if no reply arrives. */
++DRI2Authenticate(Display * dpy, XID window, unsigned int magic)
++{
++	XExtDisplayInfo *info = DRI2FindDisplay(dpy);
++	xDRI2AuthenticateReq *req;
++	xDRI2AuthenticateReply rep;
++
++	XextCheckExtension(dpy, info, dri2ExtensionName, False);
++
++	LockDisplay(dpy);
++	GetReq(DRI2Authenticate, req);
++	req->reqType = info->codes->major_opcode;
++	req->dri2ReqType = X_DRI2Authenticate;
++	req->window = window;
++	req->magic = magic;
++
++	if (!_XReply(dpy, (xReply *) & rep, 0, xFalse)) {
++		UnlockDisplay(dpy);
++		SyncHandle();
++		return False;
++	}
++
++	UnlockDisplay(dpy);
++	SyncHandle();
++
++	return rep.authenticated; /* the server's verdict, passed through unchanged */
++}
++
++static void /* Queue a DRI2CreateDrawable request for drawable; no reply is read. */
++DRI2CreateDrawable(Display * dpy, XID drawable)
++{
++	XExtDisplayInfo *info = DRI2FindDisplay(dpy);
++	xDRI2CreateDrawableReq *req;
++
++	XextSimpleCheckExtension(dpy, info, dri2ExtensionName); /* presumably returns early when DRI2 is missing -- confirm in extutil.h */
++
++	LockDisplay(dpy);
++	GetReq(DRI2CreateDrawable, req);
++	req->reqType = info->codes->major_opcode;
++	req->dri2ReqType = X_DRI2CreateDrawable;
++	req->drawable = drawable;
++	UnlockDisplay(dpy);
++	SyncHandle();
++}
++
++static void DRI2SwapInterval(Display *dpy, XID drawable, int interval) /* queue a DRI2SwapInterval request; no reply is read */
++{
++    XExtDisplayInfo *info = DRI2FindDisplay(dpy);
++    xDRI2SwapIntervalReq *req;
++
++    XextSimpleCheckExtension (dpy, info, dri2ExtensionName); /* presumably returns early when DRI2 is missing -- confirm in extutil.h */
++
++    LockDisplay(dpy);
++    GetReq(DRI2SwapInterval, req);
++    req->reqType = info->codes->major_opcode;
++    req->dri2ReqType = X_DRI2SwapInterval;
++    req->drawable = drawable;
++    req->interval = interval;
++    UnlockDisplay(dpy);
++    SyncHandle();
++}
++
++static int _x_error_occurred;
++
++static int /* Xlib error handler: log the error on stderr and count it. */
++_check_error_handler(Display     *display,
++		     XErrorEvent *event)
++{
++	const char *name = DisplayString(display);
++
++	fprintf(stderr,
++		"X11 error from display %s, serial=%ld, error=%d, req=%d.%d\n",
++		name, event->serial, event->error_code,
++		event->request_code, event->minor_code);
++
++	_x_error_occurred += 1; /* main() returns this counter as its exit status */
++	return False; /* ignored */
++}
++
++/* Microseconds from *start to *end; the nanosecond delta is truncated to whole microseconds. */
++static double elapsed(const struct timespec *start, const struct timespec *end)
++{
++	return (end->tv_nsec - start->tv_nsec)/1000 + 1e6*(end->tv_sec - start->tv_sec);
++}
++
++static void run(Display *dpy, Window win) /* issue swap requests on win for about ten seconds, then print the rate per second */
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	struct timespec start, end;
++	int n, completed = 0; /* completed counts requests issued; their replies are discarded */
++
++	clock_gettime(CLOCK_MONOTONIC, &start);
++	do {
++		for (n = 0; n < 1000; n++) { /* the clock is only sampled once per 1000 swaps */
++			unsigned int attachments[] = { DRI2BufferBackLeft };
++			unsigned int seq[2];
++
++			seq[0] = xcb_dri2_swap_buffers_unchecked(c, win,
++								 0, 0, 0, 0, 0, 0).sequence;
++
++			/* follow every swap with a GetBuffers request for the back-left buffer */
++			seq[1] = xcb_dri2_get_buffers_unchecked(c, win,
++								1, 1, attachments).sequence;
++
++			xcb_flush(c);
++			xcb_discard_reply(c, seq[0]);
++			xcb_discard_reply(c, seq[1]);
++			completed++;
++		}
++		clock_gettime(CLOCK_MONOTONIC, &end);
++	} while (end.tv_sec < start.tv_sec + 10);
++
++	printf("%f\n", completed / (elapsed(&start, &end) / 1000000)); /* elapsed() is in microseconds */
++}
++
++static inline XRRScreenResources *_XRRGetScreenResourcesCurrent(Display *dpy, Window window)
++{
++	/* Try the "current" query first; fall back to the plain one if it yields nothing. */
++	XRRScreenResources *current = XRRGetScreenResourcesCurrent(dpy, window);
++
++	if (current != NULL)
++		return current;
++
++	return XRRGetScreenResources(dpy, window);
++}
++
++/* Return the entry of res->modes whose id equals id, or NULL when there is none. */
++static XRRModeInfo *lookup_mode(XRRScreenResources *res, int id)
++{
++	XRRModeInfo *mode = res->modes;
++	int left = res->nmode;
++
++	for (; left > 0; left--, mode++)
++		if (mode->id == id)
++			return mode;
++	return NULL;
++}
++
++/* Open the device node named by DRI2Connect and authenticate it; returns the fd, or -1 on failure. */
++static int dri2_open(Display *dpy)
++{
++	drm_auth_t auth;
++	char *driver, *device;
++	int fd;
++
++	if (!DRI2QueryExtension(dpy, &fd, &fd)) /* fd is just scratch space for the unused bases */
++		return -1;
++	if (!DRI2Connect(dpy, DefaultRootWindow(dpy), &driver, &device))
++		return -1;
++
++	fd = open(device, O_RDWR);
++	Xfree(driver); /* both names were Xmalloc'ed by DRI2Connect and are not needed past open() */
++	Xfree(device);
++	if (fd < 0)
++		return -1;
++
++	if (drmIoctl(fd, DRM_IOCTL_GET_MAGIC, &auth) == 0 &&
++	    DRI2Authenticate(dpy, DefaultRootWindow(dpy), auth.magic))
++		return fd;
++	close(fd); /* could not authenticate: do not leak the descriptor */
++	return -1;
++}
++
++static void fullscreen(Display *dpy, Window win) /* set _NET_WM_STATE on win to the fullscreen atom */
++{
++	Atom state = XInternAtom(dpy, "_NET_WM_STATE_FULLSCREEN", False);
++	Atom property = XInternAtom(dpy, "_NET_WM_STATE", False);
++
++	XChangeProperty(dpy, win, property, XA_ATOM, 32, PropModeReplace,
++			(unsigned char *)&state, 1);
++}
++
++static int has_composite(Display *dpy) /* non-zero when Damage and Composite (major > 0 or minor >= 4) are present */
++{
++	int event_base, error_base;
++	int major, minor;
++
++	if (!XDamageQueryExtension (dpy, &event_base, &error_base) ||
++	    !XCompositeQueryExtension(dpy, &event_base, &error_base))
++		return 0;
++
++	XCompositeQueryVersion(dpy, &major, &minor);
++	if (major > 0)
++		return 1;
++
++	return minor >= 4;
++}
++
++/* DRI2 swap benchmark.  Options: -d off|on, -v redirected|normal, -w fullscreen|window|root. */
++int main(int argc, char **argv)
++{
++	Display *dpy;
++	Window root, win;
++	XRRScreenResources *res;
++	XRRCrtcInfo **original_crtc;
++	XSetWindowAttributes attr;
++	enum window { ROOT, FULLSCREEN, WINDOW } w = FULLSCREEN;
++	enum visible {REDIRECTED, NORMAL } v = NORMAL;
++	enum display { OFF, ON } d = OFF;
++	int width, height;
++	int i, fd, c;
++
++	while ((c = getopt(argc, argv, "d:v:w:")) != -1) {
++		switch (c) {
++		case 'd':
++			if (strcmp(optarg, "off") == 0)
++				d = OFF;
++			else if (strcmp(optarg, "on") == 0)
++				d = ON;
++			else
++				abort();
++			break;
++		case 'v':
++			if (strcmp(optarg, "redirected") == 0)
++				v = REDIRECTED;
++			else if (strcmp(optarg, "normal") == 0)
++				v = NORMAL;
++			else
++				abort();
++			break;
++		case 'w':
++			if (strcmp(optarg, "fullscreen") == 0)
++				w = FULLSCREEN;
++			else if (strcmp(optarg, "window") == 0)
++				w = WINDOW;
++			else if (strcmp(optarg, "root") == 0)
++				w = ROOT;
++			else
++				abort();
++			break;
++		}
++	}
++
++	attr.override_redirect = 1;
++	dpy = XOpenDisplay(NULL);
++	if (dpy == NULL)
++		return 77;
++
++	width = DisplayWidth(dpy, DefaultScreen(dpy));
++	height = DisplayHeight(dpy, DefaultScreen(dpy));
++
++	fd = dri2_open(dpy);
++	if (fd < 0)
++		return 77;
++
++	if (DPMSQueryExtension(dpy, &i, &i))
++		DPMSDisable(dpy);
++
++	root = DefaultRootWindow(dpy);
++
++	signal(SIGALRM, SIG_IGN);
++	XSetErrorHandler(_check_error_handler);
++	res = NULL;
++	if (XRRQueryVersion(dpy, &i, &i))
++		res = _XRRGetScreenResourcesCurrent(dpy, root);
++	if (res == NULL)
++		return 77;
++
++	if (v == REDIRECTED && !has_composite(dpy))
++		return 77;
++
++	original_crtc = malloc(sizeof(XRRCrtcInfo *)*res->ncrtc);
++	if (original_crtc == NULL) /* without the saved state nothing could be restored afterwards */
++		return 1;
++	for (i = 0; i < res->ncrtc; i++)
++		original_crtc[i] = XRRGetCrtcInfo(dpy, res, res->crtcs[i]);
++
++	for (i = 0; i < res->ncrtc; i++) /* switch every CRTC off before measuring */
++		XRRSetCrtcConfig(dpy, res, res->crtcs[i], CurrentTime,
++				 0, 0, None, RR_Rotate_0, NULL, 0);
++
++	DRI2CreateDrawable(dpy, root);
++	DRI2SwapInterval(dpy, root, 0);
++
++	if (d != OFF) {
++		for (i = 0; i < res->noutput; i++) {
++			XRROutputInfo *output;
++			XRRModeInfo *mode = NULL;
++
++			output = XRRGetOutputInfo(dpy, res, res->outputs[i]);
++			if (output == NULL)
++				continue;
++
++			if (output->nmode && output->ncrtc) /* modes[0] and crtcs[0] must exist before they are read */
++				mode = lookup_mode(res, output->modes[0]);
++			if (mode == NULL)
++				continue;
++
++			XRRSetCrtcConfig(dpy, res, output->crtcs[0], CurrentTime,
++					 0, 0, output->modes[0], RR_Rotate_0, &res->outputs[i], 1);
++			width = mode->width;
++			height = mode->height;
++			break;
++		}
++		if (i == res->noutput) { /* no output could be lit */
++			_x_error_occurred = 77;
++			goto restore;
++		}
++	}
++
++	if (w == ROOT) {
++		run(dpy, root);
++	} else if (w == FULLSCREEN) {
++		win = XCreateWindow(dpy, root,
++				    0, 0, width, height, 0,
++				    DefaultDepth(dpy, DefaultScreen(dpy)),
++				    InputOutput,
++				    DefaultVisual(dpy, DefaultScreen(dpy)),
++				    CWOverrideRedirect, &attr);
++		DRI2CreateDrawable(dpy, win);
++		DRI2SwapInterval(dpy, win, 0);
++		if (v == REDIRECTED) {
++			XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++			XDamageCreate(dpy, win, XDamageReportRawRectangles);
++		} else
++			fullscreen(dpy, win);
++		XMapWindow(dpy, win);
++		run(dpy, win);
++	} else if (w == WINDOW) {
++		win = XCreateWindow(dpy, root,
++				    0, 0, width/2, height/2, 0,
++				    DefaultDepth(dpy, DefaultScreen(dpy)),
++				    InputOutput,
++				    DefaultVisual(dpy, DefaultScreen(dpy)),
++				    CWOverrideRedirect, &attr);
++		DRI2CreateDrawable(dpy, win);
++		DRI2SwapInterval(dpy, win, 0);
++		if (v == REDIRECTED) {
++			XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++			XDamageCreate(dpy, win, XDamageReportRawRectangles);
++		}
++		XMapWindow(dpy, win);
++		run(dpy, win);
++	}
++
++restore:
++	for (i = 0; i < res->ncrtc; i++)
++		XRRSetCrtcConfig(dpy, res, res->crtcs[i], CurrentTime,
++				 0, 0, None, RR_Rotate_0, NULL, 0);
++
++	for (i = 0; i < res->ncrtc; i++) {
++		if (original_crtc[i] == NULL) /* XRRGetCrtcInfo failed earlier: nothing saved for this CRTC */
++			continue;
++		XRRSetCrtcConfig(dpy, res, res->crtcs[i], CurrentTime,
++				 original_crtc[i]->x,
++				 original_crtc[i]->y,
++				 original_crtc[i]->mode,
++				 original_crtc[i]->rotation,
++				 original_crtc[i]->outputs,
++				 original_crtc[i]->noutput);
++	}
++
++	if (DPMSQueryExtension(dpy, &i, &i))
++		DPMSEnable(dpy);
++
++	XSync(dpy, True);
++	return _x_error_occurred;
++}
+diff --git a/benchmarks/dri3-swap.c b/benchmarks/dri3-swap.c
+new file mode 100644
+index 00000000..4dd423b3
+--- /dev/null
++++ b/benchmarks/dri3-swap.c
+@@ -0,0 +1,595 @@
++/*
++ * Copyright (c) 2015 Intel Corporation
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ *
++ */
++
++#ifdef HAVE_CONFIG_H
++#include "config.h"
++#endif
++
++#include <X11/Xlib.h>
++#include <X11/Xatom.h>
++#include <X11/Xlib-xcb.h>
++#include <X11/xshmfence.h>
++#include <X11/Xutil.h>
++#include <X11/Xlibint.h>
++#include <X11/extensions/Xcomposite.h>
++#include <X11/extensions/Xdamage.h>
++#include <X11/extensions/dpms.h>
++#include <X11/extensions/randr.h>
++#include <X11/extensions/Xrandr.h>
++#include <xcb/xcb.h>
++#include <xcb/present.h>
++#include <xcb/dri3.h>
++#include <xcb/xfixes.h>
++#include <xf86drm.h>
++#include <i915_drm.h>
++
++#include <stdio.h>
++#include <string.h>
++#include <fcntl.h>
++#include <unistd.h>
++#include <assert.h>
++#include <errno.h>
++#include <setjmp.h>
++#include <signal.h>
++
++struct dri3_fence {
++	XID xid; /* id obtained from xcb_generate_id() in dri3_create_fence() */
++	void *addr; /* mapping returned by xshmfence_map_shm() */
++};
++
++static int _x_error_occurred; /* incremented by _check_error_handler() for every X error */
++static uint32_t stamp; /* passed by address to xcb_register_for_special_xge() in run() */
++
++/* Circular doubly-linked list node, embedded in the structures it links. */
++struct list {
++    struct list *next, *prev;
++};
++
++/* Turn list into an empty ring whose links point back at itself. */
++static void list_init(struct list *list)
++{
++    list->prev = list->next = list;
++}
++
++/* Splice entry between the adjacent nodes prev and next. */
++static inline void
++__list_add(struct list *entry, struct list *prev, struct list *next)
++{
++    next->prev = entry;
++    entry->next = next;
++    entry->prev = prev;
++    prev->next = entry;
++}
++
++/* Insert entry directly after head. */
++static inline void list_add(struct list *entry, struct list *head)
++{
++    __list_add(entry, head, head->next);
++}
++
++/* Join prev and next, dropping whatever was linked between them. */
++static inline void __list_del(struct list *prev, struct list *next)
++{
++    next->prev = prev;
++    prev->next = next;
++}
++
++/* Unlink entry from its list; entry's own pointers are left untouched. */
++static inline void _list_del(struct list *entry)
++{
++    __list_del(entry->prev, entry->next);
++}
++
++/* Move list to the front of head, unless it already sits there. */
++static inline void list_move(struct list *list, struct list *head)
++{
++    if (list->prev == head)
++        return;
++    _list_del(list);
++    list_add(list, head);
++}
++/* __container_of: address of the object whose `member` field sits at ptr; sample only supplies the type. */
++#define __container_of(ptr, sample, member)				\
++    (void *)((char *)(ptr) - ((char *)&(sample)->member - (char *)(sample)))
++/* list_for_each_entry: point pos at each entry linked on head through `member`, following next. */
++#define list_for_each_entry(pos, head, member)				\
++    for (pos = __container_of((head)->next, pos, member);		\
++	 &pos->member != (head);					\
++	 pos = __container_of(pos->member.next, pos, member))
++
++static int /* Xlib error handler: report the error on stderr and count it. */
++_check_error_handler(Display     *display,
++		     XErrorEvent *event)
++{
++	/* stderr, as in benchmarks/dri2-swap.c, keeps diagnostics off stdout */
++	fprintf(stderr,
++		"X11 error from display %s, serial=%ld, error=%d, req=%d.%d\n",
++		DisplayString(display), event->serial, event->error_code,
++		event->request_code, event->minor_code);
++
++	_x_error_occurred++;
++	return False; /* ignored */
++}
++
++/* Map a new xshmfence and pass its fd to the server; 0 with *fence filled in on success, else -1. */
++static int dri3_create_fence(Display *dpy,
++			     Pixmap pixmap,
++			     struct dri3_fence *fence)
++{
++	xcb_connection_t *conn = XGetXCBConnection(dpy);
++	void *mapping;
++	int shm_fd;
++
++	shm_fd = xshmfence_alloc_shm();
++	if (shm_fd < 0)
++		return -1;
++
++	mapping = xshmfence_map_shm(shm_fd);
++	if (mapping == NULL) {
++		close(shm_fd);
++		return -1;
++	}
++
++	fence->addr = mapping; /* *fence is only written once nothing can fail any more */
++	fence->xid = xcb_generate_id(conn);
++	xcb_dri3_fence_from_fd(conn, pixmap, fence->xid, 0, shm_fd);
++	return 0;
++}
++
++/* Microseconds from *start to *end; the nanosecond delta is truncated to whole microseconds. */
++static double elapsed(const struct timespec *start, const struct timespec *end)
++{
++	return (end->tv_nsec - start->tv_nsec)/1000 + 1e6*(end->tv_sec - start->tv_sec);
++}
++
++struct buffer {
++	struct list link; /* node on run()'s mru list */
++	Pixmap pixmap; /* created with XCreatePixmap() in run() */
++	struct dri3_fence fence; /* filled in by dri3_create_fence() */
++	int fd; /* first fd of the buffer_from_pixmap reply; -1 until fetched */
++	int busy; /* set to 1 when presented, cleared on its idle-notify event */
++};
++/* Benchmark loop: present idle pixmaps from a pool of N_BACK on win for about 10s, then print completions per second. */
++static void run(Display *dpy, Window win)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	struct timespec start, end;
++#define N_BACK 8
++	struct buffer buffer[N_BACK];
++	struct list mru;
++	Window root;
++	unsigned int width, height;
++	unsigned border, depth;
++	unsigned present_flags = XCB_PRESENT_OPTION_ASYNC;
++	xcb_xfixes_region_t update = 0;
++	int completed = 0;
++	int queued = 0;
++	uint32_t eid;
++	void *Q;
++	int i, n;
++
++	list_init(&mru);
++
++	XGetGeometry(dpy, win,
++		     &root, &i, &n, &width, &height, &border, &depth);
++
++	_x_error_occurred = 0;
++
++	for (n = 0; n < N_BACK; n++) { /* build the pool: pixmap + fence + fd per buffer */
++		xcb_dri3_buffer_from_pixmap_reply_t *reply;
++		int *fds;
++
++		buffer[n].pixmap =
++			XCreatePixmap(dpy, win, width, height, depth);
++		buffer[n].fence.xid = 0;
++		buffer[n].fd = -1;
++
++		if (dri3_create_fence(dpy, win, &buffer[n].fence))
++			return; /* NOTE(review): buffers set up so far are not released */
++
++		reply = xcb_dri3_buffer_from_pixmap_reply (c,
++							   xcb_dri3_buffer_from_pixmap(c, buffer[n].pixmap),
++							   NULL);
++		if (reply == NULL)
++			return;
++
++		fds = xcb_dri3_buffer_from_pixmap_reply_fds (c, reply);
++		buffer[n].fd = fds[0];
++		free(reply);
++
++		/* start idle */
++		xshmfence_trigger(buffer[n].fence.addr);
++		buffer[n].busy = 0;
++		list_add(&buffer[n].link, &mru);
++	}
++
++	eid = xcb_generate_id(c);
++	xcb_present_select_input(c, eid, win,
++                                 XCB_PRESENT_EVENT_MASK_IDLE_NOTIFY |
++                                 XCB_PRESENT_EVENT_MASK_COMPLETE_NOTIFY);
++	Q = xcb_register_for_special_xge(c, &xcb_present_id, eid, &stamp);
++
++	clock_gettime(CLOCK_MONOTONIC, &start);
++	do { /* batches of 1000 presents; the clock is only sampled between batches */
++		for (n = 0; n < 1000; n++) {
++			struct buffer *tmp, *b = NULL;
++			list_for_each_entry(tmp, &mru, link) {
++				if (!tmp->busy) {
++					b = tmp;
++					break;
++				}
++			}
++			while (b == NULL) { /* every buffer is busy: block until the server reports one idle */
++				xcb_present_generic_event_t *ev;
++
++				ev = (xcb_present_generic_event_t *)
++					xcb_wait_for_special_event(c, Q);
++				if (ev == NULL)
++					abort();
++
++				do {
++					switch (ev->evtype) {
++					case XCB_PRESENT_COMPLETE_NOTIFY:
++						completed++;
++						queued--;
++						break;
++
++					case XCB_PRESENT_EVENT_IDLE_NOTIFY:
++						{
++							xcb_present_idle_notify_event_t *ie = (xcb_present_idle_notify_event_t *)ev;
++							assert(ie->serial < N_BACK);
++							buffer[ie->serial].busy = 0;
++							if (b == NULL)
++								b = &buffer[ie->serial];
++							break;
++						}
++					}
++					free(ev);
++				} while ((ev = (xcb_present_generic_event_t *)xcb_poll_for_special_event(c, Q)));
++			}
++
++			b->busy = 1;
++			if (b->fence.xid) {
++				xshmfence_await(b->fence.addr);
++				xshmfence_reset(b->fence.addr);
++			}
++			xcb_present_pixmap(c, win, b->pixmap, b - buffer, /* serial = buffer index, read back from IdleNotify above */
++					   0, /* valid */
++					   update, /* update */
++					   0, /* x_off */
++					   0, /* y_off */
++					   None,
++					   None, /* wait fence */
++					   b->fence.xid,
++					   present_flags,
++					   0, /* target msc */
++					   0, /* divisor */
++					   0, /* remainder */
++					   0, NULL);
++			list_move(&b->link, &mru);
++			queued++;
++			xcb_flush(c);
++		}
++		clock_gettime(CLOCK_MONOTONIC, &end);
++	} while (end.tv_sec < start.tv_sec + 10);
++
++	while (queued) { /* drain the CompleteNotify events still outstanding */
++		xcb_present_generic_event_t *ev;
++
++		ev = (xcb_present_generic_event_t *)
++			xcb_wait_for_special_event(c, Q);
++		if (ev == NULL)
++			abort();
++
++		do {
++			switch (ev->evtype) {
++			case XCB_PRESENT_COMPLETE_NOTIFY:
++				completed++;
++				queued--;
++				break;
++
++			case XCB_PRESENT_EVENT_IDLE_NOTIFY:
++				break;
++			}
++			free(ev);
++		} while ((ev = (xcb_present_generic_event_t *)xcb_poll_for_special_event(c, Q)));
++	}
++	clock_gettime(CLOCK_MONOTONIC, &end);
++
++	printf("%f\n", completed / (elapsed(&start, &end) / 1000000)); /* elapsed() is in microseconds */
++}
++
++/* Return 1 if the server answers a Present QueryVersion request, else 0 (with a message on stderr). */
++static int has_present(Display *dpy)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	xcb_generic_error_t *error = NULL;
++	void *reply;
++	int supported;
++
++	reply = xcb_present_query_version_reply(c,
++						xcb_present_query_version(c,
++									  XCB_PRESENT_MAJOR_VERSION,
++									  XCB_PRESENT_MINOR_VERSION),
++						&error);
++	/* Record the outcome before freeing: a freed pointer's value must not be inspected. */
++	supported = reply != NULL;
++	free(reply);
++	free(error);
++	if (!supported)
++		fprintf(stderr, "Present not supported on %s\n", DisplayString(dpy));
++	return supported;
++}
++
++/* Return non-zero if Damage and Composite are present and Composite is 0.4 or has a major version > 0. */
++static int has_composite(Display *dpy)
++{
++	int event, error;
++	int major, minor;
++
++	if (!XDamageQueryExtension (dpy, &event, &error))
++		return 0;
++	if (!XCompositeQueryExtension(dpy, &event, &error))
++		return 0;
++	/* Only trust major/minor once the version query itself reports success. */
++	if (!XCompositeQueryVersion(dpy, &major, &minor))
++		return 0;
++	return major > 0 || minor >= 4;
++}
++/* Try XRRGetScreenResourcesCurrent first; if it returns NULL fall back to XRRGetScreenResources. */
++static inline XRRScreenResources *_XRRGetScreenResourcesCurrent(Display *dpy, Window window)
++{
++	XRRScreenResources *res;
++
++	res = XRRGetScreenResourcesCurrent(dpy, window);
++	if (res == NULL)
++		res = XRRGetScreenResources(dpy, window);
++
++	return res; /* may still be NULL if both calls fail */
++}
++/* Linear search of res->modes for the entry whose id equals id; NULL if there is none. */
++static XRRModeInfo *lookup_mode(XRRScreenResources *res, int id)
++{
++	int i;
++
++	for (i = 0; i < res->nmode; i++) {
++		if (res->modes[i].id == id)
++			return &res->modes[i]; /* points into res, not a copy */
++	}
++
++	return NULL;
++}
++/* Replace win's _NET_WM_STATE property with the single atom _NET_WM_STATE_FULLSCREEN. */
++static void fullscreen(Display *dpy, Window win)
++{
++	Atom atom = XInternAtom(dpy, "_NET_WM_STATE_FULLSCREEN", False);
++	XChangeProperty(dpy, win,
++			XInternAtom(dpy, "_NET_WM_STATE", False),
++			XA_ATOM, 32, PropModeReplace,
++			(unsigned char *)&atom, 1); /* one element of format 32 */
++}
++/* Query the DRI3 version: 0 with *major/*minor filled in on success, -1 (both left at -1) if there is no reply. */
++static int dri3_query_version(Display *dpy, int *major, int *minor)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	xcb_dri3_query_version_reply_t *reply;
++	xcb_generic_error_t *error;
++
++	*major = *minor = -1;
++
++	reply = xcb_dri3_query_version_reply(c,
++					     xcb_dri3_query_version(c,
++								    XCB_DRI3_MAJOR_VERSION,
++								    XCB_DRI3_MINOR_VERSION),
++					     &error);
++	free(error); /* NOTE(review): relies on the reply call always storing to error - confirm */
++	if (reply == NULL)
++		return -1;
++
++	*major = reply->major_version;
++	*minor = reply->minor_version;
++	free(reply);
++
++	return 0;
++}
++/* Return non-zero if the DRI3 extension is present and answers a version query. */
++static int has_dri3(Display *dpy)
++{
++	const xcb_query_extension_reply_t *ext;
++	int major, minor;
++
++	ext = xcb_get_extension_data(XGetXCBConnection(dpy), &xcb_dri3_id);
++	if (ext == NULL || !ext->present)
++		return 0;
++
++	if (dri3_query_version(dpy, &major, &minor) < 0)
++		return 0;
++
++	return major >= 0; /* minor is fetched but not examined */
++}
++/* Entry point: parse -d/-v/-w, set up outputs and window, call run(), restore the CRTCs. Returns 77 if a prerequisite is missing. */
++int main(int argc, char **argv)
++{
++	Display *dpy;
++	Window root, win;
++	XRRScreenResources *res;
++	XRRCrtcInfo **original_crtc;
++	XSetWindowAttributes attr;
++	enum window { ROOT, FULLSCREEN, WINDOW } w = FULLSCREEN;
++	enum visible {REDIRECTED, NORMAL } v = NORMAL;
++	enum display { OFF, ON } d = OFF;
++	int width, height;
++	int i;
++
++	while ((i = getopt(argc, argv, "d:v:w:")) != -1) {
++		switch (i) {
++		case 'd':
++			if (strcmp(optarg, "off") == 0)
++				d = OFF;
++			else if (strcmp(optarg, "on") == 0)
++				d = ON;
++			else
++				abort();
++			break;
++
++		case 'v':
++			if (strcmp(optarg, "redirected") == 0)
++				v = REDIRECTED;
++			else if (strcmp(optarg, "normal") == 0)
++				v = NORMAL;
++			else
++				abort();
++			break;
++
++		case 'w':
++			if (strcmp(optarg, "fullscreen") == 0)
++				w = FULLSCREEN;
++			else if (strcmp(optarg, "window") == 0)
++				w = WINDOW;
++			else if (strcmp(optarg, "root") == 0)
++				w = ROOT;
++			else
++				abort();
++			break;
++		}
++	}
++
++	attr.override_redirect = 1;
++
++	dpy = XOpenDisplay(NULL);
++	if (dpy == NULL)
++		return 77;
++
++	width = DisplayWidth(dpy, DefaultScreen(dpy));
++	height = DisplayHeight(dpy, DefaultScreen(dpy));
++
++	if (!has_present(dpy) || !has_dri3(dpy))
++		return 77;
++
++	if (DPMSQueryExtension(dpy, &i, &i))
++		DPMSDisable(dpy);
++
++	root = DefaultRootWindow(dpy);
++
++	signal(SIGALRM, SIG_IGN);
++	XSetErrorHandler(_check_error_handler);
++
++	res = NULL;
++	if (XRRQueryVersion(dpy, &i, &i))
++		res = _XRRGetScreenResourcesCurrent(dpy, root);
++	if (res == NULL)
++		return 77;
++
++	if (v == REDIRECTED && !has_composite(dpy))
++		return 77;
++
++	original_crtc = malloc(sizeof(XRRCrtcInfo *)*res->ncrtc);
++	if (original_crtc == NULL && res->ncrtc)
++		return 1;
++	for (i = 0; i < res->ncrtc; i++)
++		original_crtc[i] = XRRGetCrtcInfo(dpy, res, res->crtcs[i]);
++
++	for (i = 0; i < res->ncrtc; i++)
++		XRRSetCrtcConfig(dpy, res, res->crtcs[i], CurrentTime,
++				 0, 0, None, RR_Rotate_0, NULL, 0);
++
++	if (d != OFF) {
++		for (i = 0; i < res->noutput; i++) {
++			XRROutputInfo *output;
++			XRRModeInfo *mode;
++
++			output = XRRGetOutputInfo(dpy, res, res->outputs[i]);
++			if (output == NULL)
++				continue;
++
++			/* modes[0] and crtcs[0] may only be read if this output actually has them. */
++			mode = NULL;
++			if (output->nmode && output->ncrtc)
++				mode = lookup_mode(res, output->modes[0]);
++			if (mode == NULL)
++				continue;
++
++			XRRSetCrtcConfig(dpy, res, output->crtcs[0], CurrentTime,
++					 0, 0, output->modes[0], RR_Rotate_0, &res->outputs[i], 1);
++			width = mode->width;
++			height = mode->height;
++			break;
++		}
++		if (i == res->noutput) {
++			_x_error_occurred = 77;
++			goto restore;
++		}
++	}
++
++	if (w == ROOT) {
++		run(dpy, root);
++	} else if (w == FULLSCREEN) {
++		win = XCreateWindow(dpy, root,
++				    0, 0, width, height, 0,
++				    DefaultDepth(dpy, DefaultScreen(dpy)),
++				    InputOutput,
++				    DefaultVisual(dpy, DefaultScreen(dpy)),
++				    CWOverrideRedirect, &attr);
++		if (v == REDIRECTED) {
++			XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++			XDamageCreate(dpy, win, XDamageReportRawRectangles);
++		} else
++			fullscreen(dpy, win);
++		XMapWindow(dpy, win);
++		run(dpy, win);
++	} else if (w == WINDOW) {
++		win = XCreateWindow(dpy, root,
++				    0, 0, width/2, height/2, 0,
++				    DefaultDepth(dpy, DefaultScreen(dpy)),
++				    InputOutput,
++				    DefaultVisual(dpy, DefaultScreen(dpy)),
++				    CWOverrideRedirect, &attr);
++		if (v == REDIRECTED) {
++			XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++			XDamageCreate(dpy, win, XDamageReportRawRectangles);
++		}
++		XMapWindow(dpy, win);
++		run(dpy, win);
++	}
++
++restore:
++	for (i = 0; i < res->ncrtc; i++)
++		XRRSetCrtcConfig(dpy, res, res->crtcs[i], CurrentTime,
++				 0, 0, None, RR_Rotate_0, NULL, 0);
++
++	for (i = 0; i < res->ncrtc; i++)
++		XRRSetCrtcConfig(dpy, res, res->crtcs[i], CurrentTime,
++				 original_crtc[i]->x,
++				 original_crtc[i]->y,
++				 original_crtc[i]->mode,
++				 original_crtc[i]->rotation,
++				 original_crtc[i]->outputs,
++				 original_crtc[i]->noutput);
++
++	if (DPMSQueryExtension(dpy, &i, &i))
++		DPMSEnable(dpy);
++
++	XSync(dpy, True);
++	return _x_error_occurred;
++}
+diff --git a/configure.ac b/configure.ac
+index 61bea435..d13917ec 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -195,18 +195,24 @@ AC_ARG_ENABLE(udev,
+               [UDEV="$enableval"],
+               [UDEV=auto])
+ 
++udev_msg=" disabled"
+ if test "x$UDEV" != "xno"; then
+ 	PKG_CHECK_MODULES(UDEV, [libudev], [udev="yes"], [udev="no"])
++	AC_CHECK_HEADERS([sys/stat.h], [], [udev="no"])
+ 	if test "x$UDEV" = "xyes" -a "x$udev" != "xyes"; then
+ 		AC_MSG_ERROR([udev support requested but not found (libudev)])
+ 	fi
+ 	if test "x$udev" = "xyes"; then
+ 		AC_DEFINE(HAVE_UDEV,1,[Enable udev-based monitor hotplug detection])
++		udev_msg=" yes"
++	else
++		udev_msg=" no"
+ 	fi
+ fi
+ 
+-PKG_CHECK_MODULES(X11, [x11 xrender xrandr xext xfixes cairo cairo-xlib-xrender pixman-1 libpng], [x11="yes"], [x11="no"])
++PKG_CHECK_MODULES(X11, [x11 x11-xcb xcb-dri2 xcomposite xdamage xrender xrandr xext xfixes cairo cairo-xlib-xrender pixman-1 libpng], [x11="yes"], [x11="no"])
+ AM_CONDITIONAL(HAVE_X11, test "x$x11" = "xyes")
++echo X11_CFLAGS="$X11_CFLAGS" X11_LIBS="$X11_LIBS"
+ 
+ cpuid="yes"
+ AC_TRY_LINK([
+@@ -270,10 +276,13 @@ if test "x$shm" = "xyes"; then
+ 	AC_DEFINE([HAVE_MIT_SHM], 1, [Define to 1 if MIT-SHM is available])
+ fi
+ 
+-PKG_CHECK_MODULES(X11_DRI3, [xcb-dri3 xcb-sync xcb-present x11-xcb xshmfence x11 xrender xext libdrm], [x11_dri3="yes"], [x11_dri3="no"])
++PKG_CHECK_MODULES(X11_DRI3, [xcb-dri3 xcb-sync xcb-xfixes xcb-present x11-xcb xshmfence x11 xcomposite xdamage xrender xrandr xxf86vm xext libdrm], [x11_dri3="yes"], [x11_dri3="no"])
+ AM_CONDITIONAL(X11_DRI3, test "x$x11_dri3" = "xyes" -a "x$shm" = "xyes")
+ AM_CONDITIONAL(X11_SHM, test "x$shm" = "xyes")
+ 
++PKG_CHECK_MODULES(X11_VM, [xxf86vm], [x11_vm="yes"], [x11_vm="no"])
++AM_CONDITIONAL(X11_VM, test "x$x11_vm" = "xyes")
++
+ AC_ARG_ENABLE(tools,
+               AS_HELP_STRING([--disable-tools],
+ 			     [Enable building and installing the miscellaneous tools [default=auto]]),
+@@ -285,7 +294,7 @@ if test "x$shm" != "xyes"; then
+ 	tools="no"
+ fi
+ if test "x$tools" != "xno"; then
+-	ivo_requires="xrandr xdamage xfixes xcursor xtst xrender xext x11 pixman-1"
++	ivo_requires="xrandr xdamage xfixes xcursor xtst xrender xscrnsaver xext x11 pixman-1"
+ 	extra_cflags=""
+ 
+ 	ignore="xinerama"
+@@ -307,6 +316,8 @@ if test "x$tools" != "xno"; then
+ 		tools="no"
+ 	fi
+ 
++	PKG_CHECK_MODULES(TOOL_CURSOR, [xfixes x11 libpng], [cursor="yes"], [cursor="no"])
++
+ 	IVO_CFLAGS="$IVO_CFLAGS $extra_cflags"
+ fi
+ if test "x$tools" != "xno"; then
+@@ -315,6 +326,7 @@ fi
+ AC_MSG_CHECKING([whether to build additional tools])
+ AC_MSG_RESULT([$tools])
+ AM_CONDITIONAL(BUILD_TOOLS, test "x$tools" != "xno")
++AM_CONDITIONAL(BUILD_TOOL_CURSOR, test "x$cursor" = "xyes")
+ 
+ # Define a configure option for an alternate module directory
+ AC_ARG_WITH(xorg-module-dir,
+@@ -339,10 +351,20 @@ AC_ARG_ENABLE(dri2,
+ 	      [DRI2=$enableval],
+ 	      [DRI2=yes])
+ AC_ARG_ENABLE(dri3,
+-	      AS_HELP_STRING([--enable-dri3],
+-			     [Enable DRI3 support [[default=no]]]),
++	      AS_HELP_STRING([--disable-dri3],
++			     [Disable DRI3 support [[default=yes]]]),
+ 	      [DRI3=$enableval],
+-	      [DRI3=no])
++	      [DRI3=yes])
++AC_ARG_WITH(default-dri,
++	    AS_HELP_STRING([--with-default-dri],
++			   [Select the default maximum DRI level [default 2]]),
++	      [DRI_DEFAULT=$withval],
++	      [DRI_DEFAULT=2])
++if test "x$DRI_DEFAULT" = "x0"; then
++	AC_DEFINE(DEFAULT_DRI_LEVEL, 0,[Default DRI level])
++else
++	AC_DEFINE(DEFAULT_DRI_LEVEL, ~0, [Default DRI level])
++fi
+ 
+ AC_ARG_ENABLE(xvmc, AS_HELP_STRING([--disable-xvmc],
+                                   [Disable XvMC support [[default=yes]]]),
+@@ -375,14 +397,12 @@ AC_ARG_ENABLE(ums-only,
+ required_xorg_server_version=1.6
+ required_pixman_version=0.16
+ 
+-if pkg-config --exists 'pixman-1 >= 0.27.1'; then
+-	AC_DEFINE([HAS_PIXMAN_GLYPHS], 1, [Enable pixman glyph cache])
+-fi
+-
+-if pkg-config --exists 'pixman-1 >= 0.24.0'; then
+-	AC_DEFINE([HAS_PIXMAN_TRIANGLES], 1, [Enable pixman triangle rasterisation])
+-fi
+-
++PKG_CHECK_EXISTS([pixman-1 >= 0.24.0],
++		 [AC_DEFINE([HAS_PIXMAN_TRIANGLES], 1, [Enable pixman triangle rasterisation])],
++		 [])
++PKG_CHECK_EXISTS([pixman-1 >= 0.27.1],
++		 [AC_DEFINE([HAS_PIXMAN_GLYPHS], 1, [Enable pixman glyph cache])],
++		 [])
+ # Store the list of server defined optional extensions in REQUIRED_MODULES
+ XORG_DRIVER_CHECK_EXT(RANDR, randrproto)
+ XORG_DRIVER_CHECK_EXT(RENDER, renderproto)
+@@ -398,24 +418,25 @@ AC_ARG_ENABLE(sna,
+ 	      [SNA="$enableval"],
+ 	      [SNA=auto])
+ 
++AC_CHECK_HEADERS([dev/wscons/wsconsio.h])
++AC_FUNC_ALLOCA
++AC_HEADER_MAJOR
++
+ if test "x$SNA" != "xno"; then
+ 	AC_DEFINE(USE_SNA, 1, [Enable SNA support])
+ 	AC_CHECK_HEADERS([sys/sysinfo.h], AC_CHECK_MEMBERS([struct sysinfo.totalram], [], [], [[#include <sys/sysinfo.h>]]))
+ fi
+ 
+ uxa_requires_libdrm=2.4.52
++uxa_requires_pixman=0.24.0
++
+ AC_ARG_ENABLE(uxa,
+ 	      AS_HELP_STRING([--enable-uxa],
+ 			     [Enable Unified Acceleration Architecture (UXA) [default=auto]]),
+ 	      [UXA="$enableval"],
+ 	      [UXA=auto])
+ if test "x$UXA" = "xauto"; then
+-	if ! pkg-config --exists "libdrm_intel >= $uxa_requires_libdrm"; then
+-		UXA=no
+-	fi
+-	if ! pkg-config --exists 'pixman-1 >= 0.24.0'; then
+-		UXA=no
+-	fi
++	PKG_CHECK_EXISTS([libdrm_intel >= $uxa_requires_libdrm pixman-1 >= $uxa_requires_pixman], [], [UXA=no])
+ fi
+ if test "x$UXA" != "xno"; then
+ 	AC_DEFINE(USE_UXA, 1, [Enable UXA support])
+@@ -424,8 +445,10 @@ if test "x$UXA" != "xno"; then
+ 	UXA=yes
+ fi
+ 
+-PKG_CHECK_MODULES(XORG, [xorg-server >= $required_xorg_server_version xproto fontsproto pixman-1 >= $required_pixman_version $REQUIRED_MODULES])
++PKG_CHECK_MODULES(XORG, [xorg-server >= $required_xorg_server_version xproto fontsproto damageproto pixman-1 >= $required_pixman_version $REQUIRED_MODULES])
+ ABI_VERSION=`$PKG_CONFIG --variable=abi_videodrv xorg-server`
++XSERVER_VERSION=`$PKG_CONFIG --modversion xorg-server`
++PIXMAN_VERSION=`$PKG_CONFIG --modversion pixman-1`
+ 
+ if test "x$ONLY_UMS" = "xyes"; then
+ 	UMS="yes"
+@@ -519,7 +542,12 @@ AC_MSG_RESULT([$have_dri1])
+ AM_CONDITIONAL(DRI1, test "x$have_dri1" != "xno")
+ if test "x$have_dri1" != "xno"; then
+         AC_DEFINE(HAVE_DRI1,1,[Enable DRI1 driver support])
+-	dri_msg="$dri_msg DRI1"
++	str="DRI1"
++	if test "x$DRI_DEFAULT" = "x1"; then
++		AC_DEFINE(DEFAULT_DRI_LEVEL,1,[Default DRI level])
++		str="*$str"
++	fi
++	dri_msg="$dri_msg $str"
+ else
+         DRI1_CFLAGS=""
+         DRI1_LIBS=""
+@@ -576,7 +604,12 @@ AM_CONDITIONAL(DRI2, test "x$have_dri2" != "xno")
+ AC_MSG_RESULT([$have_dri2])
+ if test "x$have_dri2" != "xno"; then
+         AC_DEFINE(HAVE_DRI2,1,[Enable DRI2 driver support])
+-	dri_msg="$dri_msg DRI2"
++	str="DRI2"
++	if test "x$DRI_DEFAULT" = "x2"; then
++		AC_DEFINE(DEFAULT_DRI_LEVEL,2,[Default DRI level])
++		str="*$str"
++	fi
++	dri_msg="$dri_msg $str"
+ else
+ 	if test "x$DRI" = "xyes" -a "x$DRI2" != "xno" -a "x$KMS" = "xyes"; then
+ 		AC_MSG_ERROR([DRI2 requested but prerequisites not found])
+@@ -591,13 +624,21 @@ AM_CONDITIONAL(DRI3, test "x$have_dri3" != "xno")
+ AC_MSG_RESULT([$have_dri3])
+ if test "x$have_dri3" != "xno"; then
+         AC_DEFINE(HAVE_DRI3,1,[Enable DRI3 driver support])
+-	dri_msg="$dri_msg DRI3"
++	str="DRI3"
++	if test "x$DRI_DEFAULT" = "x3"; then
++		AC_DEFINE(DEFAULT_DRI_LEVEL,3,[Default DRI level])
++		str="*$str"
++	fi
++	dri_msg="$dri_msg $str"
+ else
+ 	if test "x$DRI" = "xyes" -a "x$DRI3" != "xno" -a "x$KMS" = "xyes"; then
+ 		AC_MSG_ERROR([DRI3 requested but prerequisites not found])
+ 	fi
+ fi
+ 
++AC_MSG_CHECKING([default DRI support])
++AC_MSG_RESULT([$DRI_DEFAULT])
++
+ AC_CHECK_HEADERS([X11/extensions/dpmsconst.h])
+ 
+ PRESENT="no"
+@@ -711,27 +752,6 @@ if test "x$TEARFREE" = "xyes"; then
+ 	xp_msg="$xp_msg TearFree"
+ fi
+ 
+-AC_ARG_ENABLE(rendernode,
+-	      AS_HELP_STRING([--enable-rendernode],
+-			     [Enable use of render nodes (experimental) [default=no]]),
+-	      [RENDERNODE="$enableval"],
+-	      [RENDERNODE="no"])
+-AM_CONDITIONAL(USE_RENDERNODE, test "x$RENDERNODE" = "xyes")
+-if test "x$RENDERNODE" = "xyes"; then
+-	AC_DEFINE(USE_RENDERNODE,1,[Assume "rendernode" support])
+-	xp_msg="$xp_msg rendernode"
+-fi
+-
+-AC_ARG_ENABLE(wc-mmap,
+-	      AS_HELP_STRING([--enable-wc-mmap],
+-			     [Enable use of WriteCombining mmaps [default=no]]),
+-	      [WC_MMAP="$enableval"],
+-	      [WC_MMAP="no"])
+-if test "x$WC_MMAP" = "xyes"; then
+-	AC_DEFINE(USE_WC_MMAP,1,[Enable use of WriteCombining mmaps])
+-	xp_msg="$xp_msg mmap(wc)"
+-fi
+-
+ AC_ARG_ENABLE(create2,
+ 	      AS_HELP_STRING([--enable-create2],
+ 			     [Enable use of create2 ioctl (experimental) [default=no]]),
+@@ -848,6 +868,7 @@ AC_CONFIG_FILES([
+                 xvmc/shader/mc/Makefile
+                 xvmc/shader/vld/Makefile
+ 		test/Makefile
++		benchmarks/Makefile
+ 		tools/Makefile
+ 		tools/org.x.xf86-video-intel.backlight-helper.policy
+ ])
+@@ -855,7 +876,7 @@ AC_OUTPUT
+ 
+ echo ""
+ echo ""
+-test -e `pwd $0`/README && cat `pwd $0`/README
++cat $srcdir/README
+ 
+ accel_msg=""
+ if test "x$SNA" != "xno"; then
+@@ -895,13 +916,15 @@ fi
+ 
+ echo ""
+ echo "AC_PACKAGE_STRING will be compiled with:"
+-echo "  Xorg Video ABI version: $ABI_VERSION"
++echo "  Xorg Video ABI version: $ABI_VERSION (xorg-server-$XSERVER_VERSION)"
++echo "  pixman version: pixman-1-$PIXMAN_VERSION"
+ echo "  Acceleration backends:$accel_msg"
+ echo "  Additional debugging support?$debug_msg"
+ echo "  Support for Kernel Mode Setting? $KMS"
+ echo "  Support for legacy User Mode Setting (for i810)? $UMS"
+ echo "  Support for Direct Rendering Infrastructure:$dri_msg"
+ echo "  Support for Xv motion compensation (XvMC and libXvMC):$xvmc_msg"
++echo "  Support for display hotplug notifications (udev):$udev_msg"
+ echo "  Build additional tools and utilities?$tools_msg"
+ if test -n "$xp_msg"; then
+ echo "  Experimental support:$xp_msg"
+diff --git a/libobj/alloca.c b/libobj/alloca.c
+new file mode 100644
+index 00000000..883e1e9f
+--- /dev/null
++++ b/libobj/alloca.c
+@@ -0,0 +1,4 @@
++#include <stddef.h> /* size_t */
++
++/* Stub alloca(): never allocates, always returns NULL. */
++void *alloca(size_t sz) { (void)sz; return NULL; }
+diff --git a/man/intel.man b/man/intel.man
+index 17515206..be398fbe 100644
+--- a/man/intel.man
++++ b/man/intel.man
+@@ -27,9 +27,9 @@ supports the i810, i810-DC100, i810e, i815, i830M, 845G, 852GM, 855GM,
+ 865G, 915G, 915GM, 945G, 945GM, 965G, 965Q, 946GZ, 965GM, 945GME,
+ G33, Q33, Q35, G35, GM45, G45, Q45, G43, G41 chipsets, Pineview-M in
+ Atom N400 series, Pineview-D in Atom D400/D500 series,
+-Intel(R) HD Graphics: 2000-6000,
+-Intel(R) Iris(TM) Graphics: 5100/6100, and
+-Intel(R) Iris(TM) Pro Graphics: 5200/6200/P6300.
++Intel(R) HD Graphics,
++Intel(R) Iris(TM) Graphics,
++Intel(R) Iris(TM) Pro Graphics.
+ 
+ .SH CONFIGURATION DETAILS
+ Please refer to __xconfigfile__(__filemansuffix__) for general configuration
+@@ -112,8 +112,8 @@ The default is 8192 if AGP allocable memory is < 128 MB, 16384 if < 192 MB,
+ 24576 if higher. DRI require at least a value of 16384. Higher values may give
+ better 3D performance, at expense of available system memory.
+ .TP
+-.BI "Option \*qNoAccel\*q \*q" boolean \*q
+-Disable or enable acceleration.
++.BI "Option \*qAccel\*q \*q" boolean \*q
++Enable or disable acceleration.
+ .IP
+ Default: acceleration is enabled.
+ 
+@@ -122,8 +122,8 @@ The following driver
+ .B Options
+ are supported for the 830M and later chipsets:
+ .TP
+-.BI "Option \*qNoAccel\*q \*q" boolean \*q
+-Disable or enable acceleration.
++.BI "Option \*qAccel\*q \*q" boolean \*q
++Enable or disable acceleration.
+ .IP
+ Default: acceleration is enabled.
+ .TP
+@@ -201,6 +201,16 @@ that choice by specifying the entry under /sys/class/backlight to use.
+ .IP
+ Default: Automatic selection.
+ .TP
++.BI "Option \*qCustomEDID\*q \*q" string \*q
++Override the probed EDID on particular outputs. Sometimes the manufacturer
++supplied EDID is corrupt or lacking a few usable modes and supplying a
++corrected EDID may be easier than specifying every modeline. This option
++allows to pass the path to load an EDID from per output. The format is a
++comma separated string of output:path pairs, e.g.
++DP1:/path/to/dp1.edid,DP2:/path/to/dp2.edid
++.IP
++Default: No override, use manufacturer supplied EDIDs.
++.TP
+ .BI "Option \*qFallbackDebug\*q \*q" boolean \*q
+ Enable printing of debugging information on acceleration fallbacks to the
+ server log.
+@@ -225,6 +235,15 @@ i.e. perform synchronous rendering.
+ .IP
+ Default: Disabled
+ .TP
++.BI "Option \*qHWRotation\*q \*q" boolean \*q
++Override the use of native hardware rotation and force the use of software,
++but GPU accelerated where possible, rotation. On some platforms the hardware
++can scanout directly into a rotated output bypassing the intermediate rendering
++and extra allocations required for software implemented rotation (i.e. native
++rotation uses less resources, is quicker and uses less power). This allows you
++to disable the native rotation in case of errors.
++.IP
++Default: Enabled (use hardware rotation)
+ .TP
+ .BI "Option \*qVSync\*q \*q" boolean \*q
+ This option controls the use of commands to synchronise rendering with the
+@@ -324,13 +343,29 @@ Default: 0
+ .BI "Option \*qZaphodHeads\*q \*q" string \*q
+ .IP
+ Specify the randr output(s) to use with zaphod mode for a particular driver
+-instance.  If you this option you must use it with all instances of the
+-driver
++instance.  If you set this option you must use it with all instances of the
++driver. By default, each head is assigned only one CRTC (which limits
++using multiple outputs with that head to cloned mode). CRTC can be manually
++assigned to individual heads by preceding the output names with a comma
++delimited list of pipe numbers followed by a colon. Note that different pipes
++may be limited in their functionality and some outputs may only work with
++different pipes.
+ .br
+ For example:
++
++.RS
+ .B
+ Option \*qZaphodHeads\*q \*qLVDS1,VGA1\*q
+-will assign xrandr outputs LVDS1 and VGA0 to this instance of the driver.
++
++will assign xrandr outputs LVDS1 and VGA1 to this instance of the driver.
++.RE
++
++.RS
++.B
++Option \*qZaphodHeads\*q \*q0,2:HDMI1,DP2\*q
++
++will assign xrandr outputs HDMI1 and DP2 and CRTCs 0 and 2 to this instance of the driver.
++.RE
+ 
+ .SH OUTPUT CONFIGURATION
+ On 830M and better chipsets, the driver supports runtime configuration of
+@@ -431,11 +466,11 @@ First DVI SDVO output
+ Second DVI SDVO output
+ 
+ .SS "TMDS-1", "TMDS-2", "HDMI-1", "HDMI-2"
+-DVI/HDMI outputs. Avaliable common properties include:
++DVI/HDMI outputs. Available common properties include:
+ .TP
+ \fBBROADCAST_RGB\fP - method used to set RGB color range
+ Adjusting this property allows you to set RGB color range on each
+-channel in order to match HDTV requirment(default 0 for full
++channel in order to match HDTV requirement(default 0 for full
+ range). Setting 1 means RGB color range is 16-235, 0 means RGB color
+ range is 0-255 on each channel.  (Full range is 0-255, not 16-235)
+ 
+diff --git a/src/backlight.c b/src/backlight.c
+index 9f239867..fcbb279f 100644
+--- a/src/backlight.c
++++ b/src/backlight.c
+@@ -34,6 +34,12 @@
+ #include <sys/stat.h>
+ #include <sys/ioctl.h>
+ 
++#if MAJOR_IN_MKDEV
++#include <sys/mkdev.h>
++#elif MAJOR_IN_SYSMACROS
++#include <sys/sysmacros.h>
++#endif
++
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+@@ -42,6 +48,7 @@
+ #include <fcntl.h>
+ #include <unistd.h>
+ #include <dirent.h>
++#include <errno.h>
+ 
+ #include <xorg-server.h>
+ #include <xf86.h>
+@@ -84,7 +91,7 @@ void backlight_init(struct backlight *b)
+ 	b->has_power = 0;
+ }
+ 
+-#ifdef __OpenBSD__
++#ifdef HAVE_DEV_WSCONS_WSCONSIO_H
+ 
+ #include <dev/wscons/wsconsio.h>
+ #include <xf86Priv.h>
+@@ -122,6 +129,11 @@ int backlight_get(struct backlight *b)
+ 	return param.curval;
+ }
+ 
++char *backlight_find_for_device(struct pci_device *pci)
++{
++	return NULL;
++}
++
+ int backlight_open(struct backlight *b, char *iface)
+ {
+ 	struct wsdisplay_param param;
+@@ -146,12 +158,9 @@ int backlight_open(struct backlight *b, char *iface)
+ 	return param.curval;
+ }
+ 
+-enum backlight_type backlight_exists(const char *iface)
++int backlight_exists(const char *iface)
+ {
+-	if (iface != NULL)
+-		return BL_NONE;
+-
+-	return BL_PLATFORM;
++	return iface == NULL;
+ }
+ 
+ int backlight_on(struct backlight *b)
+@@ -163,6 +172,7 @@ int backlight_off(struct backlight *b)
+ {
+ 	return 0;
+ }
++
+ #else
+ 
+ static int
+@@ -213,6 +223,24 @@ __backlight_read(const char *iface, const char *file)
+ }
+ 
+ static int
++writen(int fd, const char *value, int len)
++{
++	int ret;
++
++	/* Write all len bytes of value to fd, retrying on EAGAIN/EINTR; 0 on success, <0 on error. */
++	do {
++		ret = write(fd, value, len);
++		if (ret < 0) {
++			if (errno != EAGAIN && errno != EINTR)
++				return ret;
++			ret = 0; /* retry: a bare continue would still apply ret == -1 to value/len below */
++		}
++	} while (value += ret, len -= ret);
++
++	return 0;
++}
++
++static int
+ __backlight_write(const char *iface, const char *file, const char *value)
+ {
+ 	int fd, ret;
+@@ -221,7 +249,7 @@ __backlight_write(const char *iface, const char *file, const char *value)
+ 	if (fd < 0)
+ 		return -1;
+ 
+-	ret = write(fd, value, strlen(value)+1);
++	ret = writen(fd, value, strlen(value)+1);
+ 	close(fd);
+ 
+ 	return ret;
+@@ -244,10 +272,10 @@ static const char *known_interfaces[] = {
+ 	"intel_backlight",
+ };
+ 
+-static enum backlight_type __backlight_type(const char *iface)
++static int __backlight_type(const char *iface)
+ {
+ 	char buf[1024];
+-	int fd, v;
++	int fd, v, i;
+ 
+ 	v = -1;
+ 	fd = __backlight_open(iface, "type", O_RDONLY);
+@@ -261,39 +289,41 @@ static enum backlight_type __backlight_type(const char *iface)
+ 		buf[v] = '\0';
+ 
+ 		if (strcmp(buf, "raw") == 0)
+-			v = BL_RAW;
++			v = BL_RAW << 8;
+ 		else if (strcmp(buf, "platform") == 0)
+-			v = BL_PLATFORM;
++			v = BL_PLATFORM << 8;
+ 		else if (strcmp(buf, "firmware") == 0)
+-			v = BL_FIRMWARE;
++			v = BL_FIRMWARE << 8;
+ 		else
+-			v = BL_NAMED;
++			v = BL_NAMED << 8;
+ 	} else
+-		v = BL_NAMED;
++		v = BL_NAMED << 8;
+ 
+-	if (v == BL_NAMED) {
+-		int i;
+-		for (i = 0; i < ARRAY_SIZE(known_interfaces); i++) {
+-			if (strcmp(iface, known_interfaces[i]) == 0)
+-				break;
+-		}
+-		v += i;
++	for (i = 0; i < ARRAY_SIZE(known_interfaces); i++) {
++		if (strcmp(iface, known_interfaces[i]) == 0)
++			break;
+ 	}
++	v += i;
+ 
+ 	return v;
+ }
+ 
+-enum backlight_type backlight_exists(const char *iface)
++static int __backlight_exists(const char *iface)
+ {
+ 	if (__backlight_read(iface, "brightness") < 0)
+-		return BL_NONE;
++		return -1;
+ 
+ 	if (__backlight_read(iface, "max_brightness") <= 0)
+-		return BL_NONE;
++		return -1;
+ 
+ 	return __backlight_type(iface);
+ }
+ 
++int backlight_exists(const char *iface)
++{
++	return __backlight_exists(iface) != -1;
++}
++
+ static int __backlight_init(struct backlight *b, char *iface, int fd)
+ {
+ 	b->fd = fd_move_cloexec(fd_set_nonblock(fd));
+@@ -399,7 +429,50 @@ __backlight_find(void)
+ 			continue;
+ 
+ 		/* Fallback to priority list of known iface for old kernels */
+-		v = backlight_exists(de->d_name);
++		v = __backlight_exists(de->d_name);
++		if (v < 0)
++			continue;
++
++		if (v < best_type) {
++			char *copy = strdup(de->d_name);
++			if (copy) {
++				free(best_iface);
++				best_iface = copy;
++				best_type = v;
++			}
++		}
++	}
++	closedir(dir);
++
++	return best_iface;
++}
++
++char *backlight_find_for_device(struct pci_device *pci)
++{
++	char path[200];
++	unsigned best_type = INT_MAX;
++	char *best_iface = NULL;
++	DIR *dir;
++	struct dirent *de;
++
++	snprintf(path, sizeof(path),
++		 "/sys/bus/pci/devices/%04x:%02x:%02x.%d/backlight",
++		 pci->domain, pci->bus, pci->dev, pci->func);
++
++	dir = opendir(path);
++	if (dir == NULL)
++		return NULL;
++
++	while ((de = readdir(dir))) {
++		int v;
++
++		if (*de->d_name == '.')
++			continue;
++
++		v = __backlight_exists(de->d_name);
++		if (v < 0)
++			continue;
++
+ 		if (v < best_type) {
+ 			char *copy = strdup(de->d_name);
+ 			if (copy) {
+@@ -416,14 +489,17 @@ __backlight_find(void)
+ 
+ int backlight_open(struct backlight *b, char *iface)
+ {
+-	int level;
++	int level, type;
+ 
+ 	if (iface == NULL)
+ 		iface = __backlight_find();
+ 	if (iface == NULL)
+ 		goto err;
+ 
+-	b->type = __backlight_type(iface);
++	type = __backlight_type(iface);
++	if (type < 0)
++		goto err;
++	b->type = type >> 8;
+ 
+ 	b->max = __backlight_read(iface, "max_brightness");
+ 	if (b->max <= 0)
+@@ -447,7 +523,7 @@ err:
+ int backlight_set(struct backlight *b, int level)
+ {
+ 	char val[BACKLIGHT_VALUE_LEN];
+-	int len, ret = 0;
++	int len;
+ 
+ 	if (b->iface == NULL)
+ 		return 0;
+@@ -456,10 +532,7 @@ int backlight_set(struct backlight *b, int level)
+ 		level = b->max;
+ 
+ 	len = snprintf(val, BACKLIGHT_VALUE_LEN, "%d\n", level);
+-	if (write(b->fd, val, len) != len)
+-		ret = -1;
+-
+-	return ret;
++	return writen(b->fd, val, len);
+ }
+ 
+ int backlight_get(struct backlight *b)
+@@ -517,43 +590,6 @@ void backlight_disable(struct backlight *b)
+ void backlight_close(struct backlight *b)
+ {
+ 	backlight_disable(b);
+-	if (b->pid)
++	if (b->pid > 0)
+ 		waitpid(b->pid, NULL, 0);
+ }
+-
+-char *backlight_find_for_device(struct pci_device *pci)
+-{
+-	char path[200];
+-	unsigned best_type = INT_MAX;
+-	char *best_iface = NULL;
+-	DIR *dir;
+-	struct dirent *de;
+-
+-	snprintf(path, sizeof(path),
+-		 "/sys/bus/pci/devices/%04x:%02x:%02x.%d/backlight",
+-		 pci->domain, pci->bus, pci->dev, pci->func);
+-
+-	dir = opendir(path);
+-	if (dir == NULL)
+-		return NULL;
+-
+-	while ((de = readdir(dir))) {
+-		int v;
+-
+-		if (*de->d_name == '.')
+-			continue;
+-
+-		v = backlight_exists(de->d_name);
+-		if (v < best_type) {
+-			char *copy = strdup(de->d_name);
+-			if (copy) {
+-				free(best_iface);
+-				best_iface = copy;
+-				best_type = v;
+-			}
+-		}
+-	}
+-	closedir(dir);
+-
+-	return best_iface;
+-}
+diff --git a/src/backlight.h b/src/backlight.h
+index bb0e28bc..ba17755b 100644
+--- a/src/backlight.h
++++ b/src/backlight.h
+@@ -43,7 +43,7 @@ struct backlight {
+ 	int pid, fd;
+ };
+ 
+-enum backlight_type backlight_exists(const char *iface);
++int backlight_exists(const char *iface);
+ 
+ void backlight_init(struct backlight *backlight);
+ int backlight_open(struct backlight *backlight, char *iface);
+diff --git a/src/compat-api.h b/src/compat-api.h
+index d09e1fb3..05797a08 100644
+--- a/src/compat-api.h
++++ b/src/compat-api.h
+@@ -30,6 +30,7 @@
+ 
+ #include <xorg-server.h>
+ #include <xorgVersion.h>
++#include <xf86Module.h>
+ 
+ #include <picturestr.h>
+ #ifndef GLYPH_HAS_GLYPH_PICTURE_ACCESSOR
+@@ -39,7 +40,17 @@
+ 
+ #ifndef XF86_HAS_SCRN_CONV
+ #define xf86ScreenToScrn(s) xf86Screens[(s)->myNum]
++#if XORG_VERSION_CURRENT < XORG_VERSION_NUMERIC(1,1,0,0,0)
+ #define xf86ScrnToScreen(s) screenInfo.screens[(s)->scrnIndex]
++#else
++#define xf86ScrnToScreen(s) ((s)->pScreen)
++#endif
++#else
++#define xf86ScrnToScreen(s) ((s)->pScreen)
++#endif
++
++#if GET_ABI_MAJOR(ABI_VIDEODRV_VERSION) >= 22
++#define HAVE_NOTIFY_FD 1
+ #endif
+ 
+ #ifndef XF86_SCRN_INTERFACE
+@@ -131,6 +142,17 @@ region_rects(const RegionRec *r)
+ 	return r->data ? (const BoxRec *)(r->data + 1) :  &r->extents;
+ }
+ 
++/* Report the region's boxes as the half-open range [*s, *e). */
++inline static void
++region_get_boxes(const RegionRec *r, const BoxRec **s, const BoxRec **e)
++{
++	/* without a data block the region is just its extents box */
++	const int lone = r->data == NULL;
++
++	*s = lone ? &r->extents : region_boxptr(r);
++	*e = *s + (lone ? 1 : (int)r->data->numRects);
++}
++
+ #ifndef INCLUDE_LEGACY_REGION_DEFINES
+ #define RegionCreate(r, s) REGION_CREATE(NULL, r, s)
+ #define RegionBreak(r) REGION_BREAK(NULL, r)
+@@ -223,4 +245,19 @@ static inline void FreePixmap(PixmapPtr pixmap)
+ 			  dstx, dsty)
+ #endif
+ 
++#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,12,99,901,0)
++#define isGPU(S) (S)->is_gpu
++#else
++#define isGPU(S) 0
++#endif
++
++#if HAS_DIRTYTRACKING_ROTATION
++#define PixmapSyncDirtyHelper(d, dd) PixmapSyncDirtyHelper(d)
++#endif
++
++#if !HAVE_NOTIFY_FD /* no NotifyFd API: fall back to general sockets (cb, mode, data are dropped) */
++#define SetNotifyFd(fd, cb, mode, data) AddGeneralSocket(fd)
++#define RemoveNotifyFd(fd) RemoveGeneralSocket(fd)
++#endif
++
+ #endif
+diff --git a/src/i915_pciids.h b/src/i915_pciids.h
+index 180ad0e6..466c7159 100644
+--- a/src/i915_pciids.h
++++ b/src/i915_pciids.h
+@@ -134,7 +134,7 @@
+ #define INTEL_IVB_Q_IDS(info) \
+ 	INTEL_QUANTA_VGA_DEVICE(info) /* Quanta transcode */
+ 
+-#define INTEL_HSW_D_IDS(info) \
++#define INTEL_HSW_IDS(info) \
+ 	INTEL_VGA_DEVICE(0x0402, info), /* GT1 desktop */ \
+ 	INTEL_VGA_DEVICE(0x0412, info), /* GT2 desktop */ \
+ 	INTEL_VGA_DEVICE(0x0422, info), /* GT3 desktop */ \
+@@ -179,9 +179,7 @@
+ 	INTEL_VGA_DEVICE(0x0D2B, info), /* CRW GT3 reserved */ \
+ 	INTEL_VGA_DEVICE(0x0D0E, info), /* CRW GT1 reserved */ \
+ 	INTEL_VGA_DEVICE(0x0D1E, info), /* CRW GT2 reserved */ \
+-	INTEL_VGA_DEVICE(0x0D2E, info)  /* CRW GT3 reserved */ \
+-
+-#define INTEL_HSW_M_IDS(info) \
++	INTEL_VGA_DEVICE(0x0D2E, info),  /* CRW GT3 reserved */ \
+ 	INTEL_VGA_DEVICE(0x0406, info), /* GT1 mobile */ \
+ 	INTEL_VGA_DEVICE(0x0416, info), /* GT2 mobile */ \
+ 	INTEL_VGA_DEVICE(0x0426, info), /* GT2 mobile */ \
+@@ -198,60 +196,48 @@
+ 	INTEL_VGA_DEVICE(0x0D16, info), /* CRW GT2 mobile */ \
+ 	INTEL_VGA_DEVICE(0x0D26, info)  /* CRW GT3 mobile */
+ 
+-#define INTEL_VLV_M_IDS(info) \
++#define INTEL_VLV_IDS(info) \
+ 	INTEL_VGA_DEVICE(0x0f30, info), \
+ 	INTEL_VGA_DEVICE(0x0f31, info), \
+ 	INTEL_VGA_DEVICE(0x0f32, info), \
+ 	INTEL_VGA_DEVICE(0x0f33, info), \
+-	INTEL_VGA_DEVICE(0x0157, info)
+-
+-#define INTEL_VLV_D_IDS(info) \
++	INTEL_VGA_DEVICE(0x0157, info), \
+ 	INTEL_VGA_DEVICE(0x0155, info)
+ 
+-#define _INTEL_BDW_M(gt, id, info) \
+-	INTEL_VGA_DEVICE((((gt) - 1) << 4) | (id), info)
+-#define _INTEL_BDW_D(gt, id, info) \
+-	INTEL_VGA_DEVICE((((gt) - 1) << 4) | (id), info)
+-
+-#define _INTEL_BDW_M_IDS(gt, info) \
+-	_INTEL_BDW_M(gt, 0x1602, info), /* ULT */ \
+-	_INTEL_BDW_M(gt, 0x1606, info), /* ULT */ \
+-	_INTEL_BDW_M(gt, 0x160B, info), /* Iris */ \
+-	_INTEL_BDW_M(gt, 0x160E, info) /* ULX */
+-
+-#define _INTEL_BDW_D_IDS(gt, info) \
+-	_INTEL_BDW_D(gt, 0x160A, info), /* Server */ \
+-	_INTEL_BDW_D(gt, 0x160D, info) /* Workstation */
+-
+-#define INTEL_BDW_GT12M_IDS(info) \
+-	_INTEL_BDW_M_IDS(1, info), \
+-	_INTEL_BDW_M_IDS(2, info)
+-
+-#define INTEL_BDW_GT12D_IDS(info) \
+-	_INTEL_BDW_D_IDS(1, info), \
+-	_INTEL_BDW_D_IDS(2, info)
+-
+-#define INTEL_BDW_GT3M_IDS(info) \
+-	_INTEL_BDW_M_IDS(3, info)
+-
+-#define INTEL_BDW_GT3D_IDS(info) \
+-	_INTEL_BDW_D_IDS(3, info)
+-
+-#define INTEL_BDW_RSVDM_IDS(info) \
+-	_INTEL_BDW_M_IDS(4, info)
+-
+-#define INTEL_BDW_RSVDD_IDS(info) \
+-	_INTEL_BDW_D_IDS(4, info)
+-
+-#define INTEL_BDW_M_IDS(info) \
+-	INTEL_BDW_GT12M_IDS(info), \
+-	INTEL_BDW_GT3M_IDS(info), \
+-	INTEL_BDW_RSVDM_IDS(info)
+-
+-#define INTEL_BDW_D_IDS(info) \
+-	INTEL_BDW_GT12D_IDS(info), \
+-	INTEL_BDW_GT3D_IDS(info), \
+-	INTEL_BDW_RSVDD_IDS(info)
++#define INTEL_BDW_GT12_IDS(info)  \
++	INTEL_VGA_DEVICE(0x1602, info), /* GT1 ULT */ \
++	INTEL_VGA_DEVICE(0x1606, info), /* GT1 ULT */ \
++	INTEL_VGA_DEVICE(0x160B, info), /* GT1 Iris */ \
++	INTEL_VGA_DEVICE(0x160E, info), /* GT1 ULX */ \
++	INTEL_VGA_DEVICE(0x1612, info), /* GT2 Halo */ \
++	INTEL_VGA_DEVICE(0x1616, info), /* GT2 ULT */ \
++	INTEL_VGA_DEVICE(0x161B, info), /* GT2 ULT */ \
++	INTEL_VGA_DEVICE(0x161E, info),  /* GT2 ULX */ \
++	INTEL_VGA_DEVICE(0x160A, info), /* GT1 Server */ \
++	INTEL_VGA_DEVICE(0x160D, info), /* GT1 Workstation */ \
++	INTEL_VGA_DEVICE(0x161A, info), /* GT2 Server */ \
++	INTEL_VGA_DEVICE(0x161D, info)  /* GT2 Workstation */
++
++#define INTEL_BDW_GT3_IDS(info) \
++	INTEL_VGA_DEVICE(0x1622, info), /* ULT */ \
++	INTEL_VGA_DEVICE(0x1626, info), /* ULT */ \
++	INTEL_VGA_DEVICE(0x162B, info), /* Iris */ \
++	INTEL_VGA_DEVICE(0x162E, info),  /* ULX */\
++	INTEL_VGA_DEVICE(0x162A, info), /* Server */ \
++	INTEL_VGA_DEVICE(0x162D, info)  /* Workstation */
++
++#define INTEL_BDW_RSVD_IDS(info) \
++	INTEL_VGA_DEVICE(0x1632, info), /* ULT */ \
++	INTEL_VGA_DEVICE(0x1636, info), /* ULT */ \
++	INTEL_VGA_DEVICE(0x163B, info), /* Iris */ \
++	INTEL_VGA_DEVICE(0x163E, info), /* ULX */ \
++	INTEL_VGA_DEVICE(0x163A, info), /* Server */ \
++	INTEL_VGA_DEVICE(0x163D, info)  /* Workstation */
++
++#define INTEL_BDW_IDS(info) \
++	INTEL_BDW_GT12_IDS(info), \
++	INTEL_BDW_GT3_IDS(info), \
++	INTEL_BDW_RSVD_IDS(info)
+ 
+ #define INTEL_CHV_IDS(info) \
+ 	INTEL_VGA_DEVICE(0x22b0, info), \
+@@ -259,21 +245,85 @@
+ 	INTEL_VGA_DEVICE(0x22b2, info), \
+ 	INTEL_VGA_DEVICE(0x22b3, info)
+ 
+-#define INTEL_SKL_IDS(info) \
+-	INTEL_VGA_DEVICE(0x1916, info), /* ULT GT2 */ \
++#define INTEL_SKL_GT1_IDS(info)	\
+ 	INTEL_VGA_DEVICE(0x1906, info), /* ULT GT1 */ \
+-	INTEL_VGA_DEVICE(0x1926, info), /* ULT GT3 */ \
+-	INTEL_VGA_DEVICE(0x1921, info), /* ULT GT2F */ \
+ 	INTEL_VGA_DEVICE(0x190E, info), /* ULX GT1 */ \
++	INTEL_VGA_DEVICE(0x1902, info), /* DT  GT1 */ \
++	INTEL_VGA_DEVICE(0x190B, info), /* Halo GT1 */ \
++	INTEL_VGA_DEVICE(0x190A, info) /* SRV GT1 */
++
++#define INTEL_SKL_GT2_IDS(info)	\
++	INTEL_VGA_DEVICE(0x1916, info), /* ULT GT2 */ \
++	INTEL_VGA_DEVICE(0x1921, info), /* ULT GT2F */ \
+ 	INTEL_VGA_DEVICE(0x191E, info), /* ULX GT2 */ \
+ 	INTEL_VGA_DEVICE(0x1912, info), /* DT  GT2 */ \
+-	INTEL_VGA_DEVICE(0x1902, info), /* DT  GT1 */ \
+ 	INTEL_VGA_DEVICE(0x191B, info), /* Halo GT2 */ \
+-	INTEL_VGA_DEVICE(0x192B, info), /* Halo GT3 */ \
+-	INTEL_VGA_DEVICE(0x190B, info), /* Halo GT1 */ \
+ 	INTEL_VGA_DEVICE(0x191A, info), /* SRV GT2 */ \
+-	INTEL_VGA_DEVICE(0x192A, info), /* SRV GT3 */ \
+-	INTEL_VGA_DEVICE(0x190A, info), /* SRV GT1 */ \
+ 	INTEL_VGA_DEVICE(0x191D, info)  /* WKS GT2 */
+ 
++#define INTEL_SKL_GT3_IDS(info) \
++	INTEL_VGA_DEVICE(0x1923, info), /* ULT GT3 */ \
++	INTEL_VGA_DEVICE(0x1926, info), /* ULT GT3 */ \
++	INTEL_VGA_DEVICE(0x1927, info), /* ULT GT3 */ \
++	INTEL_VGA_DEVICE(0x192B, info), /* Halo GT3 */ \
++	INTEL_VGA_DEVICE(0x192D, info)  /* SRV GT3 */
++
++#define INTEL_SKL_GT4_IDS(info) \
++	INTEL_VGA_DEVICE(0x1932, info), /* DT GT4 */ \
++	INTEL_VGA_DEVICE(0x193B, info), /* Halo GT4 */ \
++	INTEL_VGA_DEVICE(0x193D, info), /* WKS GT4 */ \
++	INTEL_VGA_DEVICE(0x192A, info), /* SRV GT4 */ \
++	INTEL_VGA_DEVICE(0x193A, info)  /* SRV GT4e */
++
++#define INTEL_SKL_IDS(info)	 \
++	INTEL_SKL_GT1_IDS(info), \
++	INTEL_SKL_GT2_IDS(info), \
++	INTEL_SKL_GT3_IDS(info), \
++	INTEL_SKL_GT4_IDS(info)
++
++#define INTEL_BXT_IDS(info) \
++	INTEL_VGA_DEVICE(0x0A84, info), \
++	INTEL_VGA_DEVICE(0x1A84, info), \
++	INTEL_VGA_DEVICE(0x1A85, info), \
++	INTEL_VGA_DEVICE(0x5A84, info), /* APL HD Graphics 505 */ \
++	INTEL_VGA_DEVICE(0x5A85, info)  /* APL HD Graphics 500 */
++
++#define INTEL_GLK_IDS(info) \
++	INTEL_VGA_DEVICE(0x3184, info), \
++	INTEL_VGA_DEVICE(0x3185, info)
++
++#define INTEL_KBL_GT1_IDS(info)	\
++	INTEL_VGA_DEVICE(0x5913, info), /* ULT GT1.5 */ \
++	INTEL_VGA_DEVICE(0x5915, info), /* ULX GT1.5 */ \
++	INTEL_VGA_DEVICE(0x5917, info), /* DT  GT1.5 */ \
++	INTEL_VGA_DEVICE(0x5906, info), /* ULT GT1 */ \
++	INTEL_VGA_DEVICE(0x590E, info), /* ULX GT1 */ \
++	INTEL_VGA_DEVICE(0x5902, info), /* DT  GT1 */ \
++	INTEL_VGA_DEVICE(0x5908, info), /* Halo GT1 */ \
++	INTEL_VGA_DEVICE(0x590B, info), /* Halo GT1 */ \
++	INTEL_VGA_DEVICE(0x590A, info) /* SRV GT1 */
++
++#define INTEL_KBL_GT2_IDS(info)	\
++	INTEL_VGA_DEVICE(0x5916, info), /* ULT GT2 */ \
++	INTEL_VGA_DEVICE(0x5921, info), /* ULT GT2F */ \
++	INTEL_VGA_DEVICE(0x591E, info), /* ULX GT2 */ \
++	INTEL_VGA_DEVICE(0x5912, info), /* DT  GT2 */ \
++	INTEL_VGA_DEVICE(0x591B, info), /* Halo GT2 */ \
++	INTEL_VGA_DEVICE(0x591A, info), /* SRV GT2 */ \
++	INTEL_VGA_DEVICE(0x591D, info) /* WKS GT2 */
++
++#define INTEL_KBL_GT3_IDS(info) \
++	INTEL_VGA_DEVICE(0x5923, info), /* ULT GT3 */ \
++	INTEL_VGA_DEVICE(0x5926, info), /* ULT GT3 */ \
++	INTEL_VGA_DEVICE(0x5927, info) /* ULT GT3 */
++
++#define INTEL_KBL_GT4_IDS(info) \
++	INTEL_VGA_DEVICE(0x593B, info) /* Halo GT4 */
++
++#define INTEL_KBL_IDS(info) \
++	INTEL_KBL_GT1_IDS(info), \
++	INTEL_KBL_GT2_IDS(info), \
++	INTEL_KBL_GT3_IDS(info), \
++	INTEL_KBL_GT4_IDS(info)
++
+ #endif /* _I915_PCIIDS_H */
+diff --git a/src/intel_device.c b/src/intel_device.c
+index 140e1536..c4910cd8 100644
+--- a/src/intel_device.c
++++ b/src/intel_device.c
+@@ -38,6 +38,12 @@
+ #include <dirent.h>
+ #include <errno.h>
+ 
++#if MAJOR_IN_MKDEV
++#include <sys/mkdev.h>
++#elif MAJOR_IN_SYSMACROS
++#include <sys/sysmacros.h>
++#endif
++
+ #include <pciaccess.h>
+ 
+ #include <xorg-server.h>
+@@ -197,9 +203,15 @@ static inline struct intel_device *intel_device(ScrnInfoPtr scrn)
+ 	return xf86GetEntityPrivate(scrn->entityList[0], intel_device_key)->ptr;
+ }
+ 
++static const char *kernel_module_names[] ={
++	"i915",
++	NULL,
++};
++
+ static int is_i915_device(int fd)
+ {
+ 	drm_version_t version;
++	const char **kn;
+ 	char name[5] = "";
+ 
+ 	memset(&version, 0, sizeof(version));
+@@ -209,7 +221,22 @@ static int is_i915_device(int fd)
+ 	if (drmIoctl(fd, DRM_IOCTL_VERSION, &version))
+ 		return 0;
+ 
+-	return strcmp("i915", name) == 0;
++	for (kn = kernel_module_names; *kn; kn++)
++		if (strcmp(*kn, name) == 0)
++			return 1;
++
++	return 0;
++}
++
++static int load_i915_kernel_module(void)
++{
++	const char **kn;
++	/* Walk the NULL-terminated name list; return 0 at the first non-zero load result. */
++	for (kn = kernel_module_names; *kn; kn++)
++		if (xf86LoadKernelModule(*kn))
++			return 0;
++	/* no candidate name produced a non-zero result */
++	return -1;
+ }
+ 
+ static int is_i915_gem(int fd)
+@@ -336,7 +363,7 @@ static int __intel_open_device__pci(const struct pci_device *pci)
+ 
+ 		sprintf(path + base, "driver");
+ 		if (stat(path, &st)) {
+-			if (xf86LoadKernelModule("i915"))
++			if (load_i915_kernel_module())
+ 				return -1;
+ 			(void)xf86LoadKernelModule("fbcon");
+ 		}
+@@ -399,7 +426,7 @@ static int __intel_open_device__legacy(const struct pci_device *pci)
+ 
+ 	ret = drmCheckModesettingSupported(id);
+ 	if (ret) {
+-		if (xf86LoadKernelModule("i915"))
++		if (load_i915_kernel_module() == 0)
+ 			ret = drmCheckModesettingSupported(id);
+ 		if (ret)
+ 			return -1;
+@@ -461,9 +488,9 @@ static int is_render_node(int fd, struct stat *st)
+ 
+ static char *find_render_node(int fd)
+ {
+-#if defined(USE_RENDERNODE)
+ 	struct stat master, render;
+ 	char buf[128];
++	int i;
+ 
+ 	/* Are we a render-node ourselves? */
+ 	if (is_render_node(fd, &master))
+@@ -472,9 +499,17 @@ static char *find_render_node(int fd)
+ 	sprintf(buf, "/dev/dri/renderD%d", (int)((master.st_rdev | 0x80) & 0xbf));
+ 	if (stat(buf, &render) == 0 &&
+ 	    master.st_mode == render.st_mode &&
+-	    render.st_rdev == ((master.st_rdev | 0x80) & 0xbf))
++	    render.st_rdev == (master.st_rdev | 0x80))
+ 		return strdup(buf);
+-#endif
++
++	/* Misaligned card <-> renderD, do a full search */
++	for (i = 0; i < 16; i++) {
++		sprintf(buf, "/dev/dri/renderD%d", i + 128);
++		if (stat(buf, &render) == 0 &&
++		    master.st_mode == render.st_mode &&
++		    render.st_rdev == (master.st_rdev | 0x80))
++			return strdup(buf);
++	}
+ 
+ 	return NULL;
+ }
+@@ -608,6 +643,27 @@ err_path:
+ 	return -1;
+ }
+ 
++void intel_close_device(int entity_num)
++{
++	struct intel_device *dev;
++	/* Tear down the intel_device stored in entity_num's private slot, if any. */
++	if (intel_device_key == -1) /* presumably: private key never allocated — TODO confirm */
++		return;
++	/* detach it from the entity first so the slot never holds a freed pointer */
++	dev = xf86GetEntityPrivate(entity_num, intel_device_key)->ptr;
++	xf86GetEntityPrivate(entity_num, intel_device_key)->ptr = NULL;
++	if (!dev)
++		return;
++
++	if (dev->master_count == 0) /* Don't close server-fds */
++		close(dev->fd);
++	/* render_node can be the same pointer as master_node: free it only once */
++	if (dev->render_node != dev->master_node)
++		free(dev->render_node);
++	free(dev->master_node);
++	free(dev);
++}
++
+ int __intel_peek_fd(ScrnInfoPtr scrn)
+ {
+ 	struct intel_device *dev;
+@@ -672,6 +728,12 @@ struct intel_device *intel_get_device(ScrnInfoPtr scrn, int *fd)
+ 	return dev;
+ }
+ 
++const char *intel_get_master_name(struct intel_device *dev)
++{
++	assert(dev && dev->master_node); /* both must be set; nothing checks this once asserts are compiled out */
++	return dev->master_node; /* the device's own string, not a copy */
++}
++
+ const char *intel_get_client_name(struct intel_device *dev)
+ {
+ 	assert(dev && dev->render_node);
+diff --git a/src/intel_driver.h b/src/intel_driver.h
+index 28ed1a0e..bece88a0 100644
+--- a/src/intel_driver.h
++++ b/src/intel_driver.h
+@@ -124,9 +124,11 @@ int intel_entity_get_devid(int index);
+ int intel_open_device(int entity_num,
+ 		      const struct pci_device *pci,
+ 		      struct xf86_platform_device *dev);
++void intel_close_device(int entity_num);
+ int __intel_peek_fd(ScrnInfoPtr scrn);
+ struct intel_device *intel_get_device(ScrnInfoPtr scrn, int *fd);
+ int intel_has_render_node(struct intel_device *dev);
++const char *intel_get_master_name(struct intel_device *dev);
+ const char *intel_get_client_name(struct intel_device *dev);
+ int intel_get_client_fd(struct intel_device *dev);
+ int intel_get_device_id(struct intel_device *dev);
+diff --git a/src/intel_list.h b/src/intel_list.h
+index 51af825d..c8a3187a 100644
+--- a/src/intel_list.h
++++ b/src/intel_list.h
+@@ -306,8 +306,7 @@ list_is_empty(const struct list *head)
+     list_entry((ptr)->prev, type, member)
+ 
+ #define __container_of(ptr, sample, member)				\
+-    (void *)((char *)(ptr)						\
+-	     - ((char *)&(sample)->member - (char *)(sample)))
++    (void *)((char *)(ptr) - ((char *)&(sample)->member - (char *)(sample)))
+ /**
+  * Loop through the list given by head and set pos to struct in the list.
+  *
+@@ -392,17 +391,50 @@ static inline void list_move_tail(struct list *list, struct list *head)
+ #define list_last_entry(ptr, type, member) \
+     list_entry((ptr)->prev, type, member)
+ 
+-#define list_for_each_entry_reverse(pos, head, member)				\
++#define list_for_each_entry_reverse(pos, head, member)			\
+     for (pos = __container_of((head)->prev, pos, member);		\
+ 	 &pos->member != (head);					\
+ 	 pos = __container_of(pos->member.prev, pos, member))
+ 
+ #endif
+ 
++#define list_for_each_entry_safe_from(pos, tmp, head, member)		\
++    for (tmp = __container_of(pos->member.next, pos, member);		\
++	 &pos->member != (head);					\
++	 pos = tmp, tmp = __container_of(tmp->member.next, tmp, member))
++
+ #undef container_of
+ #define container_of(ptr, type, member) \
+ 	((type *)((char *)(ptr) - (char *) &((type *)0)->member))
+ 
++static inline void __list_splice(const struct list *list,
++				 struct list *prev,
++				 struct list *next)
++{
++	struct list *first = list->next;
++	struct list *last = list->prev;
++
++	first->prev = prev;
++	prev->next = first;
++
++	last->next = next;
++	next->prev = last;
++}
++/* Insert the entries of 'list' directly after 'head'; does nothing when 'list' is empty. */
++static inline void list_splice(const struct list *list,
++			       struct list *head)
++{
++	if (!list_is_empty(list))
++		__list_splice(list, head, head->next);
++}
++/* Insert the entries of 'list' directly before 'head' (the tail end); does nothing when 'list' is empty. */
++static inline void list_splice_tail(const struct list *list,
++				    struct list *head)
++{
++	if (!list_is_empty(list))
++		__list_splice(list, head->prev, head);
++}
++
+ static inline int list_is_singular(const struct list *list)
+ {
+ 	return list->next == list->prev;
+diff --git a/src/intel_module.c b/src/intel_module.c
+index 102d52aa..2e97b5ea 100644
+--- a/src/intel_module.c
++++ b/src/intel_module.c
+@@ -126,6 +126,17 @@ static const struct intel_device_info intel_skylake_info = {
+ 	.gen = 0110,
+ };
+ 
++static const struct intel_device_info intel_broxton_info = {
++	.gen = 0111,
++};
++
++static const struct intel_device_info intel_kabylake_info = {
++	.gen = 0112,
++};
++
++static const struct intel_device_info intel_geminilake_info = {
++	.gen = 0113,
++};
+ 
+ static const SymTabRec intel_chipsets[] = {
+ 	{PCI_CHIP_I810,				"i810"},
+@@ -234,30 +245,63 @@ static const SymTabRec intel_chipsets[] = {
+ 	{0x0157, "HD Graphics"},
+ 
+ 	/* Broadwell Marketing names */
+-	{0x1602, "HD graphics"},
+-	{0x1606, "HD graphics"},
+-	{0x160B, "HD graphics"},
+-	{0x160A, "HD graphics"},
+-	{0x160D, "HD graphics"},
+-	{0x160E, "HD graphics"},
+-	{0x1612, "HD graphics 5600"},
+-	{0x1616, "HD graphics 5500"},
+-	{0x161B, "HD graphics"},
+-	{0x161A, "HD graphics"},
+-	{0x161D, "HD graphics"},
+-	{0x161E, "HD graphics 5300"},
+-	{0x1622, "Iris Pro graphics 6200"},
+-	{0x1626, "HD graphics 6000"},
+-	{0x162B, "Iris graphics 6100"},
+-	{0x162A, "Iris Pro graphics P6300"},
+-	{0x162D, "HD graphics"},
+-	{0x162E, "HD graphics"},
+-	{0x1632, "HD graphics"},
+-	{0x1636, "HD graphics"},
+-	{0x163B, "HD graphics"},
+-	{0x163A, "HD graphics"},
+-	{0x163D, "HD graphics"},
+-	{0x163E, "HD graphics"},
++	{0x1602, "HD Graphics"},
++	{0x1606, "HD Graphics"},
++	{0x160B, "HD Graphics"},
++	{0x160A, "HD Graphics"},
++	{0x160D, "HD Graphics"},
++	{0x160E, "HD Graphics"},
++	{0x1612, "HD Graphics 5600"},
++	{0x1616, "HD Graphics 5500"},
++	{0x161B, "HD Graphics"},
++	{0x161A, "HD Graphics"},
++	{0x161D, "HD Graphics"},
++	{0x161E, "HD Graphics 5300"},
++	{0x1622, "Iris Pro Graphics 6200"},
++	{0x1626, "HD Graphics 6000"},
++	{0x162B, "Iris Graphics 6100"},
++	{0x162A, "Iris Pro Graphics P6300"},
++	{0x162D, "HD Graphics"},
++	{0x162E, "HD Graphics"},
++	{0x1632, "HD Graphics"},
++	{0x1636, "HD Graphics"},
++	{0x163B, "HD Graphics"},
++	{0x163A, "HD Graphics"},
++	{0x163D, "HD Graphics"},
++	{0x163E, "HD Graphics"},
++
++	/* Cherryview (Cherrytrail/Braswell) */
++	{0x22b0, "HD Graphics"},
++	{0x22b1, "HD Graphics"},
++	{0x22b2, "HD Graphics"},
++	{0x22b3, "HD Graphics"},
++
++	/* Skylake */
++	{0x1902, "HD Graphics 510"},
++	{0x1906, "HD Graphics 510"},
++	{0x190B, "HD Graphics 510"},
++	{0x1912, "HD Graphics 530"},
++	{0x1916, "HD Graphics 520"},
++	{0x191B, "HD Graphics 530"},
++	{0x191D, "HD Graphics P530"},
++	{0x191E, "HD Graphics 515"},
++	{0x1921, "HD Graphics 520"},
++	{0x1926, "Iris Graphics 540"},
++	{0x1927, "Iris Graphics 550"},
++	{0x192B, "Iris Graphics 555"},
++	{0x192D, "Iris Graphics P555"},
++	{0x1932, "Iris Pro Graphics 580"},
++	{0x193A, "Iris Pro Graphics P580"},
++	{0x193B, "Iris Pro Graphics 580"},
++	{0x193D, "Iris Pro Graphics P580"},
++
++	/* Broxton (Apollolake) */
++	{0x5A84, "HD Graphics 505"},
++	{0x5A85, "HD Graphics 500"},
++
++	/* Kabylake */
++	{0x5916, "HD Graphics 620"},
++	{0x591E, "HD Graphics 615"},
+ 
+ 	/* When adding new identifiers, also update:
+ 	 * 1. intel_identify()
+@@ -305,18 +349,14 @@ static const struct pci_id_match intel_device_match[] = {
+ 	INTEL_IVB_D_IDS(&intel_ivybridge_info),
+ 	INTEL_IVB_M_IDS(&intel_ivybridge_info),
+ 
+-	INTEL_HSW_D_IDS(&intel_haswell_info),
+-	INTEL_HSW_M_IDS(&intel_haswell_info),
+-
+-	INTEL_VLV_D_IDS(&intel_valleyview_info),
+-	INTEL_VLV_M_IDS(&intel_valleyview_info),
+-
+-	INTEL_BDW_D_IDS(&intel_broadwell_info),
+-	INTEL_BDW_M_IDS(&intel_broadwell_info),
+-
++	INTEL_HSW_IDS(&intel_haswell_info),
++	INTEL_VLV_IDS(&intel_valleyview_info),
++	INTEL_BDW_IDS(&intel_broadwell_info),
+ 	INTEL_CHV_IDS(&intel_cherryview_info),
+-
+ 	INTEL_SKL_IDS(&intel_skylake_info),
++	INTEL_BXT_IDS(&intel_broxton_info),
++	INTEL_KBL_IDS(&intel_kabylake_info),
++	INTEL_GLK_IDS(&intel_geminilake_info),
+ 
+ 	INTEL_VGA_DEVICE(PCI_MATCH_ANY, &intel_generic_info),
+ #endif
+@@ -448,9 +488,9 @@ static void intel_identify(int flags)
+ 	if (unique != stack)
+ 		free(unique);
+ 
+-	xf86Msg(X_INFO, INTEL_NAME ": Driver for Intel(R) HD Graphics: 2000-6000\n");
+-	xf86Msg(X_INFO, INTEL_NAME ": Driver for Intel(R) Iris(TM) Graphics: 5100, 6100\n");
+-	xf86Msg(X_INFO, INTEL_NAME ": Driver for Intel(R) Iris(TM) Pro Graphics: 5200, 6200, P6300\n");
++	xf86Msg(X_INFO, INTEL_NAME ": Driver for Intel(R) HD Graphics\n");
++	xf86Msg(X_INFO, INTEL_NAME ": Driver for Intel(R) Iris(TM) Graphics\n");
++	xf86Msg(X_INFO, INTEL_NAME ": Driver for Intel(R) Iris(TM) Pro Graphics\n");
+ }
+ 
+ static Bool intel_driver_func(ScrnInfoPtr pScrn,
+@@ -508,6 +548,9 @@ static enum accel_method { NOACCEL, SNA, UXA } get_accel_method(void)
+ 	if (hosted())
+ 		return SNA;
+ 
++	if (xf86configptr == NULL) /* X -configure */
++		return SNA;
++
+ 	dev = _xf86findDriver("intel", xf86configptr->conf_device_lst);
+ 	if (dev && dev->dev_option_lst) {
+ 		const char *s;
+@@ -582,10 +625,17 @@ intel_scrn_create(DriverPtr		driver,
+ 	case NOACCEL:
+ #endif
+ 	case UXA:
+-		  return intel_init_scrn(scrn);
++		return intel_init_scrn(scrn);
+ #endif
+ 
+-	default: break;
++	default:
++#if USE_SNA
++		return sna_init_scrn(scrn, entity_num);
++#elif USE_UXA
++		return intel_init_scrn(scrn);
++#else
++		break;
++#endif
+ 	}
+ #endif
+ 
+@@ -604,6 +654,8 @@ static Bool intel_pci_probe(DriverPtr		driver,
+ 			    struct pci_device	*pci,
+ 			    intptr_t		match_data)
+ {
++	Bool ret;
++
+ 	if (intel_open_device(entity_num, pci, NULL) == -1) {
+ #if UMS
+ 		switch (pci->device_id) {
+@@ -621,7 +673,11 @@ static Bool intel_pci_probe(DriverPtr		driver,
+ #endif
+ 	}
+ 
+-	return intel_scrn_create(driver, entity_num, match_data, 0);
++	ret = intel_scrn_create(driver, entity_num, match_data, 0);
++	if (!ret)
++		intel_close_device(entity_num);
++
++	return ret;
+ }
+ 
+ #ifdef XSERVER_PLATFORM_BUS
+@@ -644,9 +700,16 @@ intel_platform_probe(DriverPtr driver,
+ 
+ 	/* if we get any flags we don't understand fail to probe for now */
+ 	if (flags)
+-		return FALSE;
++		goto err;
++
++	if (!intel_scrn_create(driver, entity_num, match_data, scrn_flags))
++		goto err;
+ 
+-	return intel_scrn_create(driver, entity_num, match_data, scrn_flags);
++	return TRUE;
++
++err:
++	intel_close_device(entity_num);
++	return FALSE;
+ }
+ #endif
+ 
+diff --git a/src/intel_options.c b/src/intel_options.c
+index ff8541a4..7f253ac1 100644
+--- a/src/intel_options.c
++++ b/src/intel_options.c
+@@ -2,18 +2,24 @@
+ #include "config.h"
+ #endif
+ 
++#include <xorg-server.h>
++#include <xorgVersion.h>
++#include <xf86Parser.h>
++
+ #include "intel_options.h"
+ 
+ const OptionInfoRec intel_options[] = {
+-	{OPTION_ACCEL_DISABLE,	"NoAccel",	OPTV_BOOLEAN,	{0},	0},
++	{OPTION_ACCEL_ENABLE,	"Accel",	OPTV_BOOLEAN,	{0},	0},
+ 	{OPTION_ACCEL_METHOD,	"AccelMethod",	OPTV_STRING,	{0},	0},
+ 	{OPTION_BACKLIGHT,	"Backlight",	OPTV_STRING,	{0},	0},
++	{OPTION_EDID,		"CustomEDID",	OPTV_STRING,	{0},	0},
+ 	{OPTION_DRI,		"DRI",		OPTV_STRING,	{0},	0},
+ 	{OPTION_PRESENT,	"Present",	OPTV_BOOLEAN,	{0},	1},
+ 	{OPTION_COLOR_KEY,	"ColorKey",	OPTV_INTEGER,	{0},	0},
+ 	{OPTION_VIDEO_KEY,	"VideoKey",	OPTV_INTEGER,	{0},	0},
+ 	{OPTION_TILING_2D,	"Tiling",	OPTV_BOOLEAN,	{0},	1},
+ 	{OPTION_TILING_FB,	"LinearFramebuffer",	OPTV_BOOLEAN,	{0},	0},
++	{OPTION_ROTATION,	"HWRotation",	OPTV_BOOLEAN,	{0},	1},
+ 	{OPTION_VSYNC,		"VSync",	OPTV_BOOLEAN,	{0},	1},
+ 	{OPTION_PAGEFLIP,	"PageFlip",	OPTV_BOOLEAN,	{0},	1},
+ 	{OPTION_SWAPBUFFERS_WAIT, "SwapbuffersWait", OPTV_BOOLEAN,	{0},	1},
+@@ -21,7 +27,6 @@ const OptionInfoRec intel_options[] = {
+ 	{OPTION_PREFER_OVERLAY, "XvPreferOverlay", OPTV_BOOLEAN, {0}, 0},
+ 	{OPTION_HOTPLUG,	"HotPlug",	OPTV_BOOLEAN,	{0},	1},
+ 	{OPTION_REPROBE,	"ReprobeOutputs", OPTV_BOOLEAN,	{0},	0},
+-	{OPTION_DELETE_DP12,	"DeleteUnusedDP12Displays", OPTV_BOOLEAN,	{0},	0},
+ #ifdef INTEL_XVMC
+ 	{OPTION_XVMC,		"XvMC",		OPTV_BOOLEAN,	{0},	1},
+ #endif
+@@ -54,3 +59,85 @@ OptionInfoPtr intel_options_get(ScrnInfoPtr scrn)
+ 
+ 	return options;
+ }
++/* Read option 'id' as a boolean string, starting from the default 'val'; the lookup is compiled out before server 1.7.99.901. */
++Bool intel_option_cast_to_bool(OptionInfoPtr options, int id, Bool val)
++{
++#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,7,99,901,0)
++	xf86getBoolValue(&val, xf86GetOptValString(options, id)); /* presumably leaves val alone if unparsable — TODO confirm */
++#endif
++	return val;
++}
++
++/* Compare two keywords, ignoring letter case and any '_', ' ' or '\t' in */
++/* either string. Returns 0 when they match; a NULL string counts as "". */
++static int
++namecmp(const char *s1, const char *s2)
++{
++	char c1, c2;
++
++	if (!s2) /* the skip loops below read *s2 unconditionally */
++		s2 = "";
++	if (!s1 || *s1 == 0)
++		return *s2 != 0;
++
++	while (*s1 == '_' || *s1 == ' ' || *s1 == '\t')
++		s1++;
++	while (*s2 == '_' || *s2 == ' ' || *s2 == '\t')
++		s2++;
++
++	/* <ctype.h> takes an unsigned char value; a negative plain char is UB */
++	c1 = tolower((unsigned char)*s1);
++	c2 = tolower((unsigned char)*s2);
++	while (c1 == c2) {
++		if (c1 == '\0')
++			return 0;
++
++		s1++;
++		while (*s1 == '_' || *s1 == ' ' || *s1 == '\t')
++			s1++;
++
++		s2++;
++		while (*s2 == '_' || *s2 == ' ' || *s2 == '\t')
++			s2++;
++
++		c1 = tolower((unsigned char)*s1);
++		c2 = tolower((unsigned char)*s2);
++	}
++
++	return c1 - c2;
++}
++/* Read option 'id' as an unsigned number; 'val' is the result for an unset or merely "enabled" option. */
++unsigned intel_option_cast_to_unsigned(OptionInfoPtr options, int id, unsigned val)
++{
++#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,7,99,901,0)
++	const char *str = xf86GetOptValString(options, id);
++#else
++	const char *str = NULL;
++#endif
++	unsigned v;
++	/* option absent or empty: keep the caller's default */
++	if (str == NULL || *str == '\0')
++		return val;
++	/* affirmative keywords carry no number, so they select the default too */
++	if (namecmp(str, "on") == 0)
++		return val;
++	if (namecmp(str, "true") == 0)
++		return val;
++	if (namecmp(str, "yes") == 0)
++		return val;
++	/* "0" and the negative keywords all yield 0 */
++	if (namecmp(str, "0") == 0)
++		return 0;
++	if (namecmp(str, "off") == 0)
++		return 0;
++	if (namecmp(str, "false") == 0)
++		return 0;
++	if (namecmp(str, "no") == 0)
++		return 0;
++	/* anything else: use atoi()'s result when it is non-zero ... */
++	v = atoi(str);
++	if (v)
++		return v;
++	/* ... otherwise fall back to the default */
++	return val;
++}
+diff --git a/src/intel_options.h b/src/intel_options.h
+index 7e2cbd9b..43635f1f 100644
+--- a/src/intel_options.h
++++ b/src/intel_options.h
+@@ -12,15 +12,17 @@
+  */
+ 
+ enum intel_options {
+-	OPTION_ACCEL_DISABLE,
++	OPTION_ACCEL_ENABLE,
+ 	OPTION_ACCEL_METHOD,
+ 	OPTION_BACKLIGHT,
++	OPTION_EDID,
+ 	OPTION_DRI,
+ 	OPTION_PRESENT,
+ 	OPTION_VIDEO_KEY,
+ 	OPTION_COLOR_KEY,
+ 	OPTION_TILING_2D,
+ 	OPTION_TILING_FB,
++	OPTION_ROTATION,
+ 	OPTION_VSYNC,
+ 	OPTION_PAGEFLIP,
+ 	OPTION_SWAPBUFFERS_WAIT,
+@@ -28,7 +30,6 @@ enum intel_options {
+ 	OPTION_PREFER_OVERLAY,
+ 	OPTION_HOTPLUG,
+ 	OPTION_REPROBE,
+-	OPTION_DELETE_DP12,
+ #if defined(XvMCExtension) && defined(ENABLE_XVMC)
+ 	OPTION_XVMC,
+ #define INTEL_XVMC 1
+@@ -51,5 +52,7 @@ enum intel_options {
+ 
+ extern const OptionInfoRec intel_options[];
+ OptionInfoPtr intel_options_get(ScrnInfoPtr scrn);
++unsigned intel_option_cast_to_unsigned(OptionInfoPtr, int id, unsigned val);
++Bool intel_option_cast_to_bool(OptionInfoPtr, int id, Bool val);
+ 
+ #endif /* INTEL_OPTIONS_H */
+diff --git a/src/legacy/i810/i810_common.h b/src/legacy/i810/i810_common.h
+index 4cc10e8b..8355708c 100644
+--- a/src/legacy/i810/i810_common.h
++++ b/src/legacy/i810/i810_common.h
+@@ -52,7 +52,7 @@
+ 
+ #define ALIGN(i,m) (((i) + (m) - 1) & ~((m) - 1))
+ 
+-/* Using usleep() makes things noticably slow. */
++/* Using usleep() makes things noticeably slow. */
+ #if 0
+ #define DELAY(x) usleep(x)
+ #else
+@@ -185,7 +185,7 @@ enum {
+  *    - zbuffer linear offset and pitch -- also invarient
+  *    - drawing origin in back and depth buffers.
+  *
+- * Keep the depth/back buffer state here to acommodate private buffers
++ * Keep the depth/back buffer state here to accommodate private buffers
+  * in the future.
+  */
+ #define I810_DESTREG_DI0  0		/* CMD_OP_DESTBUFFER_INFO (2 dwords) */
+diff --git a/src/legacy/i810/i810_hwmc.c b/src/legacy/i810/i810_hwmc.c
+index 7cb9c1ab..58661b0a 100644
+--- a/src/legacy/i810/i810_hwmc.c
++++ b/src/legacy/i810/i810_hwmc.c
+@@ -171,7 +171,7 @@ static XF86MCAdaptorPtr ppAdapt[1] =
+  *
+  *  I810InitMC
+  *
+- *  Initialize the hardware motion compenstation extention for this 
++ *  Initialize the hardware motion compensation extension for this
+  *  hardware. The initialization routines want the address of the pointers
+  *  to the structures, not the address of the structures. This means we
+  *  allocate (or create static?) the pointer memory and pass that 
+diff --git a/src/legacy/i810/i810_memory.c b/src/legacy/i810/i810_memory.c
+index c3de2777..6f274836 100644
+--- a/src/legacy/i810/i810_memory.c
++++ b/src/legacy/i810/i810_memory.c
+@@ -76,7 +76,7 @@ I810AllocateGARTMemory(ScrnInfoPtr pScrn)
+    unsigned long size = pScrn->videoRam * 1024UL;
+    I810Ptr pI810 = I810PTR(pScrn);
+    int key;
+-   long tom = 0;
++   unsigned long tom = 0;
+    unsigned long physical;
+ 
+    if (!xf86AgpGARTSupported() || !xf86AcquireGART(pScrn->scrnIndex)) {
+@@ -132,8 +132,8 @@ I810AllocateGARTMemory(ScrnInfoPtr pScrn)
+     * Keep it 512K aligned for the sake of tiled regions.
+     */
+ 
+-   tom += 0x7ffff;
+-   tom &= ~0x7ffff;
++   tom += 0x7ffffUL;
++   tom &= ~0x7ffffUL;
+ 
+    if ((key = xf86AllocateGARTMemory(pScrn->scrnIndex, size, 1, NULL)) != -1) {
+       pI810->DcacheOffset = tom;
+diff --git a/src/legacy/i810/i810_reg.h b/src/legacy/i810/i810_reg.h
+index 54faeb3d..fa091c5b 100644
+--- a/src/legacy/i810/i810_reg.h
++++ b/src/legacy/i810/i810_reg.h
+@@ -245,7 +245,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  * not sure they refer to local (graphics) memory.
+  *
+  * These details are for the local memory control registers,
+- * (pp301-310).  The test machines are not equiped with local memory,
++ * (pp301-310).  The test machines are not equipped with local memory,
+  * so nothing is tested.  Only a single row seems to be supported.
+  */
+ #define DRAM_ROW_TYPE      0x3000
+diff --git a/src/legacy/i810/i810_video.c b/src/legacy/i810/i810_video.c
+index be49b91d..af683c81 100644
+--- a/src/legacy/i810/i810_video.c
++++ b/src/legacy/i810/i810_video.c
+@@ -77,7 +77,11 @@ static int I810PutImage( ScrnInfoPtr,
+ static int I810QueryImageAttributes(ScrnInfoPtr, 
+ 	int, unsigned short *, unsigned short *,  int *, int *);
+ 
++#if !HAVE_NOTIFY_FD
+ static void I810BlockHandler(BLOCKHANDLER_ARGS_DECL);
++#else
++static void I810BlockHandler(void *data, void *_timeout);
++#endif
+ 
+ #define MAKE_ATOM(a) MakeAtom(a, sizeof(a) - 1, TRUE)
+ 
+@@ -418,8 +422,14 @@ I810SetupImageVideo(ScreenPtr screen)
+ 
+     pI810->adaptor = adapt;
+ 
++#if !HAVE_NOTIFY_FD
+     pI810->BlockHandler = screen->BlockHandler;
+     screen->BlockHandler = I810BlockHandler;
++#else
++    RegisterBlockAndWakeupHandlers(I810BlockHandler,
++				   (ServerWakeupHandlerProcPtr)NoopDDA,
++				   pScrn);
++#endif
+ 
+     xvBrightness = MAKE_ATOM("XV_BRIGHTNESS");
+     xvContrast   = MAKE_ATOM("XV_CONTRAST");
+@@ -1135,6 +1145,7 @@ I810QueryImageAttributes(
+     return size;
+ }
+ 
++#if !HAVE_NOTIFY_FD
+ static void
+ I810BlockHandler (BLOCKHANDLER_ARGS_DECL)
+ {
+@@ -1172,6 +1183,38 @@ I810BlockHandler (BLOCKHANDLER_ARGS_DECL)
+         }
+     }
+ }
++#else
++static void
++I810BlockHandler(void *data, void *_timeout)
++{
++    ScrnInfoPtr pScrn = data;
++    I810Ptr      pI810 = I810PTR(pScrn);
++    I810PortPrivPtr pPriv = GET_PORT_PRIVATE(pScrn);
++    I810OverlayRegPtr overlay = (I810OverlayRegPtr) (pI810->FbBase + pI810->OverlayStart);
++    /* Only act while one of the video timers (off, then free) is pending. */
++    if(pPriv->videoStatus & TIMER_MASK) {
++	UpdateCurrentTime();
++	if(pPriv->videoStatus & OFF_TIMER) {
++	    if(pPriv->offTime < currentTime.milliseconds) {
++		/* Turn off the overlay */
++		overlay->OV0CMD &= 0xFFFFFFFE;
++		OVERLAY_UPDATE(pI810->OverlayPhysical);
++		/* then hand over to the free timer, FREE_DELAY ms from now */
++		pPriv->videoStatus = FREE_TIMER;
++		pPriv->freeTime = currentTime.milliseconds + FREE_DELAY;
++	    }
++	} else {  /* FREE_TIMER */
++	    if(pPriv->freeTime < currentTime.milliseconds) {
++		if(pPriv->linear) {
++		   xf86FreeOffscreenLinear(pPriv->linear);
++		   pPriv->linear = NULL;
++		}
++		pPriv->videoStatus = 0;
++	    }
++        }
++    }
++}
++#endif
+ 
+ 
+ /***************************************************************************
+@@ -1373,7 +1416,6 @@ I810DisplaySurface(
+       UpdateCurrentTime();
+       pI810Priv->videoStatus = FREE_TIMER;
+       pI810Priv->freeTime = currentTime.milliseconds + FREE_DELAY;
+-      pScrn->pScreen->BlockHandler = I810BlockHandler;
+     }
+ 
+     return Success;
+diff --git a/src/legacy/i810/xvmc/I810XvMC.c b/src/legacy/i810/xvmc/I810XvMC.c
+index e6b63d30..a538e999 100644
+--- a/src/legacy/i810/xvmc/I810XvMC.c
++++ b/src/legacy/i810/xvmc/I810XvMC.c
+@@ -61,7 +61,7 @@ static int event_base;
+ // Arguments: pI810XvMC private data structure from the current context.
+ // Notes: We faked the drmMapBufs for the i810's security so now we have
+ //   to insert an allocated page into the correct spot in the faked
+-//   list to keep up appearences.
++//   list to keep up appearances.
+ //   Concept for this function was taken from Mesa sources.
+ // Returns: drmBufPtr containing the information about the allocated page.
+ ***************************************************************************/
+@@ -188,7 +188,7 @@ _X_EXPORT Status XvMCCreateContext(Display *display, XvPortID port,
+ 
+   /* Check for drm */
+   if(! drmAvailable()) {
+-    printf("Direct Rendering is not avilable on this system!\n");
++    printf("Direct Rendering is not available on this system!\n");
+     return BadAlloc;
+   }
+ 
+@@ -3279,7 +3279,7 @@ _X_EXPORT Status XvMCSyncSurface(Display *display,XvMCSurface *surface) {
+ //   display - Connection to X server
+ //   surface - Surface to flush
+ // Info:
+-//   This command is a noop for i810 becuase we always dispatch buffers in
++//   This command is a noop for i810 because we always dispatch buffers in
+ //   render. There is little gain to be had with 4k buffers.
+ // Returns: Status
+ ***************************************************************************/
+diff --git a/src/render_program/exa_wm.g4i b/src/render_program/exa_wm.g4i
+index 5d3d45b1..587b581c 100644
+--- a/src/render_program/exa_wm.g4i
++++ b/src/render_program/exa_wm.g4i
+@@ -57,7 +57,7 @@ define(`mask_dw_dy', `g6.4<0,1,0>F')
+ define(`mask_wo',    `g6.12<0,1,0>F')
+ 
+ /*
+- * Local variables. Pairs must be aligned on even reg boundry
++ * Local variables. Pairs must be aligned on even reg boundary
+  */
+ 
+ /* this holds the X dest coordinates */
+diff --git a/src/render_program/exa_wm_yuv_rgb.g8a b/src/render_program/exa_wm_yuv_rgb.g8a
+index 7def0930..34973ba8 100644
+--- a/src/render_program/exa_wm_yuv_rgb.g8a
++++ b/src/render_program/exa_wm_yuv_rgb.g8a
+@@ -76,7 +76,7 @@ add (16)    Cbn<1>F		Cb<8,8,1>F	-0.501961F  { compr align1 };
+     /* 
+      * R = Y + Cr * 1.596
+      */
+-mov (8)    acc0<1>F		Yn<8,8,1>F		    { compr align1 };
++mov (8)    acc0<1>F		Yn_01<8,8,1>F		    { compr align1 };
+ mac.sat(8) src_sample_r_01<1>F	Crn_01<8,8,1>F	1.596F	    { compr align1 };
+      
+ mov (8)    acc0<1>F		Yn_23<8,8,1>F		    { compr align1 };
+@@ -84,7 +84,7 @@ mac.sat(8) src_sample_r_23<1>F	Crn_23<8,8,1>F	1.596F	    { compr align1 };
+     /*
+      * G = Crn * -0.813 + Cbn * -0.392 + Y
+      */
+-mov (8)    acc0<1>F		Yn_23<8,8,1>F		    { compr align1 };
++mov (8)    acc0<1>F		Yn_01<8,8,1>F		    { compr align1 };
+ mac (8)    acc0<1>F		Crn_01<8,8,1>F    	-0.813F	    { compr align1 };
+ mac.sat(8) src_sample_g_01<1>F	Cbn_01<8,8,1>F    	-0.392F	    { compr align1 };
+ 
+diff --git a/src/render_program/exa_wm_yuv_rgb.g8b b/src/render_program/exa_wm_yuv_rgb.g8b
+index 44949538..2cd6fc44 100644
+--- a/src/render_program/exa_wm_yuv_rgb.g8b
++++ b/src/render_program/exa_wm_yuv_rgb.g8b
+@@ -6,7 +6,7 @@
+    { 0x80600048, 0x21c03ae8, 0x3e8d02c0, 0x3fcc49ba },
+    { 0x00600001, 0x24003ae0, 0x008d0320, 0x00000000 },
+    { 0x80600048, 0x21e03ae8, 0x3e8d02e0, 0x3fcc49ba },
+-   { 0x00600001, 0x24003ae0, 0x008d0320, 0x00000000 },
++   { 0x00600001, 0x24003ae0, 0x008d0300, 0x00000000 },
+    { 0x00600048, 0x24003ae0, 0x3e8d02c0, 0xbf5020c5 },
+    { 0x80600048, 0x22003ae8, 0x3e8d0340, 0xbec8b439 },
+    { 0x00600001, 0x24003ae0, 0x008d0320, 0x00000000 },
+diff --git a/src/sna/Makefile.am b/src/sna/Makefile.am
+index e09a8d49..adf13963 100644
+--- a/src/sna/Makefile.am
++++ b/src/sna/Makefile.am
+@@ -107,6 +107,8 @@ libsna_la_SOURCES = \
+ 	gen8_render.h \
+ 	gen8_vertex.c \
+ 	gen8_vertex.h \
++	gen9_render.c \
++	gen9_render.h \
+ 	xassert.h \
+ 	$(NULL)
+ 
+diff --git a/src/sna/blt.c b/src/sna/blt.c
+index b5bfee69..cb90437a 100644
+--- a/src/sna/blt.c
++++ b/src/sna/blt.c
+@@ -30,112 +30,608 @@
+ #endif
+ 
+ #include "sna.h"
++#include <pixman.h>
+ 
+-#if __x86_64__
+-#define USE_SSE2 1
+-#endif
+-
+-#if USE_SSE2
++#if defined(sse2)
++#pragma GCC push_options
++#pragma GCC target("sse2,inline-all-stringops,fpmath=sse")
++#pragma GCC optimize("Ofast")
+ #include <xmmintrin.h>
+ 
+ #if __x86_64__
+ #define have_sse2() 1
+ #else
+-enum {
+-	MMX = 0x1,
+-	MMX_EXTENSIONS = 0x2,
+-	SSE = 0x6,
+-	SSE2 = 0x8,
+-	CMOV = 0x10
+-};
+-
+-#ifdef __GNUC__
+-static unsigned int
+-detect_cpu_features(void)
+-{
+-	unsigned int features;
+-	unsigned int result = 0;
+-
+-	char vendor[13];
+-	vendor[0] = 0;
+-	vendor[12] = 0;
+-
+-	asm (
+-	     "pushf\n"
+-	     "pop %%eax\n"
+-	     "mov %%eax, %%ecx\n"
+-	     "xor $0x00200000, %%eax\n"
+-	     "push %%eax\n"
+-	     "popf\n"
+-	     "pushf\n"
+-	     "pop %%eax\n"
+-	     "mov $0x0, %%edx\n"
+-	     "xor %%ecx, %%eax\n"
+-	     "jz 1f\n"
+-
+-	     "mov $0x00000000, %%eax\n"
+-	     "push %%ebx\n"
+-	     "cpuid\n"
+-	     "mov %%ebx, %%eax\n"
+-	     "pop %%ebx\n"
+-	     "mov %%eax, %1\n"
+-	     "mov %%edx, %2\n"
+-	     "mov %%ecx, %3\n"
+-	     "mov $0x00000001, %%eax\n"
+-	     "push %%ebx\n"
+-	     "cpuid\n"
+-	     "pop %%ebx\n"
+-	     "1:\n"
+-	     "mov %%edx, %0\n"
+-	     : "=r" (result), "=m" (vendor[0]), "=m" (vendor[4]), "=m" (vendor[8])
+-	     :: "%eax", "%ecx", "%edx");
+-
+-	features = 0;
+-	if (result) {
+-		/* result now contains the standard feature bits */
+-		if (result & (1 << 15))
+-			features |= CMOV;
+-		if (result & (1 << 23))
+-			features |= MMX;
+-		if (result & (1 << 25))
+-			features |= SSE;
+-		if (result & (1 << 26))
+-			features |= SSE2;
+-	}
+-	return features;
+-}
+-#else
+-static unsigned int detect_cpu_features(void) { return 0; }
+-#endif
+-
+ static bool have_sse2(void)
+ {
+ 	static int sse2_present = -1;
+ 
+ 	if (sse2_present == -1)
+-		sse2_present = detect_cpu_features() & SSE2;
++		sse2_present = sna_cpu_detect() & SSE2;
+ 
+ 	return sse2_present;
+ }
+ #endif
+ 
+-static inline __m128i
++static force_inline __m128i
+ xmm_create_mask_32(uint32_t mask)
+ {
+ 	return _mm_set_epi32(mask, mask, mask, mask);
+ }
+ 
+-static inline __m128i
++static force_inline __m128i
++xmm_load_128(const __m128i *src) /* 16-byte load via _mm_load_si128, which requires src to be 16-byte aligned */
++{
++	return _mm_load_si128(src);
++}
++
++static force_inline __m128i
+ xmm_load_128u(const __m128i *src)
+ {
+ 	return _mm_loadu_si128(src);
+ }
+ 
+-static inline void
++static force_inline void
+ xmm_save_128(__m128i *dst, __m128i data)
+ {
+ 	_mm_store_si128(dst, data);
+ }
++
++static force_inline void
++xmm_save_128u(__m128i *dst, __m128i data) /* 16-byte store via _mm_storeu_si128: no alignment requirement on dst */
++{
++	_mm_storeu_si128(dst, data);
++}
++
++static force_inline void
++to_sse128xN(uint8_t *dst, const uint8_t *src, int bytes) /* copies bytes / 128 blocks of 128 bytes: unaligned loads, aligned stores (dst must be 16-byte aligned); a remainder below 128 bytes is not copied */
++{
++	int i;
++
++	for (i = 0; i < bytes / 128; i++) {
++		__m128i xmm0, xmm1, xmm2, xmm3;
++		__m128i xmm4, xmm5, xmm6, xmm7;
++
++		xmm0 = xmm_load_128u((const __m128i*)src + 0);
++		xmm1 = xmm_load_128u((const __m128i*)src + 1);
++		xmm2 = xmm_load_128u((const __m128i*)src + 2);
++		xmm3 = xmm_load_128u((const __m128i*)src + 3);
++		xmm4 = xmm_load_128u((const __m128i*)src + 4);
++		xmm5 = xmm_load_128u((const __m128i*)src + 5);
++		xmm6 = xmm_load_128u((const __m128i*)src + 6);
++		xmm7 = xmm_load_128u((const __m128i*)src + 7);
++
++		xmm_save_128((__m128i*)dst + 0, xmm0);
++		xmm_save_128((__m128i*)dst + 1, xmm1);
++		xmm_save_128((__m128i*)dst + 2, xmm2);
++		xmm_save_128((__m128i*)dst + 3, xmm3);
++		xmm_save_128((__m128i*)dst + 4, xmm4);
++		xmm_save_128((__m128i*)dst + 5, xmm5);
++		xmm_save_128((__m128i*)dst + 6, xmm6);
++		xmm_save_128((__m128i*)dst + 7, xmm7);
++
++		dst += 128;
++		src += 128;
++	}
++}
++
++static force_inline void
++to_sse64(uint8_t *dst, const uint8_t *src) /* copies 64 bytes: unaligned loads from src, aligned stores (dst must be 16-byte aligned) */
++{
++	__m128i xmm1, xmm2, xmm3, xmm4;
++
++	xmm1 = xmm_load_128u((const __m128i*)src + 0);
++	xmm2 = xmm_load_128u((const __m128i*)src + 1);
++	xmm3 = xmm_load_128u((const __m128i*)src + 2);
++	xmm4 = xmm_load_128u((const __m128i*)src + 3);
++
++	xmm_save_128((__m128i*)dst + 0, xmm1);
++	xmm_save_128((__m128i*)dst + 1, xmm2);
++	xmm_save_128((__m128i*)dst + 2, xmm3);
++	xmm_save_128((__m128i*)dst + 3, xmm4);
++}
++
++static force_inline void
++to_sse32(uint8_t *dst, const uint8_t *src) /* copies 32 bytes: unaligned loads from src, aligned stores (dst must be 16-byte aligned) */
++{
++	__m128i xmm1, xmm2;
++
++	xmm1 = xmm_load_128u((const __m128i*)src + 0);
++	xmm2 = xmm_load_128u((const __m128i*)src + 1);
++
++	xmm_save_128((__m128i*)dst + 0, xmm1);
++	xmm_save_128((__m128i*)dst + 1, xmm2);
++}
++
++static force_inline void
++to_sse16(uint8_t *dst, const uint8_t *src) /* copies 16 bytes: unaligned load from src, aligned store (dst must be 16-byte aligned) */
++{
++	xmm_save_128((__m128i*)dst, xmm_load_128u((const __m128i*)src));
++}
++
++static void to_memcpy(uint8_t *dst, const uint8_t *src, unsigned len) /* copy len (> 0) bytes; dst is walked up to 16-byte alignment first, src may stay unaligned throughout */
++{
++	assert(len);
++	if ((uintptr_t)dst & 15) { /* dst is not 16-byte aligned: handle the head first */
++		if (len <= 16 - ((uintptr_t)dst & 15)) { /* the copy ends at or before the next 16-byte boundary */
++			memcpy(dst, src, len);
++			return;
++		}
++
++		if ((uintptr_t)dst & 1) { /* peel 1, then 2, 4 and 8 bytes so that dst ends up 16-byte aligned */
++			assert(len >= 1);
++			*dst++ = *src++;
++			len--;
++		}
++		if ((uintptr_t)dst & 2) {
++			assert(((uintptr_t)dst & 1) == 0);
++			assert(len >= 2);
++			*(uint16_t *)dst = *(const uint16_t *)src;
++			dst += 2;
++			src += 2;
++			len -= 2;
++		}
++		if ((uintptr_t)dst & 4) {
++			assert(((uintptr_t)dst & 3) == 0);
++			assert(len >= 4);
++			*(uint32_t *)dst = *(const uint32_t *)src;
++			dst += 4;
++			src += 4;
++			len -= 4;
++		}
++		if ((uintptr_t)dst & 8) {
++			assert(((uintptr_t)dst & 7) == 0);
++			assert(len >= 8);
++			*(uint64_t *)dst = *(const uint64_t *)src;
++			dst += 8;
++			src += 8;
++			len -= 8;
++		}
++	}
++
++	assert(((uintptr_t)dst & 15) == 0);
++	while (len >= 64) { /* bulk: 64 bytes per pass */
++		to_sse64(dst, src);
++		dst += 64;
++		src += 64;
++		len -= 64;
++	}
++	if (len == 0)
++		return;
++
++	if (len & 32) { /* len < 64 from here and is no longer decremented: each set bit of len is one remaining chunk */
++		to_sse32(dst, src);
++		dst += 32;
++		src += 32;
++	}
++	if (len & 16) {
++		to_sse16(dst, src);
++		dst += 16;
++		src += 16;
++	}
++	if (len & 8) {
++		*(uint64_t *)dst = *(uint64_t *)src;
++		dst += 8;
++		src += 8;
++	}
++	if (len & 4) {
++		*(uint32_t *)dst = *(uint32_t *)src;
++		dst += 4;
++		src += 4;
++	}
++	memcpy(dst, src, len & 3); /* final 0-3 bytes */
++}
++
++static void /* copies a width x height pixel rectangle from a linearly addressed src into a tiled dst, using the SSE2 copy helpers */
++memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
++				   int32_t src_stride, int32_t dst_stride,
++				   int16_t src_x, int16_t src_y,
++				   int16_t dst_x, int16_t dst_y,
++				   uint16_t width, uint16_t height)
++{
++	const unsigned tile_width = 512; /* bytes per line of one tile */
++	const unsigned tile_height = 8; /* lines per tile */
++	const unsigned tile_size = 4096; /* bytes per tile (tile_width * tile_height) */
++
++	const unsigned cpp = bpp / 8; /* bytes per pixel */
++	const unsigned tile_pixels = tile_width / cpp;
++	const unsigned tile_shift = ffs(tile_pixels) - 1; /* equals log2(tile_pixels) only if that is a power of two -- TODO confirm callers guarantee it */
++	const unsigned tile_mask = tile_pixels - 1;
++
++	unsigned offset_x, length_x, row_bytes;
++
++	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
++	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
++	assert(src != dst);
++
++	if (src_x | src_y)
++		src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
++	row_bytes = width * cpp; /* bytes per row, held in an unsigned: writing it back into the uint16_t width would wrap at 64KiB */
++	assert(src_stride >= 0 && (unsigned)src_stride >= row_bytes);
++
++	if (dst_x & tile_mask) { /* dst starts inside a tile: every row begins with a partial tile line */
++		offset_x = (dst_x & tile_mask) * cpp;
++		length_x = min(tile_width - offset_x, row_bytes);
++	} else
++		length_x = 0; /* offset_x stays unset here; it is only read when length_x != 0 */
++	dst = (uint8_t *)dst + (dst_x >> tile_shift) * tile_size; /* skip the whole tiles left of dst_x */
++
++	while (height--) {
++		unsigned w = row_bytes;
++		const uint8_t *src_row = src;
++		uint8_t *tile_row = dst;
++
++		src = (const uint8_t *)src + src_stride;
++
++		tile_row += dst_y / tile_height * dst_stride * tile_height; /* start of the band of tiles holding line dst_y */
++		tile_row += (dst_y & (tile_height-1)) * tile_width; /* line dst_y within its tile */
++		dst_y++;
++
++		if (length_x) {
++			to_memcpy(tile_row + offset_x, src_row, length_x);
++
++			tile_row += tile_size;
++			src_row = (const uint8_t *)src_row + length_x;
++			w -= length_x;
++		}
++		while (w >= tile_width) { /* full tile lines, one tile further right each pass */
++			assert(((uintptr_t)tile_row & (tile_width - 1)) == 0);
++			to_sse128xN(assume_aligned(tile_row, tile_width),
++				    src_row, tile_width);
++			tile_row += tile_size;
++			src_row = (const uint8_t *)src_row + tile_width;
++			w -= tile_width;
++		}
++		if (w) { /* trailing partial tile line */
++			assert(((uintptr_t)tile_row & (tile_width - 1)) == 0);
++			to_memcpy(assume_aligned(tile_row, tile_width),
++				  src_row, w);
++		}
++	}
++}
++
++static force_inline void
++from_sse128xNu(uint8_t *dst, const uint8_t *src, int bytes) /* copies bytes / 128 blocks of 128 bytes: aligned loads, unaligned stores; a remainder below 128 bytes is not copied */
++{
++	int i;
++
++	assert(((uintptr_t)src & 15) == 0);
++
++	for (i = 0; i < bytes / 128; i++) {
++		__m128i xmm0, xmm1, xmm2, xmm3;
++		__m128i xmm4, xmm5, xmm6, xmm7;
++
++		xmm0 = xmm_load_128((const __m128i*)src + 0);
++		xmm1 = xmm_load_128((const __m128i*)src + 1);
++		xmm2 = xmm_load_128((const __m128i*)src + 2);
++		xmm3 = xmm_load_128((const __m128i*)src + 3);
++		xmm4 = xmm_load_128((const __m128i*)src + 4);
++		xmm5 = xmm_load_128((const __m128i*)src + 5);
++		xmm6 = xmm_load_128((const __m128i*)src + 6);
++		xmm7 = xmm_load_128((const __m128i*)src + 7);
++
++		xmm_save_128u((__m128i*)dst + 0, xmm0);
++		xmm_save_128u((__m128i*)dst + 1, xmm1);
++		xmm_save_128u((__m128i*)dst + 2, xmm2);
++		xmm_save_128u((__m128i*)dst + 3, xmm3);
++		xmm_save_128u((__m128i*)dst + 4, xmm4);
++		xmm_save_128u((__m128i*)dst + 5, xmm5);
++		xmm_save_128u((__m128i*)dst + 6, xmm6);
++		xmm_save_128u((__m128i*)dst + 7, xmm7);
++
++		dst += 128;
++		src += 128;
++	}
++}
++
++static force_inline void
++from_sse128xNa(uint8_t *dst, const uint8_t *src, int bytes) /* copies bytes / 128 blocks of 128 bytes with aligned loads and aligned stores; a remainder below 128 bytes is not copied */
++{
++	int i;
++
++	assert(((uintptr_t)dst & 15) == 0);
++	assert(((uintptr_t)src & 15) == 0);
++
++	for (i = 0; i < bytes / 128; i++) {
++		__m128i xmm0, xmm1, xmm2, xmm3;
++		__m128i xmm4, xmm5, xmm6, xmm7;
++
++		xmm0 = xmm_load_128((const __m128i*)src + 0);
++		xmm1 = xmm_load_128((const __m128i*)src + 1);
++		xmm2 = xmm_load_128((const __m128i*)src + 2);
++		xmm3 = xmm_load_128((const __m128i*)src + 3);
++		xmm4 = xmm_load_128((const __m128i*)src + 4);
++		xmm5 = xmm_load_128((const __m128i*)src + 5);
++		xmm6 = xmm_load_128((const __m128i*)src + 6);
++		xmm7 = xmm_load_128((const __m128i*)src + 7);
++
++		xmm_save_128((__m128i*)dst + 0, xmm0);
++		xmm_save_128((__m128i*)dst + 1, xmm1);
++		xmm_save_128((__m128i*)dst + 2, xmm2);
++		xmm_save_128((__m128i*)dst + 3, xmm3);
++		xmm_save_128((__m128i*)dst + 4, xmm4);
++		xmm_save_128((__m128i*)dst + 5, xmm5);
++		xmm_save_128((__m128i*)dst + 6, xmm6);
++		xmm_save_128((__m128i*)dst + 7, xmm7);
++
++		dst += 128;
++		src += 128;
++	}
++}
++
++static force_inline void
++from_sse64u(uint8_t *dst, const uint8_t *src) /* copies 64 bytes: aligned loads from src, unaligned stores to dst */
++{
++	__m128i xmm1, xmm2, xmm3, xmm4;
++
++	assert(((uintptr_t)src & 15) == 0);
++
++	xmm1 = xmm_load_128((const __m128i*)src + 0);
++	xmm2 = xmm_load_128((const __m128i*)src + 1);
++	xmm3 = xmm_load_128((const __m128i*)src + 2);
++	xmm4 = xmm_load_128((const __m128i*)src + 3);
++
++	xmm_save_128u((__m128i*)dst + 0, xmm1);
++	xmm_save_128u((__m128i*)dst + 1, xmm2);
++	xmm_save_128u((__m128i*)dst + 2, xmm3);
++	xmm_save_128u((__m128i*)dst + 3, xmm4);
++}
++
++static force_inline void
++from_sse64a(uint8_t *dst, const uint8_t *src) /* copies 64 bytes with aligned loads and aligned stores */
++{
++	__m128i xmm1, xmm2, xmm3, xmm4;
++
++	assert(((uintptr_t)dst & 15) == 0);
++	assert(((uintptr_t)src & 15) == 0);
++
++	xmm1 = xmm_load_128((const __m128i*)src + 0);
++	xmm2 = xmm_load_128((const __m128i*)src + 1);
++	xmm3 = xmm_load_128((const __m128i*)src + 2);
++	xmm4 = xmm_load_128((const __m128i*)src + 3);
++
++	xmm_save_128((__m128i*)dst + 0, xmm1);
++	xmm_save_128((__m128i*)dst + 1, xmm2);
++	xmm_save_128((__m128i*)dst + 2, xmm3);
++	xmm_save_128((__m128i*)dst + 3, xmm4);
++}
++
++static force_inline void
++from_sse32u(uint8_t *dst, const uint8_t *src) /* copies 32 bytes: aligned loads, unaligned stores. NOTE(review): the src-alignment assert present in the sibling helpers is missing here */
++{
++	__m128i xmm1, xmm2;
++
++	xmm1 = xmm_load_128((const __m128i*)src + 0);
++	xmm2 = xmm_load_128((const __m128i*)src + 1);
++
++	xmm_save_128u((__m128i*)dst + 0, xmm1);
++	xmm_save_128u((__m128i*)dst + 1, xmm2);
++}
++
++static force_inline void
++from_sse32a(uint8_t *dst, const uint8_t *src) /* copies 32 bytes with aligned loads and aligned stores */
++{
++	__m128i xmm1, xmm2;
++
++	assert(((uintptr_t)dst & 15) == 0);
++	assert(((uintptr_t)src & 15) == 0);
++
++	xmm1 = xmm_load_128((const __m128i*)src + 0);
++	xmm2 = xmm_load_128((const __m128i*)src + 1);
++
++	xmm_save_128((__m128i*)dst + 0, xmm1);
++	xmm_save_128((__m128i*)dst + 1, xmm2);
++}
++
++static force_inline void
++from_sse16u(uint8_t *dst, const uint8_t *src) /* copies 16 bytes: aligned load from src, unaligned store to dst */
++{
++	assert(((uintptr_t)src & 15) == 0);
++
++	xmm_save_128u((__m128i*)dst, xmm_load_128((const __m128i*)src));
++}
++
++static force_inline void
++from_sse16a(uint8_t *dst, const uint8_t *src) /* copies 16 bytes with an aligned load and an aligned store */
++{
++	assert(((uintptr_t)dst & 15) == 0);
++	assert(((uintptr_t)src & 15) == 0);
++
++	xmm_save_128((__m128i*)dst, xmm_load_128((const __m128i*)src));
++}
++
++static void /* copies a width x height pixel rectangle from a tiled src into a linearly addressed dst, using the SSE2 copy helpers */
++memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
++				     int32_t src_stride, int32_t dst_stride,
++				     int16_t src_x, int16_t src_y,
++				     int16_t dst_x, int16_t dst_y,
++				     uint16_t width, uint16_t height)
++{
++	const unsigned tile_width = 512; /* bytes per line of one tile */
++	const unsigned tile_height = 8; /* lines per tile */
++	const unsigned tile_size = 4096; /* bytes per tile (tile_width * tile_height) */
++
++	const unsigned cpp = bpp / 8; /* bytes per pixel */
++	const unsigned tile_pixels = tile_width / cpp;
++	const unsigned tile_shift = ffs(tile_pixels) - 1; /* equals log2(tile_pixels) only if that is a power of two -- TODO confirm callers guarantee it */
++	const unsigned tile_mask = tile_pixels - 1;
++
++	unsigned length_x, offset_x, row_bytes;
++
++	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
++	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
++	assert(src != dst);
++
++	if (dst_x | dst_y)
++		dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
++	row_bytes = width * cpp; /* bytes per row, held in an unsigned: writing it back into the uint16_t width would wrap at 64KiB */
++	assert(dst_stride >= 0 && (unsigned)dst_stride >= row_bytes);
++	if (src_x & tile_mask) { /* src starts inside a tile: every row begins with a partial tile line */
++		offset_x = (src_x & tile_mask) * cpp;
++		length_x = min(tile_width - offset_x, row_bytes);
++		dst_stride -= (int32_t)row_bytes; /* dst_stride becomes the per-row skip applied after the copies below... */
++		dst_stride += (int32_t)((row_bytes - length_x) & 15); /* ...which leave dst short by the final (w & 15) bytes */
++	} else {
++		offset_x = 0; /* length_x stays unset here; it is only read when offset_x != 0 */
++		dst_stride -= (int32_t)(row_bytes & ~15u); /* same per-row skip when there is no leading partial tile line */
++	}
++	assert(dst_stride >= 0);
++	src = (const uint8_t *)src + (src_x >> tile_shift) * tile_size; /* skip the whole tiles left of src_x */
++
++	while (height--) {
++		unsigned w = row_bytes;
++		const uint8_t *tile_row = src;
++
++		tile_row += src_y / tile_height * src_stride * tile_height; /* start of the band of tiles holding line src_y */
++		tile_row += (src_y & (tile_height-1)) * tile_width; /* line src_y within its tile */
++		src_y++;
++
++		if (offset_x) {
++			memcpy(dst, tile_row + offset_x, length_x);
++			tile_row += tile_size;
++			dst = (uint8_t *)dst + length_x;
++			w -= length_x;
++		}
++
++		if ((uintptr_t)dst & 15) { /* dst not 16-byte aligned: use the unaligned-store helpers */
++			while (w >= tile_width) {
++				from_sse128xNu(dst,
++					       assume_aligned(tile_row, tile_width),
++					       tile_width);
++				tile_row += tile_size;
++				dst = (uint8_t *)dst + tile_width;
++				w -= tile_width;
++			}
++			while (w >= 64) {
++				from_sse64u(dst, tile_row);
++				tile_row += 64;
++				dst = (uint8_t *)dst + 64;
++				w -= 64;
++			}
++			if (w & 32) {
++				from_sse32u(dst, tile_row);
++				tile_row += 32;
++				dst = (uint8_t *)dst + 32;
++			}
++			if (w & 16) {
++				from_sse16u(dst, tile_row);
++				tile_row += 16;
++				dst = (uint8_t *)dst + 16;
++			}
++			memcpy(dst, assume_aligned(tile_row, 16), w & 15); /* last 0-15 bytes; dst is deliberately not advanced */
++		} else { /* dst 16-byte aligned: use the aligned-store helpers */
++			while (w >= tile_width) {
++				from_sse128xNa(assume_aligned(dst, 16),
++					       assume_aligned(tile_row, tile_width),
++					       tile_width);
++				tile_row += tile_size;
++				dst = (uint8_t *)dst + tile_width;
++				w -= tile_width;
++			}
++			while (w >= 64) {
++				from_sse64a(dst, tile_row);
++				tile_row += 64;
++				dst = (uint8_t *)dst + 64;
++				w -= 64;
++			}
++			if (w & 32) {
++				from_sse32a(dst, tile_row);
++				tile_row += 32;
++				dst = (uint8_t *)dst + 32;
++			}
++			if (w & 16) {
++				from_sse16a(dst, tile_row);
++				tile_row += 16;
++				dst = (uint8_t *)dst + 16;
++			}
++			memcpy(assume_aligned(dst, 16),
++			       assume_aligned(tile_row, 16),
++			       w & 15);
++		}
++		dst = (uint8_t *)dst + dst_stride; /* move to the start of the next dst row */
++	}
++}
++
++static void /* copies a width x height pixel rectangle between two tiled buffers, using the SSE2 copy helpers */
++memcpy_between_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
++					int32_t src_stride, int32_t dst_stride,
++					int16_t src_x, int16_t src_y,
++					int16_t dst_x, int16_t dst_y,
++					uint16_t width, uint16_t height)
++{
++	const unsigned tile_width = 512; /* bytes per line of one tile */
++	const unsigned tile_height = 8; /* lines per tile */
++	const unsigned tile_size = 4096; /* bytes per tile (tile_width * tile_height) */
++
++	const unsigned cpp = bpp / 8; /* bytes per pixel */
++	const unsigned tile_pixels = tile_width / cpp;
++	const unsigned tile_shift = ffs(tile_pixels) - 1; /* equals log2(tile_pixels) only if that is a power of two -- TODO confirm callers guarantee it */
++	const unsigned tile_mask = tile_pixels - 1;
++
++	unsigned ox, lx, row_bytes;
++
++	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
++	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
++	assert(src != dst);
++
++	row_bytes = width * cpp; /* bytes per row, held in an unsigned: writing it back into the uint16_t width would wrap at 64KiB */
++	dst_stride *= tile_height; /* both strides now span one band of tiles (tile_height lines) */
++	src_stride *= tile_height;
++
++	assert((dst_x & tile_mask) == (src_x & tile_mask)); /* src and dst must start at the same offset within a tile */
++	if (dst_x & tile_mask) {
++		ox = (dst_x & tile_mask) * cpp;
++		lx = min(tile_width - ox, row_bytes);
++		assert(lx != 0);
++	} else
++		lx = 0; /* ox stays unset here; it is only read when lx != 0 */
++
++	if (dst_x)
++		dst = (uint8_t *)dst + (dst_x >> tile_shift) * tile_size;
++	if (src_x)
++		src = (const uint8_t *)src + (src_x >> tile_shift) * tile_size;
++
++	while (height--) {
++		const uint8_t *src_row;
++		uint8_t *dst_row;
++		unsigned w = row_bytes;
++
++		dst_row = dst;
++		dst_row += dst_y / tile_height * dst_stride;
++		dst_row += (dst_y & (tile_height-1)) * tile_width;
++		dst_y++;
++
++		src_row = src;
++		src_row += src_y / tile_height * src_stride;
++		src_row += (src_y & (tile_height-1)) * tile_width;
++		src_y++;
++
++		if (lx) { /* leading partial tile line, at the same offset in src and dst */
++			to_memcpy(dst_row + ox, src_row + ox, lx);
++			dst_row += tile_size;
++			src_row += tile_size;
++			w -= lx;
++		}
++		while (w >= tile_width) { /* full tile lines, one tile further right each pass */
++			assert(((uintptr_t)dst_row & (tile_width - 1)) == 0);
++			assert(((uintptr_t)src_row & (tile_width - 1)) == 0);
++			to_sse128xN(assume_aligned(dst_row, tile_width),
++				    assume_aligned(src_row, tile_width),
++				    tile_width);
++			dst_row += tile_size;
++			src_row += tile_size;
++			w -= tile_width;
++		}
++		if (w) { /* trailing partial tile line */
++			assert(((uintptr_t)dst_row & (tile_width - 1)) == 0);
++			assert(((uintptr_t)src_row & (tile_width - 1)) == 0);
++			to_memcpy(assume_aligned(dst_row, tile_width),
++				  assume_aligned(src_row, tile_width),
++				  w);
++		}
++	}
++}
++
++#pragma GCC pop_options /* restore the options saved by the push_options that opens this sse2 section, so target("sse2,...") and optimize("Ofast") do not apply to the code below */
+ #endif
+ 
+ fast void
+@@ -257,7 +753,8 @@ memcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
+ 			if (dst_x & tile_mask) {
+ 				const unsigned x = (dst_x & tile_mask) * cpp;
+ 				const unsigned len = min(tile_width - x, w);
+-				memcpy(tile_row + x, src, len);
++				memcpy(assume_misaligned(tile_row + x, tile_width, x),
++				       src, len);
+ 
+ 				tile_row += tile_size;
+ 				src = (const uint8_t *)src + len;
+@@ -265,13 +762,13 @@ memcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
+ 			}
+ 		}
+ 		while (w >= tile_width) {
+-			memcpy(tile_row, src, tile_width);
+-
++			memcpy(assume_aligned(tile_row, tile_width),
++			       src, tile_width);
+ 			tile_row += tile_size;
+ 			src = (const uint8_t *)src + tile_width;
+ 			w -= tile_width;
+ 		}
+-		memcpy(tile_row, src, w);
++		memcpy(assume_aligned(tile_row, tile_width), src, w);
+ 		src = (const uint8_t *)src + src_stride + w;
+ 		dst_y++;
+ 	}
+@@ -313,7 +810,7 @@ memcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
+ 			if (src_x & tile_mask) {
+ 				const unsigned x = (src_x & tile_mask) * cpp;
+ 				const unsigned len = min(tile_width - x, w);
+-				memcpy(dst, tile_row + x, len);
++				memcpy(dst, assume_misaligned(tile_row + x, tile_width, x), len);
+ 
+ 				tile_row += tile_size;
+ 				dst = (uint8_t *)dst + len;
+@@ -321,440 +818,371 @@ memcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
+ 			}
+ 		}
+ 		while (w >= tile_width) {
+-			memcpy(dst, tile_row, tile_width);
++			memcpy(dst,
++			       assume_aligned(tile_row, tile_width),
++			       tile_width);
+ 
+ 			tile_row += tile_size;
+ 			dst = (uint8_t *)dst + tile_width;
+ 			w -= tile_width;
+ 		}
+-		memcpy(dst, tile_row, w);
++		memcpy(dst, assume_aligned(tile_row, tile_width), w);
+ 		dst = (uint8_t *)dst + dst_stride + w;
+ 		src_y++;
+ 	}
+ }
+ 
+-fast_memcpy static void
+-memcpy_to_tiled_x__swizzle_9(const void *src, void *dst, int bpp,
+-			     int32_t src_stride, int32_t dst_stride,
+-			     int16_t src_x, int16_t src_y,
+-			     int16_t dst_x, int16_t dst_y,
+-			     uint16_t width, uint16_t height)
++static fast_memcpy void
++memcpy_between_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
++				  int32_t src_stride, int32_t dst_stride,
++				  int16_t src_x, int16_t src_y,
++				  int16_t dst_x, int16_t dst_y,
++				  uint16_t width, uint16_t height)
+ {
+ 	const unsigned tile_width = 512;
+ 	const unsigned tile_height = 8;
+ 	const unsigned tile_size = 4096;
+ 
+ 	const unsigned cpp = bpp / 8;
+-	const unsigned stride_tiles = dst_stride / tile_width;
+-	const unsigned swizzle_pixels = 64 / cpp;
+-	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
+-	const unsigned tile_mask = (1 << tile_pixels) - 1;
+-
+-	unsigned x, y;
++	const unsigned tile_pixels = tile_width / cpp;
++	const unsigned tile_shift = ffs(tile_pixels) - 1;
++	const unsigned tile_mask = tile_pixels - 1;
+ 
+ 	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+ 	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
++	assert(src != dst);
++	assert((dst_x & tile_mask) == (src_x & tile_mask));
+ 
+-	src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
+-
+-	for (y = 0; y < height; ++y) {
+-		const uint32_t dy = y + dst_y;
+-		const uint32_t tile_row =
+-			(dy / tile_height * stride_tiles * tile_size +
+-			 (dy & (tile_height-1)) * tile_width);
+-		const uint8_t *src_row = (const uint8_t *)src + src_stride * y;
+-		uint32_t dx = dst_x, offset;
+-
+-		x = width * cpp;
+-		if (dx & (swizzle_pixels - 1)) {
+-			const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels);
+-			const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx;
+-			offset = tile_row +
+-				(dx >> tile_pixels) * tile_size +
+-				(dx & tile_mask) * cpp;
+-			offset ^= (offset >> 3) & 64;
+-
+-			memcpy((char *)dst + offset, src_row, length * cpp);
+-
+-			src_row += length * cpp;
+-			x -= length * cpp;
+-			dx += length;
+-		}
+-		while (x >= 64) {
+-			offset = tile_row +
+-				(dx >> tile_pixels) * tile_size +
+-				(dx & tile_mask) * cpp;
+-			offset ^= (offset >> 3) & 64;
+-
+-			memcpy((char *)dst + offset, src_row, 64);
+-
+-			src_row += 64;
+-			x -= 64;
+-			dx += swizzle_pixels;
+-		}
+-		if (x) {
+-			offset = tile_row +
+-				(dx >> tile_pixels) * tile_size +
+-				(dx & tile_mask) * cpp;
+-			offset ^= (offset >> 3) & 64;
+-			memcpy((char *)dst + offset, src_row, x);
+-		}
+-	}
+-}
++	while (height--) {
++		unsigned w = width * cpp;
++		uint8_t *dst_row = dst;
++		const uint8_t *src_row = src;
+ 
+-fast_memcpy static void
+-memcpy_from_tiled_x__swizzle_9(const void *src, void *dst, int bpp,
+-			       int32_t src_stride, int32_t dst_stride,
+-			       int16_t src_x, int16_t src_y,
+-			       int16_t dst_x, int16_t dst_y,
+-			       uint16_t width, uint16_t height)
+-{
+-	const unsigned tile_width = 512;
+-	const unsigned tile_height = 8;
+-	const unsigned tile_size = 4096;
++		dst_row += dst_y / tile_height * dst_stride * tile_height;
++		dst_row += (dst_y & (tile_height-1)) * tile_width;
++		if (dst_x)
++			dst_row += (dst_x >> tile_shift) * tile_size;
++		dst_y++;
+ 
+-	const unsigned cpp = bpp / 8;
+-	const unsigned stride_tiles = src_stride / tile_width;
+-	const unsigned swizzle_pixels = 64 / cpp;
+-	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
+-	const unsigned tile_mask = (1 << tile_pixels) - 1;
++		src_row += src_y / tile_height * src_stride * tile_height;
++		src_row += (src_y & (tile_height-1)) * tile_width;
++		if (src_x)
++			src_row += (src_x >> tile_shift) * tile_size;
++		src_y++;
+ 
+-	unsigned x, y;
++		if (dst_x & tile_mask) {
++			const unsigned x = (dst_x & tile_mask) * cpp;
++			const unsigned len = min(tile_width - x, w);
+ 
+-	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+-	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
++			memcpy(assume_misaligned(dst_row + x, tile_width, x),
++			       assume_misaligned(src_row + x, tile_width, x),
++			       len);
+ 
+-	dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
+-
+-	for (y = 0; y < height; ++y) {
+-		const uint32_t sy = y + src_y;
+-		const uint32_t tile_row =
+-			(sy / tile_height * stride_tiles * tile_size +
+-			 (sy & (tile_height-1)) * tile_width);
+-		uint8_t *dst_row = (uint8_t *)dst + dst_stride * y;
+-		uint32_t sx = src_x, offset;
+-
+-		x = width * cpp;
+-		if (sx & (swizzle_pixels - 1)) {
+-			const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels);
+-			const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx;
+-			offset = tile_row +
+-				(sx >> tile_pixels) * tile_size +
+-				(sx & tile_mask) * cpp;
+-			offset ^= (offset >> 3) & 64;
+-
+-			memcpy(dst_row, (const char *)src + offset, length * cpp);
+-
+-			dst_row += length * cpp;
+-			x -= length * cpp;
+-			sx += length;
++			dst_row += tile_size;
++			src_row += tile_size;
++			w -= len;
+ 		}
+-		while (x >= 64) {
+-			offset = tile_row +
+-				(sx >> tile_pixels) * tile_size +
+-				(sx & tile_mask) * cpp;
+-			offset ^= (offset >> 3) & 64;
+ 
+-			memcpy(dst_row, (const char *)src + offset, 64);
+-
+-			dst_row += 64;
+-			x -= 64;
+-			sx += swizzle_pixels;
+-		}
+-		if (x) {
+-			offset = tile_row +
+-				(sx >> tile_pixels) * tile_size +
+-				(sx & tile_mask) * cpp;
+-			offset ^= (offset >> 3) & 64;
+-			memcpy(dst_row, (const char *)src + offset, x);
++		while (w >= tile_width) {
++			memcpy(assume_aligned(dst_row, tile_width),
++			       assume_aligned(src_row, tile_width),
++			       tile_width);
++			dst_row += tile_size;
++			src_row += tile_size;
++			w -= tile_width;
+ 		}
++		memcpy(assume_aligned(dst_row, tile_width),
++		       assume_aligned(src_row, tile_width),
++		       w);
+ 	}
+ }
+ 
+-fast_memcpy static void
+-memcpy_to_tiled_x__swizzle_9_10(const void *src, void *dst, int bpp,
+-				int32_t src_stride, int32_t dst_stride,
+-				int16_t src_x, int16_t src_y,
+-				int16_t dst_x, int16_t dst_y,
+-				uint16_t width, uint16_t height)
+-{
+-	const unsigned tile_width = 512;
+-	const unsigned tile_height = 8;
+-	const unsigned tile_size = 4096;
+-
+-	const unsigned cpp = bpp / 8;
+-	const unsigned stride_tiles = dst_stride / tile_width;
+-	const unsigned swizzle_pixels = 64 / cpp;
+-	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
+-	const unsigned tile_mask = (1 << tile_pixels) - 1;
++#define memcpy_to_tiled_x(swizzle) /* generates memcpy_to_tiled_x__<swizzle>(): linear src to tiled dst, every dst byte offset being passed through swizzle() */ \
++fast_memcpy static void \
++memcpy_to_tiled_x__##swizzle (const void *src, void *dst, int bpp, \
++			      int32_t src_stride, int32_t dst_stride, \
++			      int16_t src_x, int16_t src_y, \
++			      int16_t dst_x, int16_t dst_y, \
++			      uint16_t width, uint16_t height) \
++{ \
++	const unsigned tile_width = 512; \
++	const unsigned tile_height = 8; \
++	const unsigned tile_size = 4096; \
++	const unsigned cpp = bpp / 8; \
++	const unsigned stride_tiles = dst_stride / tile_width; \
++	const unsigned swizzle_pixels = 64 / cpp; /* pixels per 64-byte chunk, the unit the copy is split into */ \
++	const unsigned tile_pixels = ffs(tile_width / cpp) - 1; /* despite the name this is a shift count, used as dx >> tile_pixels */ \
++	const unsigned tile_mask = (1 << tile_pixels) - 1; \
++	unsigned x, y; \
++	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", \
++	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); \
++	src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; \
++	for (y = 0; y < height; ++y) { \
++		const uint32_t dy = y + dst_y; \
++		const uint32_t tile_row = \
++			(dy / tile_height * stride_tiles * tile_size + \
++			 (dy & (tile_height-1)) * tile_width); \
++		const uint8_t *src_row = (const uint8_t *)src + src_stride * y; \
++		uint32_t dx = dst_x; \
++		x = width * cpp; /* bytes left to copy on this row */ \
++		if (dx & (swizzle_pixels - 1)) { /* leading part, up to the next 64-byte chunk boundary or the end of the row */ \
++			const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels); \
++			const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx; \
++			uint32_t offset = \
++				tile_row + \
++				(dx >> tile_pixels) * tile_size + \
++				(dx & tile_mask) * cpp; \
++			memcpy((char *)dst + swizzle(offset), src_row, length * cpp); \
++			src_row += length * cpp; \
++			x -= length * cpp; \
++			dx += length; \
++		} \
++		while (x >= 64) { /* whole 64-byte chunks */ \
++			uint32_t offset = \
++				tile_row + \
++				(dx >> tile_pixels) * tile_size + \
++				(dx & tile_mask) * cpp; \
++			memcpy(assume_aligned((char *)dst+swizzle(offset),64), \
++			       src_row, 64); \
++			src_row += 64; \
++			x -= 64; \
++			dx += swizzle_pixels; \
++		} \
++		if (x) { /* trailing part shorter than 64 bytes */ \
++			uint32_t offset = \
++				tile_row + \
++				(dx >> tile_pixels) * tile_size + \
++				(dx & tile_mask) * cpp; \
++			memcpy(assume_aligned((char *)dst + swizzle(offset), 64), src_row, x); \
++		} \
++	} \
++}
+ 
+-	unsigned x, y;
++#define memcpy_from_tiled_x(swizzle) /* defines memcpy_from_tiled_x__<swizzle>(): copies a width x height rectangle from a tiled source into a linear destination, passing every tiled byte offset through swizzle() */ \
++fast_memcpy static void \
++memcpy_from_tiled_x__##swizzle (const void *src, void *dst, int bpp, \
++				int32_t src_stride, int32_t dst_stride, \
++				int16_t src_x, int16_t src_y, \
++				int16_t dst_x, int16_t dst_y, \
++				uint16_t width, uint16_t height) \
++{ \
++	const unsigned tile_width = 512; /* bytes per tile row */ \
++	const unsigned tile_height = 8; /* rows per tile */ \
++	const unsigned tile_size = 4096; /* bytes per tile (tile_width * tile_height) */ \
++	const unsigned cpp = bpp / 8; /* bytes per pixel */ \
++	const unsigned stride_tiles = src_stride / tile_width; /* tiles across one source pitch */ \
++	const unsigned swizzle_pixels = 64 / cpp; /* pixels in one 64-byte span */ \
++	const unsigned tile_pixels = ffs(tile_width / cpp) - 1; /* used as a shift: log2 of the pixels per tile row when that count is a power of two */ \
++	const unsigned tile_mask = (1 << tile_pixels) - 1; /* masks the pixel index within a tile row */ \
++	unsigned x, y; \
++	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", \
++	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); \
++	dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp; /* advance dst to pixel (dst_x, dst_y) */ \
++	for (y = 0; y < height; ++y) { \
++		const uint32_t sy = y + src_y; \
++		const uint32_t tile_row = /* byte offset of row sy inside the left-most tile of its band */ \
++			(sy / tile_height * stride_tiles * tile_size + \
++			 (sy & (tile_height-1)) * tile_width); \
++		uint8_t *dst_row = (uint8_t *)dst + dst_stride * y; \
++		uint32_t sx = src_x; \
++		x = width * cpp; /* bytes still to copy on this row */ \
++		if (sx & (swizzle_pixels - 1)) { /* sx is not a multiple of swizzle_pixels: copy a short leading piece first */ \
++			const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels); \
++			const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx; /* pixels up to swizzle_bound_pixels or the end of the row, whichever is first */ \
++			uint32_t offset = \
++				tile_row + \
++				(sx >> tile_pixels) * tile_size + \
++				(sx & tile_mask) * cpp; \
++			memcpy(dst_row, (const char *)src + swizzle(offset), length * cpp); \
++			dst_row += length * cpp; \
++			x -= length * cpp; \
++			sx += length; \
++		} \
++		while (x >= 64) { /* whole 64-byte spans, each with its own swizzled offset */ \
++			uint32_t offset = \
++				tile_row + \
++				(sx >> tile_pixels) * tile_size + \
++				(sx & tile_mask) * cpp; \
++			memcpy(dst_row, assume_aligned((const char *)src + swizzle(offset), 64), 64); \
++			dst_row += 64; \
++			x -= 64; \
++			sx += swizzle_pixels; \
++		} \
++		if (x) { /* trailing bytes, fewer than 64 */ \
++			uint32_t offset = \
++				tile_row + \
++				(sx >> tile_pixels) * tile_size + \
++				(sx & tile_mask) * cpp; \
++			memcpy(dst_row, assume_aligned((const char *)src + swizzle(offset), 64), x); \
++		} \
++	} \
++}
+ 
+-	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+-	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
++#define swizzle_9(X) ((X) ^ (((X) >> 3) & 64)) /* flips bit 6 of the offset when bit 9 is set */
++memcpy_to_tiled_x(swizzle_9) /* expands to memcpy_to_tiled_x__swizzle_9() */
++memcpy_from_tiled_x(swizzle_9) /* expands to memcpy_from_tiled_x__swizzle_9() */
++#undef swizzle_9
+ 
+-	src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
+-
+-	for (y = 0; y < height; ++y) {
+-		const uint32_t dy = y + dst_y;
+-		const uint32_t tile_row =
+-			(dy / tile_height * stride_tiles * tile_size +
+-			 (dy & (tile_height-1)) * tile_width);
+-		const uint8_t *src_row = (const uint8_t *)src + src_stride * y;
+-		uint32_t dx = dst_x, offset;
+-
+-		x = width * cpp;
+-		if (dx & (swizzle_pixels - 1)) {
+-			const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels);
+-			const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx;
+-			offset = tile_row +
+-				(dx >> tile_pixels) * tile_size +
+-				(dx & tile_mask) * cpp;
+-			offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+-
+-			memcpy((char *)dst + offset, src_row, length * cpp);
+-
+-			src_row += length * cpp;
+-			x -= length * cpp;
+-			dx += length;
+-		}
+-		while (x >= 64) {
+-			offset = tile_row +
+-				(dx >> tile_pixels) * tile_size +
+-				(dx & tile_mask) * cpp;
+-			offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
++#define swizzle_9_10(X) ((X) ^ ((((X) ^ ((X) >> 1)) >> 3) & 64)) /* flips bit 6 of the offset by (bit 9 XOR bit 10) */
++memcpy_to_tiled_x(swizzle_9_10) /* expands to memcpy_to_tiled_x__swizzle_9_10() */
++memcpy_from_tiled_x(swizzle_9_10) /* expands to memcpy_from_tiled_x__swizzle_9_10() */
++#undef swizzle_9_10
+ 
+-			memcpy((char *)dst + offset, src_row, 64);
++#define swizzle_9_11(X) ((X) ^ ((((X) ^ ((X) >> 2)) >> 3) & 64)) /* flips bit 6 of the offset by (bit 9 XOR bit 11) */
++memcpy_to_tiled_x(swizzle_9_11) /* expands to memcpy_to_tiled_x__swizzle_9_11() */
++memcpy_from_tiled_x(swizzle_9_11) /* expands to memcpy_from_tiled_x__swizzle_9_11() */
++#undef swizzle_9_11
+ 
+-			src_row += 64;
+-			x -= 64;
+-			dx += swizzle_pixels;
+-		}
+-		if (x) {
+-			offset = tile_row +
+-				(dx >> tile_pixels) * tile_size +
+-				(dx & tile_mask) * cpp;
+-			offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+-			memcpy((char *)dst + offset, src_row, x);
+-		}
+-	}
+-}
++#define swizzle_9_10_11(X) ((X) ^ ((((X) ^ ((X) >> 1) ^ ((X) >> 2)) >> 3) & 64)) /* flips bit 6 of the offset by (bit 9 XOR bit 10 XOR bit 11) */
++memcpy_to_tiled_x(swizzle_9_10_11) /* expands to memcpy_to_tiled_x__swizzle_9_10_11() */
++memcpy_from_tiled_x(swizzle_9_10_11) /* expands to memcpy_from_tiled_x__swizzle_9_10_11() */
++#undef swizzle_9_10_11
+ 
+-fast_memcpy static void
+-memcpy_from_tiled_x__swizzle_9_10(const void *src, void *dst, int bpp,
+-				  int32_t src_stride, int32_t dst_stride,
+-				  int16_t src_x, int16_t src_y,
+-				  int16_t dst_x, int16_t dst_y,
+-				  uint16_t width, uint16_t height)
++static fast_memcpy void /* copies a linear rectangle into a tiled buffer laid out as 128-byte x 16-row (2048-byte) tiles; no offset swizzle is applied */
++memcpy_to_tiled_x__gen2(const void *src, void *dst, int bpp,
++			int32_t src_stride, int32_t dst_stride,
++			int16_t src_x, int16_t src_y,
++			int16_t dst_x, int16_t dst_y,
++			uint16_t width, uint16_t height)
+ {
+-	const unsigned tile_width = 512;
+-	const unsigned tile_height = 8;
+-	const unsigned tile_size = 4096;
++	const unsigned tile_width = 128; /* bytes per tile row */
++	const unsigned tile_height = 16; /* rows per tile */
++	const unsigned tile_size = 2048; /* bytes per tile (tile_width * tile_height) */
+ 
+ 	const unsigned cpp = bpp / 8;
+-	const unsigned stride_tiles = src_stride / tile_width;
+-	const unsigned swizzle_pixels = 64 / cpp;
+-	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
+-	const unsigned tile_mask = (1 << tile_pixels) - 1;
+-
+-	unsigned x, y;
++	const unsigned tile_pixels = tile_width / cpp; /* pixels per tile row */
++	const unsigned tile_shift = ffs(tile_pixels) - 1; /* log2(tile_pixels) when tile_pixels is a power of two */
++	const unsigned tile_mask = tile_pixels - 1; /* masks the pixel index within a tile row */
+ 
+ 	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+ 	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
++	assert(src != dst);
+ 
+-	dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
+-
+-	for (y = 0; y < height; ++y) {
+-		const uint32_t sy = y + src_y;
+-		const uint32_t tile_row =
+-			(sy / tile_height * stride_tiles * tile_size +
+-			 (sy & (tile_height-1)) * tile_width);
+-		uint8_t *dst_row = (uint8_t *)dst + dst_stride * y;
+-		uint32_t sx = src_x, offset;
+-
+-		x = width * cpp;
+-		if (sx & (swizzle_pixels - 1)) {
+-			const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels);
+-			const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx;
+-			offset = tile_row +
+-				(sx >> tile_pixels) * tile_size +
+-				(sx & tile_mask) * cpp;
+-			offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+-
+-			memcpy(dst_row, (const char *)src + offset, length * cpp);
+-
+-			dst_row += length * cpp;
+-			x -= length * cpp;
+-			sx += length;
+-		}
+-		while (x >= 64) {
+-			offset = tile_row +
+-				(sx >> tile_pixels) * tile_size +
+-				(sx & tile_mask) * cpp;
+-			offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+-
+-			memcpy(dst_row, (const char *)src + offset, 64);
+-
+-			dst_row += 64;
+-			x -= 64;
+-			sx += swizzle_pixels;
+-		}
+-		if (x) {
+-			offset = tile_row +
+-				(sx >> tile_pixels) * tile_size +
+-				(sx & tile_mask) * cpp;
+-			offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+-			memcpy(dst_row, (const char *)src + offset, x);
+-		}
+-	}
+-}
+-
+-fast_memcpy static void
+-memcpy_to_tiled_x__swizzle_9_11(const void *src, void *dst, int bpp,
+-				int32_t src_stride, int32_t dst_stride,
+-				int16_t src_x, int16_t src_y,
+-				int16_t dst_x, int16_t dst_y,
+-				uint16_t width, uint16_t height)
+-{
+-	const unsigned tile_width = 512;
+-	const unsigned tile_height = 8;
+-	const unsigned tile_size = 4096;
+-
+-	const unsigned cpp = bpp / 8;
+-	const unsigned stride_tiles = dst_stride / tile_width;
+-	const unsigned swizzle_pixels = 64 / cpp;
+-	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
+-	const unsigned tile_mask = (1 << tile_pixels) - 1;
++	if (src_x | src_y) /* skip the pointer arithmetic when the source origin is (0, 0) */
++		src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
++	assert(src_stride >= width * cpp);
++	src_stride -= width * cpp; /* from here on src_stride is only the gap between the end of one copied row and the start of the next */
+ 
+-	unsigned x, y;
++	while (height--) { /* one destination row per iteration; dst_y is incremented at the bottom */
++		unsigned w = width * cpp; /* bytes still to copy on this row */
++		uint8_t *tile_row = dst;
+ 
+-	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+-	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
++		tile_row += dst_y / tile_height * dst_stride * tile_height; /* start of the band of tiles that contains row dst_y */
++		tile_row += (dst_y & (tile_height-1)) * tile_width; /* row dst_y within the first tile of that band */
++		if (dst_x) {
++			tile_row += (dst_x >> tile_shift) * tile_size; /* step over the whole tiles to the left of dst_x */
++			if (dst_x & tile_mask) { /* dst_x falls inside a tile row: fill up to the end of that tile row first */
++				const unsigned x = (dst_x & tile_mask) * cpp; /* byte offset within the tile row */
++				const unsigned len = min(tile_width - x, w);
++				memcpy(assume_misaligned(tile_row + x, tile_width, x), src, len);
+ 
+-	src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
+-
+-	for (y = 0; y < height; ++y) {
+-		const uint32_t dy = y + dst_y;
+-		const uint32_t tile_row =
+-			(dy / tile_height * stride_tiles * tile_size +
+-			 (dy & (tile_height-1)) * tile_width);
+-		const uint8_t *src_row = (const uint8_t *)src + src_stride * y;
+-		uint32_t dx = dst_x, offset;
+-
+-		x = width * cpp;
+-		if (dx & (swizzle_pixels - 1)) {
+-			const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels);
+-			const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx;
+-			offset = tile_row +
+-				(dx >> tile_pixels) * tile_size +
+-				(dx & tile_mask) * cpp;
+-			offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+-			memcpy((char *)dst + offset, src_row, length * cpp);
+-
+-			src_row += length * cpp;
+-			x -= length * cpp;
+-			dx += length;
++				tile_row += tile_size; /* same row in the next tile to the right */
++				src = (const uint8_t *)src + len;
++				w -= len;
++			}
+ 		}
+-		while (x >= 64) {
+-			offset = tile_row +
+-				(dx >> tile_pixels) * tile_size +
+-				(dx & tile_mask) * cpp;
+-			offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+-
+-			memcpy((char *)dst + offset, src_row, 64);
++		while (w >= tile_width) { /* whole tile rows */
++			memcpy(assume_aligned(tile_row, tile_width),
++			       src, tile_width);
+ 
+-			src_row += 64;
+-			x -= 64;
+-			dx += swizzle_pixels;
+-		}
+-		if (x) {
+-			offset = tile_row +
+-				(dx >> tile_pixels) * tile_size +
+-				(dx & tile_mask) * cpp;
+-			offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+-			memcpy((char *)dst + offset, src_row, x);
++			tile_row += tile_size;
++			src = (const uint8_t *)src + tile_width;
++			w -= tile_width;
+ 		}
++		memcpy(assume_aligned(tile_row, tile_width), src, w); /* remaining bytes, fewer than tile_width (possibly 0) */
++		src = (const uint8_t *)src + src_stride + w; /* src was not advanced past the last w bytes, so add them plus the inter-row gap */
++		dst_y++;
+ 	}
+ }
+ 
+-fast_memcpy static void
+-memcpy_from_tiled_x__swizzle_9_11(const void *src, void *dst, int bpp,
+-				  int32_t src_stride, int32_t dst_stride,
+-				  int16_t src_x, int16_t src_y,
+-				  int16_t dst_x, int16_t dst_y,
+-				  uint16_t width, uint16_t height)
++static fast_memcpy void /* copies a rectangle out of a tiled buffer laid out as 128-byte x 16-row (2048-byte) tiles into a linear destination; no offset swizzle is applied */
++memcpy_from_tiled_x__gen2(const void *src, void *dst, int bpp,
++			  int32_t src_stride, int32_t dst_stride,
++			  int16_t src_x, int16_t src_y,
++			  int16_t dst_x, int16_t dst_y,
++			  uint16_t width, uint16_t height)
+ {
+-	const unsigned tile_width = 512;
+-	const unsigned tile_height = 8;
+-	const unsigned tile_size = 4096;
++	const unsigned tile_width = 128; /* bytes per tile row */
++	const unsigned tile_height = 16; /* rows per tile */
++	const unsigned tile_size = 2048; /* bytes per tile (tile_width * tile_height) */
+ 
+ 	const unsigned cpp = bpp / 8;
+-	const unsigned stride_tiles = src_stride / tile_width;
+-	const unsigned swizzle_pixels = 64 / cpp;
+-	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
+-	const unsigned tile_mask = (1 << tile_pixels) - 1;
+-
+-	unsigned x, y;
++	const unsigned tile_pixels = tile_width / cpp; /* pixels per tile row */
++	const unsigned tile_shift = ffs(tile_pixels) - 1; /* log2(tile_pixels) when tile_pixels is a power of two */
++	const unsigned tile_mask = tile_pixels - 1; /* masks the pixel index within a tile row */
+ 
+ 	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+ 	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
++	assert(src != dst);
+ 
+-	dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
+-
+-	for (y = 0; y < height; ++y) {
+-		const uint32_t sy = y + src_y;
+-		const uint32_t tile_row =
+-			(sy / tile_height * stride_tiles * tile_size +
+-			 (sy & (tile_height-1)) * tile_width);
+-		uint8_t *dst_row = (uint8_t *)dst + dst_stride * y;
+-		uint32_t sx = src_x, offset;
+-
+-		x = width * cpp;
+-		if (sx & (swizzle_pixels - 1)) {
+-			const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels);
+-			const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx;
+-			offset = tile_row +
+-				(sx >> tile_pixels) * tile_size +
+-				(sx & tile_mask) * cpp;
+-			offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+-			memcpy(dst_row, (const char *)src + offset, length * cpp);
+-
+-			dst_row += length * cpp;
+-			x -= length * cpp;
+-			sx += length;
+-		}
+-		while (x >= 64) {
+-			offset = tile_row +
+-				(sx >> tile_pixels) * tile_size +
+-				(sx & tile_mask) * cpp;
+-			offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
++	if (dst_x | dst_y) /* skip the pointer arithmetic when the destination origin is (0, 0) */
++		dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
++	assert(dst_stride >= width * cpp);
++	dst_stride -= width * cpp; /* from here on dst_stride is only the gap between the end of one copied row and the start of the next */
++
++	while (height--) { /* one source row per iteration; src_y is incremented at the bottom */
++		unsigned w = width * cpp; /* bytes still to copy on this row */
++		const uint8_t *tile_row = src;
+ 
+-			memcpy(dst_row, (const char *)src + offset, 64);
++		tile_row += src_y / tile_height * src_stride * tile_height; /* start of the band of tiles that contains row src_y */
++		tile_row += (src_y & (tile_height-1)) * tile_width; /* row src_y within the first tile of that band */
++		if (src_x) {
++			tile_row += (src_x >> tile_shift) * tile_size; /* step over the whole tiles to the left of src_x */
++			if (src_x & tile_mask) { /* src_x falls inside a tile row: read up to the end of that tile row first */
++				const unsigned x = (src_x & tile_mask) * cpp; /* byte offset within the tile row */
++				const unsigned len = min(tile_width - x, w);
++				memcpy(dst, assume_misaligned(tile_row + x, tile_width, x), len);
+ 
+-			dst_row += 64;
+-			x -= 64;
+-			sx += swizzle_pixels;
++				tile_row += tile_size; /* same row in the next tile to the right */
++				dst = (uint8_t *)dst + len;
++				w -= len;
++			}
+ 		}
+-		if (x) {
+-			offset = tile_row +
+-				(sx >> tile_pixels) * tile_size +
+-				(sx & tile_mask) * cpp;
+-			offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+-			memcpy(dst_row, (const char *)src + offset, x);
++		while (w >= tile_width) { /* whole tile rows */
++			memcpy(dst,
++			       assume_aligned(tile_row, tile_width),
++			       tile_width);
++
++			tile_row += tile_size;
++			dst = (uint8_t *)dst + tile_width;
++			w -= tile_width;
+ 		}
++		memcpy(dst, assume_aligned(tile_row, tile_width), w); /* remaining bytes, fewer than tile_width (possibly 0) */
++		dst = (uint8_t *)dst + dst_stride + w; /* dst was not advanced past the last w bytes, so add them plus the inter-row gap */
++		src_y++;
+ 	}
+ }
+ 
+-void choose_memcpy_tiled_x(struct kgem *kgem, int swizzling)
++void choose_memcpy_tiled_x(struct kgem *kgem, int swizzling, unsigned cpu)
+ {
++	if (kgem->gen < 030) {
++		if (swizzling == I915_BIT_6_SWIZZLE_NONE) {
++			DBG(("%s: gen2, no swizzling\n", __FUNCTION__));
++			kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__gen2;
++			kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__gen2;
++		} else
++			DBG(("%s: no detiling with swizzle functions for gen2\n", __FUNCTION__));
++		return;
++	}
++
+ 	switch (swizzling) {
+ 	default:
+ 		DBG(("%s: unknown swizzling, %d\n", __FUNCTION__, swizzling));
+ 		break;
+ 	case I915_BIT_6_SWIZZLE_NONE:
+ 		DBG(("%s: no swizzling\n", __FUNCTION__));
+-		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0;
+-		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0;
++#if defined(sse2)
++		if (cpu & SSE2) {
++			kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0__sse2;
++			kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0__sse2;
++			kgem->memcpy_between_tiled_x = memcpy_between_tiled_x__swizzle_0__sse2;
++		} else
++#endif
++	       	{
++			kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0;
++			kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0;
++			kgem->memcpy_between_tiled_x = memcpy_between_tiled_x__swizzle_0;
++		}
+ 		break;
+ 	case I915_BIT_6_SWIZZLE_9:
+ 		DBG(("%s: 6^9 swizzling\n", __FUNCTION__));
+@@ -771,6 +1199,11 @@ void choose_memcpy_tiled_x(struct kgem *kgem, int swizzling)
+ 		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_11;
+ 		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_11;
+ 		break;
++	case I915_BIT_6_SWIZZLE_9_10_11:
++		DBG(("%s: 6^9^10^11 swizzling\n", __FUNCTION__));
++		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_10_11;
++		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_10_11;
++		break;
+ 	}
+ }
+ 
+@@ -995,7 +1428,7 @@ memcpy_xor(const void *src, void *dst, int bpp,
+ 				height = 1;
+ 			}
+ 
+-#if USE_SSE2
++#if defined(sse2) && __x86_64__
+ 			if (have_sse2()) {
+ 				do {
+ 					uint32_t *d = (uint32_t *)dst_bytes;
+@@ -1118,3 +1551,241 @@ memcpy_xor(const void *src, void *dst, int bpp,
+ 		}
+ 	}
+ }
++
++#define BILINEAR_INTERPOLATION_BITS 4 /* number of weight bits kept by bilinear_weight(); also selects the bilinear_interpolation() variant below */
++static inline int
++bilinear_weight(pixman_fixed_t x) /* returns bits [16 - BITS, 16) of x as an integer in 0 .. (1 << BITS) - 1; presumably the top fraction bits of a 16.16 value - confirm against pixman_fixed_t */
++{
++	return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) &
++		((1 << BILINEAR_INTERPOLATION_BITS) - 1);
++}
++
++#if BILINEAR_INTERPOLATION_BITS <= 4
++/* Blend four packed 4x8-bit pixels (tl, tr, bl, br) with 4-bit weights distx/disty, two byte lanes per multiply. Inspired by Filter_32_opaque from Skia */
++static inline uint32_t
++bilinear_interpolation(uint32_t tl, uint32_t tr,
++		       uint32_t bl, uint32_t br,
++		       int distx, int disty)
++{
++	int distxy, distxiy, distixy, distixiy;
++	uint32_t lo, hi;
++
++	distx <<= (4 - BILINEAR_INTERPOLATION_BITS); /* rescale the weight to the 0..15 range used below */
++	disty <<= (4 - BILINEAR_INTERPOLATION_BITS);
++
++	distxy = distx * disty;
++	distxiy = (distx << 4) - distxy;	/* distx * (16 - disty) */
++	distixy = (disty << 4) - distxy;	/* disty * (16 - distx) */
++	distixiy =
++		16 * 16 - (disty << 4) -
++		(distx << 4) + distxy; /* (16 - distx) * (16 - disty) */
++
++	lo = (tl & 0xff00ff) * distixiy; /* lo accumulates bytes 0 and 2 of every pixel */
++	hi = ((tl >> 8) & 0xff00ff) * distixiy; /* hi accumulates bytes 1 and 3, shifted down by 8 */
++
++	lo += (tr & 0xff00ff) * distxiy;
++	hi += ((tr >> 8) & 0xff00ff) * distxiy;
++
++	lo += (bl & 0xff00ff) * distixy;
++	hi += ((bl >> 8) & 0xff00ff) * distixy;
++
++	lo += (br & 0xff00ff) * distxy;
++	hi += ((br >> 8) & 0xff00ff) * distxy;
++
++	return ((lo >> 8) & 0xff00ff) | (hi & ~0xff00ff); /* the four weights sum to 256: >> 8 normalises lo, while hi already sits 8 bits up in the positions of bytes 1 and 3 */
++}
++#elif SIZEOF_LONG > 4
++static inline uint32_t /* variant compiled when BILINEAR_INTERPOLATION_BITS > 4 and SIZEOF_LONG > 4: 8-bit weights, two channels per 64-bit multiply */
++bilinear_interpolation(uint32_t tl, uint32_t tr,
++		       uint32_t bl, uint32_t br,
++		       int distx, int disty)
++{
++	uint64_t distxy, distxiy, distixy, distixiy;
++	uint64_t tl64, tr64, bl64, br64;
++	uint64_t f, r;
++
++	distx <<= (8 - BILINEAR_INTERPOLATION_BITS); /* rescale the weight to the 0..255 range used below */
++	disty <<= (8 - BILINEAR_INTERPOLATION_BITS);
++
++	distxy = distx * disty; /* the four weights below sum to 256 * 256 */
++	distxiy = distx * (256 - disty);
++	distixy = (256 - distx) * disty;
++	distixiy = (256 - distx) * (256 - disty);
++
++	/* Alpha and Blue */
++	tl64 = tl & 0xff0000ff;
++	tr64 = tr & 0xff0000ff;
++	bl64 = bl & 0xff0000ff;
++	br64 = br & 0xff0000ff;
++
++	f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
++	r = f & 0x0000ff0000ff0000ull; /* keep the top 8 bits of each weighted channel */
++
++	/* Red and Green */
++	tl64 = tl;
++	tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull); /* move byte 2 up to bits 32-39 so the two products cannot overlap */
++
++	tr64 = tr;
++	tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
++
++	bl64 = bl;
++	bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
++
++	br64 = br;
++	br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
++
++	f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
++	r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
++
++	return (uint32_t)(r >> 16); /* drop the 16 bits of weight scale */
++}
++#else
++static inline uint32_t /* fallback variant (BILINEAR_INTERPOLATION_BITS > 4 and SIZEOF_LONG <= 4): 8-bit weights, one channel at a time in 32-bit arithmetic */
++bilinear_interpolation(uint32_t tl, uint32_t tr,
++		       uint32_t bl, uint32_t br,
++		       int distx, int disty)
++{
++	int distxy, distxiy, distixy, distixiy;
++	uint32_t f, r;
++
++	distx <<= (8 - BILINEAR_INTERPOLATION_BITS); /* rescale the weight to the 0..255 range used below */
++	disty <<= (8 - BILINEAR_INTERPOLATION_BITS);
++
++	distxy = distx * disty;
++	distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */
++	distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */
++	distixiy =
++		256 * 256 - (disty << 8) -
++		(distx << 8) + distxy;		/* (256 - distx) * (256 - disty) */
++
++	/* Blue */
++	r = ((tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy +
++	     (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy);
++
++	/* Green */
++	f = ((tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy +
++	     (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy);
++	r |= f & 0xff000000;
++
++	tl >>= 16; /* bring the upper two channels of each pixel down into the low 16 bits */
++	tl >>= 0; /* (no-op placeholder removed) */
++	bl >>= 16;
++	br >>= 16;
++	r >>= 16; /* drop the 16 bits of weight scale from the two channels computed so far */
++
++	/* Red */
++	f = ((tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy +
++	     (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy);
++	r |= f & 0x00ff0000;
++
++	/* Alpha */
++	f = ((tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy +
++	     (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy);
++	r |= f & 0xff000000;
++
++	return r;
++}
++#endif
++
++static inline uint32_t convert_pixel(const uint8_t *p, int x) /* returns the x'th 32-bit word counted from p */
++{
++	return ((uint32_t *)p)[x]; /* NOTE(review): the cast drops const and assumes p is suitably aligned for uint32_t - confirm for every caller */
++}
++
++fast void /* fills the dst rectangle by bilinearly sampling src at positions given by the transform t; only bpp == 32 is accepted */
++affine_blt(const void *src, void *dst, int bpp,
++	   int16_t src_x, int16_t src_y,
++	   int16_t src_width, int16_t src_height,
++	   int32_t src_stride,
++	   int16_t dst_x, int16_t dst_y,
++	   uint16_t dst_width, uint16_t dst_height,
++	   int32_t dst_stride,
++	   const struct pixman_f_transform *t)
++{
++	static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; /* two zero pixels standing in for a sample row above or below src */
++	const pixman_fixed_t ux = pixman_double_to_fixed(t->m[0][0]); /* source x step per destination pixel */
++	const pixman_fixed_t uy = pixman_double_to_fixed(t->m[1][0]); /* source y step per destination pixel */
++	int i, j;
++
++	assert(bpp == 32);
++
++	for (j = 0; j < dst_height; j++) {
++		pixman_fixed_t x, y;
++		struct pixman_f_vector v;
++		uint32_t *b;
++
++		/* reference point is the center of the pixel */
++		v.v[0] = dst_x + 0.5;
++		v.v[1] = dst_y + j + 0.5;
++		v.v[2] = 1.0;
++
++		pixman_f_transform_point_3d(t, &v); /* transform the first pixel centre of this destination row */
++
++		x = pixman_double_to_fixed(v.v[0]);
++		x += pixman_int_to_fixed(src_x - dst_x); /* offset by the difference between the source and destination origins */
++		y = pixman_double_to_fixed(v.v[1]);
++		y +=  pixman_int_to_fixed(src_y - dst_y);
++
++		b = (uint32_t*)((uint8_t *)dst + (dst_y + j) * dst_stride + dst_x * bpp / 8); /* first destination pixel of row j */
++		for (i = 0; i < dst_width; i++) {
++			const uint8_t *row1;
++			const uint8_t *row2;
++			int x1, y1, x2, y2;
++			uint32_t tl, tr, bl, br;
++			int32_t fx, fy;
++
++			x1 = x - pixman_fixed_1/2; /* shift by half a pixel so (x1, y1) names the top-left of the four samples */
++			y1 = y - pixman_fixed_1/2;
++
++			fx = bilinear_weight(x1); /* weights are taken before x1/y1 are truncated to integers */
++			fy = bilinear_weight(y1);
++
++			x1 = pixman_fixed_to_int(x1);
++			x2 = x1 + 1;
++			y1 = pixman_fixed_to_int(y1);
++			y2 = y1 + 1;
++
++			if (x1 >= src_width  || x2 < 0 ||
++			    y1 >= src_height || y2 < 0) { /* all four samples fall outside the source: write 0 */
++				b[i] = 0;
++				goto next;
++			}
++
++			if (y2 == 0) { /* y1 == -1: the upper sample row is above the source */
++				row1 = zero;
++			} else {
++				row1 = (uint8_t *)src + src_stride * y1;
++				row1 += bpp / 8 * x1;
++			}
++
++			if (y1 == src_height - 1) { /* the lower sample row would be src_height, below the source */
++				row2 = zero;
++			} else {
++				row2 = (uint8_t *)src + src_stride * y2;
++				row2 += bpp / 8 * x1;
++			}
++
++			if (x2 == 0) { /* x1 == -1: the left samples are outside the source */
++				tl = 0;
++				bl = 0;
++			} else {
++				tl = convert_pixel(row1, 0);
++				bl = convert_pixel(row2, 0);
++			}
++
++			if (x1 == src_width - 1) { /* the right samples would be at src_width, outside the source */
++				tr = 0;
++				br = 0;
++			} else {
++				tr = convert_pixel(row1, 1);
++				br = convert_pixel(row2, 1);
++			}
++
++			b[i] = bilinear_interpolation(tl, tr, bl, br, fx, fy);
++
++next:
++			x += ux; /* advance the source position by one destination pixel */
++			y += uy;
++		}
++	}
++}
+diff --git a/src/sna/brw/brw_eu_emit.c b/src/sna/brw/brw_eu_emit.c
+index 00c984d9..154f939a 100644
+--- a/src/sna/brw/brw_eu_emit.c
++++ b/src/sna/brw/brw_eu_emit.c
+@@ -178,7 +178,7 @@ validate_reg(struct brw_instruction *insn, struct brw_reg reg)
+ 	}
+ 
+ 	if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+-	    reg.file == BRW_ARF_NULL)
++	    reg.nr == BRW_ARF_NULL)
+ 		return;
+ 
+ 	assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg));
+@@ -700,7 +700,7 @@ push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
+  *
+  * When the matching 'else' instruction is reached (presumably by
+  * countdown of the instruction count patched in by our ELSE/ENDIF
+- * functions), the relevent flags are inverted.
++ * functions), the relevant flags are inverted.
+  *
+  * When the matching 'endif' instruction is reached, the flags are
+  * popped off.  If the stack is now empty, normal execution resumes.
+diff --git a/src/sna/compiler.h b/src/sna/compiler.h
+index ff412179..0f3775ec 100644
+--- a/src/sna/compiler.h
++++ b/src/sna/compiler.h
+@@ -39,6 +39,7 @@
+ #define pure __attribute__((pure))
+ #define tightly_packed __attribute__((__packed__))
+ #define flatten __attribute__((flatten))
++#define nonnull __attribute__((nonnull))
+ #define page_aligned __attribute__((aligned(4096)))
+ #else
+ #define likely(expr) (expr)
+@@ -51,18 +52,15 @@
+ #define pure
+ #define tighly_packed
+ #define flatten
++#define nonnull
+ #define page_aligned
+ #endif
+ 
+ #define HAS_GCC(major, minor) defined(__GNUC__) && (__GNUC__ > (major) || __GNUC__ == (major) && __GNUC_MINOR__ >= (minor))
+ 
+ #if HAS_GCC(4, 5)
+-#define sse2 __attribute__((target("sse2,fpmath=sse")))
+-#define sse4_2 __attribute__((target("sse4.2,sse2,fpmath=sse")))
+-#endif
+-
+-#if HAS_GCC(4, 7)
+-#define avx2 __attribute__((target("avx2,sse4.2,sse2,fpmath=sse")))
++#define sse2 fast __attribute__((target("sse2,fpmath=sse")))
++#define sse4_2 fast __attribute__((target("sse4.2,sse2,fpmath=sse")))
+ #endif
+ 
+ #if HAS_GCC(4, 6) && defined(__OPTIMIZE__)
+@@ -71,10 +69,17 @@
+ #define fast
+ #endif
+ 
+-#if HAS_GCC(4, 6) && defined(__OPTIMIZE__)
+-#define fast_memcpy __attribute__((optimize("Ofast"))) __attribute__((target("inline-all-stringops")))
+-#elif HAS_GCC(4, 5) && defined(__OPTIMIZE__)
+-#define fast_memcpy __attribute__((target("inline-all-stringops")))
++#if HAS_GCC(4, 7)
++#define avx2 fast __attribute__((target("avx2,avx,sse4.2,sse2,fpmath=sse")))
++#define assume_aligned(ptr, align) __builtin_assume_aligned((ptr), (align))
++#define assume_misaligned(ptr, align, offset) __builtin_assume_aligned((ptr), (align), (offset))
++#else
++#define assume_aligned(ptr, align) (ptr)
++#define assume_misaligned(ptr, align, offset) (ptr)
++#endif
++
++#if HAS_GCC(4, 5) && defined(__OPTIMIZE__)
++#define fast_memcpy fast __attribute__((target("inline-all-stringops")))
+ #else
+ #define fast_memcpy
+ #endif
+diff --git a/src/sna/fb/fb.h b/src/sna/fb/fb.h
+index 8bf9008a..90431747 100644
+--- a/src/sna/fb/fb.h
++++ b/src/sna/fb/fb.h
+@@ -24,10 +24,6 @@
+ #ifndef FB_H
+ #define FB_H
+ 
+-#ifdef HAVE_CONFIG_H
+-#include "config.h"
+-#endif
+-
+ #include <xorg-server.h>
+ #include <servermd.h>
+ #include <gcstruct.h>
+diff --git a/src/sna/fb/fbimage.c b/src/sna/fb/fbimage.c
+index 5af23890..cc81c85b 100644
+--- a/src/sna/fb/fbimage.c
++++ b/src/sna/fb/fbimage.c
+@@ -229,13 +229,19 @@ fbGetImage(DrawablePtr drawable,
+ 		FbBits pm;
+ 
+ 		pm = fbReplicatePixel(planeMask, srcBpp);
++
+ 		dstStride = PixmapBytePad(w, drawable->depth);
+-		if (pm != FB_ALLONES)
+-			memset(d, 0, dstStride * h);
+ 		dstStride /= sizeof(FbStip);
++
+ 		fbBltStip((FbStip *)(src + (y + srcYoff) * srcStride), srcStride,
+ 			  (x + srcXoff) * srcBpp,
+-			  dst, dstStride, 0, w * srcBpp, h, GXcopy, pm, srcBpp);
++			  dst, dstStride, 0, w * srcBpp, h, GXcopy, FB_ALLONES, srcBpp);
++
++		if (pm != FB_ALLONES) {
++			int i = dstStride * h;
++			while (i--)
++				*dst++ &= pm;
++		}
+ 	} else {
+ 		dstStride = BitmapBytePad(w) / sizeof(FbStip);
+ 		fbBltPlane(src + (y + srcYoff) * srcStride,
+diff --git a/src/sna/fb/fbpict.h b/src/sna/fb/fbpict.h
+index 932032f9..20877777 100644
+--- a/src/sna/fb/fbpict.h
++++ b/src/sna/fb/fbpict.h
+@@ -24,10 +24,6 @@
+ #ifndef FBPICT_H
+ #define FBPICT_H
+ 
+-#ifdef HAVE_CONFIG_H
+-#include "config.h"
+-#endif
+-
+ #include <xorg-server.h>
+ #include <picturestr.h>
+ 
+diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
+index 1104f462..49ad16a3 100644
+--- a/src/sna/gen2_render.c
++++ b/src/sna/gen2_render.c
+@@ -35,6 +35,7 @@
+ #include "sna_reg.h"
+ #include "sna_render.h"
+ #include "sna_render_inline.h"
++#include "sna_video.h"
+ 
+ #include "gen2_render.h"
+ 
+@@ -48,6 +49,7 @@
+ 
+ #define MAX_3D_SIZE 2048
+ #define MAX_3D_PITCH 8192
++#define MAX_INLINE (1 << 18)
+ 
+ #define BATCH(v) batch_emit(sna, v)
+ #define BATCH_F(v) batch_emit_float(sna, v)
+@@ -596,39 +598,43 @@ gen2_get_batch(struct sna *sna, const struct sna_composite_op *op)
+ 		gen2_emit_invariant(sna);
+ }
+ 
+-static void gen2_emit_target(struct sna *sna, const struct sna_composite_op *op)
++static void gen2_emit_target(struct sna *sna, /* emits the destination-buffer state for bo unless bo is already the recorded target */
++			     struct kgem_bo *bo,
++			     int width,
++			     int height,
++			     int format)
+ {
+-	assert(!too_large(op->dst.width, op->dst.height));
+-	assert(op->dst.bo->pitch >= 8 && op->dst.bo->pitch <= MAX_3D_PITCH);
++	assert(!too_large(width, height));
++	assert(bo->pitch >= 8 && bo->pitch <= MAX_3D_PITCH);
+ 	assert(sna->render.vertex_offset == 0);
+ 
+-	assert(op->dst.bo->unique_id);
+-	if (sna->render_state.gen2.target == op->dst.bo->unique_id) {
+-		kgem_bo_mark_dirty(op->dst.bo);
++	assert(bo->unique_id);
++	if (sna->render_state.gen2.target == bo->unique_id) { /* same target as recorded by the previous emit: nothing to re-emit */
++		kgem_bo_mark_dirty(bo);
+ 		return;
+ 	}
+ 
+ 	BATCH(_3DSTATE_BUF_INFO_CMD);
+ 	BATCH(BUF_3D_ID_COLOR_BACK |
+-	      gen2_buf_tiling(op->dst.bo->tiling) |
+-	      BUF_3D_PITCH(op->dst.bo->pitch));
++	      gen2_buf_tiling(bo->tiling) |
++	      BUF_3D_PITCH(bo->pitch));
+ 	BATCH(kgem_add_reloc(&sna->kgem, sna->kgem.nbatch,
+-			     op->dst.bo,
++			     bo,
+ 			     I915_GEM_DOMAIN_RENDER << 16 |
+ 			     I915_GEM_DOMAIN_RENDER,
+ 			     0));
+ 
+ 	BATCH(_3DSTATE_DST_BUF_VARS_CMD);
+-	BATCH(gen2_get_dst_format(op->dst.format));
++	BATCH(gen2_get_dst_format(format));
+ 
+ 	BATCH(_3DSTATE_DRAW_RECT_CMD);
+ 	BATCH(0);
+ 	BATCH(0);	/* ymin, xmin */
+-	BATCH(DRAW_YMAX(op->dst.height - 1) |
+-	      DRAW_XMAX(op->dst.width - 1));
++	BATCH(DRAW_YMAX(height - 1) |
++	      DRAW_XMAX(width - 1));
+ 	BATCH(0);	/* yorig, xorig */
+ 
+-	sna->render_state.gen2.target = op->dst.bo->unique_id;
++	sna->render_state.gen2.target = bo->unique_id; /* record the target so a repeat call with the same bo returns early */
+ }
+ 
+ static void gen2_disable_logic_op(struct sna *sna)
+@@ -701,7 +707,11 @@ static void gen2_emit_composite_state(struct sna *sna,
+ 		kgem_clear_dirty(&sna->kgem);
+ 	}
+ 
+-	gen2_emit_target(sna, op);
++	gen2_emit_target(sna,
++			 op->dst.bo,
++			 op->dst.width,
++			 op->dst.height,
++			 op->dst.format);
+ 
+ 	unwind = sna->kgem.nbatch;
+ 	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
+@@ -1190,7 +1200,13 @@ inline static int gen2_get_rectangles(struct sna *sna,
+ 			sna->render.vertex_offset = sna->kgem.nbatch;
+ 			BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST);
+ 		}
+-	}
++
++		need = 0;
++	} else
++		need = sna->kgem.nbatch - sna->render.vertex_offset;
++
++	if (rem > MAX_INLINE - need)
++		rem = MAX_INLINE -need;
+ 
+ 	if (want > 1 && want * size > rem)
+ 		want = rem / size;
+@@ -1572,12 +1588,12 @@ gen2_composite_picture(struct sna *sna,
+ 		if (channel->repeat &&
+ 		    (x >= 0 &&
+ 		     y >= 0 &&
+-		     x + w < pixmap->drawable.width &&
+-		     y + h < pixmap->drawable.height)) {
++		     x + w <= pixmap->drawable.width &&
++		     y + h <= pixmap->drawable.height)) {
+ 			struct sna_pixmap *priv = sna_pixmap(pixmap);
+ 			if (priv && priv->clear) {
+ 				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
+-				return gen2_composite_solid_init(sna, channel, priv->clear_color);
++				return gen2_composite_solid_init(sna, channel, solid_color(picture->format, priv->clear_color));
+ 			}
+ 		}
+ 	} else
+@@ -1619,7 +1635,9 @@ gen2_composite_set_target(struct sna *sna,
+ 	} else
+ 		sna_render_picture_extents(dst, &box);
+ 
+-	hint = PREFER_GPU | FORCE_GPU | RENDER_GPU;
++	hint = PREFER_GPU | RENDER_GPU;
++	if (!need_tiling(sna, op->dst.width, op->dst.height))
++		hint |= FORCE_GPU;
+ 	if (!partial) {
+ 		hint |= IGNORE_DAMAGE;
+ 		if (w == op->dst.width && h == op->dst.height)
+@@ -2423,7 +2441,11 @@ static void gen2_emit_composite_spans_state(struct sna *sna,
+ 	uint32_t unwind;
+ 
+ 	gen2_get_batch(sna, &op->base);
+-	gen2_emit_target(sna, &op->base);
++	gen2_emit_target(sna,
++			 op->base.dst.bo,
++			 op->base.dst.width,
++			 op->base.dst.height,
++			 op->base.dst.format);
+ 
+ 	unwind = sna->kgem.nbatch;
+ 	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
+@@ -2706,7 +2728,11 @@ static void gen2_emit_fill_composite_state(struct sna *sna,
+ 	uint32_t ls1;
+ 
+ 	gen2_get_batch(sna, op);
+-	gen2_emit_target(sna, op);
++	gen2_emit_target(sna,
++			 op->dst.bo,
++			 op->dst.width,
++			 op->dst.height,
++			 op->dst.format);
+ 
+ 	ls1 = sna->kgem.nbatch;
+ 	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
+@@ -2868,7 +2894,11 @@ static void gen2_emit_fill_state(struct sna *sna,
+ 	uint32_t ls1;
+ 
+ 	gen2_get_batch(sna, op);
+-	gen2_emit_target(sna, op);
++	gen2_emit_target(sna,
++			 op->dst.bo,
++			 op->dst.width,
++			 op->dst.height,
++			 op->dst.format);
+ 
+ 	ls1 = sna->kgem.nbatch;
+ 	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
+@@ -3102,6 +3132,276 @@ gen2_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
+ }
+ 
+ static void
++gen2_emit_video_state(struct sna *sna,
++		      struct sna_video *video,
++		      struct sna_video_frame *frame,
++		      PixmapPtr pixmap,
++		      struct kgem_bo *dst_bo,
++		      int width, int height,
++		      bool bilinear)
++{
++	uint32_t ms1, v, unwind;
++	/* Program the target and texture unit 0 to sample 'frame'; 'video' is not read here. */
++	gen2_emit_target(sna, dst_bo, width, height,
++			 sna_format_for_depth(pixmap->drawable.depth));
++	/* S2/S3/S8: rewind the batch if the payload equals the packet recorded at gen2.ls1. */
++	unwind = sna->kgem.nbatch;
++	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
++	      I1_LOAD_S(2) | I1_LOAD_S(3) | I1_LOAD_S(8) | 2);
++	BATCH(1 << 12);
++	BATCH(S3_CULLMODE_NONE | S3_VERTEXHAS_XY);
++	BATCH(S8_ENABLE_COLOR_BUFFER_WRITE);
++	if (memcmp(sna->kgem.batch + sna->render_state.gen2.ls1 + 1,
++		   sna->kgem.batch + unwind + 1,
++		   3 * sizeof(uint32_t)) == 0)
++		sna->kgem.nbatch = unwind;
++	else
++		sna->render_state.gen2.ls1 = unwind;
++
++	gen2_disable_logic_op(sna);
++	/* Blend stage 0 (TB0C selects TEXEL0, TB0A selects ONE); same rewind check against gen2.ls2. */
++	unwind = sna->kgem.nbatch;
++	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_2 |
++	      LOAD_TEXTURE_BLEND_STAGE(0) | 1);
++	BATCH(TB0C_LAST_STAGE | TB0C_RESULT_SCALE_1X | TB0C_OUTPUT_WRITE_CURRENT |
++	      TB0C_OP_ARG1 | TB0C_ARG1_SEL_TEXEL0);
++	BATCH(TB0A_RESULT_SCALE_1X | TB0A_OUTPUT_WRITE_CURRENT |
++	      TB0A_OP_ARG1 | TB0A_ARG1_SEL_ONE);
++	if (memcmp(sna->kgem.batch + sna->render_state.gen2.ls2 + 1,
++		   sna->kgem.batch + unwind + 1,
++		   2 * sizeof(uint32_t)) == 0)
++		sna->kgem.nbatch = unwind;
++	else
++		sna->render_state.gen2.ls2 = unwind;
++	/* Texture map 0: sample frame->bo as MAPSURF_422 with TM0S1_COLORSPACE_CONVERSION set. */
++	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_2 | LOAD_TEXTURE_MAP(0) | 4);
++	BATCH(kgem_add_reloc(&sna->kgem, sna->kgem.nbatch,
++			     frame->bo,
++			     I915_GEM_DOMAIN_SAMPLER << 16,
++			     0));
++	ms1 = MAPSURF_422 | TM0S1_COLORSPACE_CONVERSION;
++	switch (frame->id) {
++	case FOURCC_YUY2:
++		ms1 |= MT_422_YCRCB_NORMAL;
++		break;
++	case FOURCC_UYVY:
++		ms1 |= MT_422_YCRCB_SWAPY;
++		break;
++	}	/* no default: other ids add no MT_422_* bits. NOTE(review): confirm only YUY2/UYVY reach here */
++	BATCH(((frame->height - 1) << TM0S1_HEIGHT_SHIFT) |
++	      ((frame->width - 1)  << TM0S1_WIDTH_SHIFT) |
++	      ms1 |
++	      gen2_sampler_tiling_bits(frame->bo->tiling));
++	BATCH((frame->pitch[0] / 4 - 1) << TM0S2_PITCH_SHIFT | TM0S2_MAP_2D);
++	if (bilinear)	/* linear min/mag filtering when requested, nearest otherwise */
++		BATCH(FILTER_LINEAR << TM0S3_MAG_FILTER_SHIFT |
++		      FILTER_LINEAR << TM0S3_MIN_FILTER_SHIFT |
++		      MIPFILTER_NONE << TM0S3_MIP_FILTER_SHIFT);
++	else
++		BATCH(FILTER_NEAREST << TM0S3_MAG_FILTER_SHIFT |
++		      FILTER_NEAREST << TM0S3_MIN_FILTER_SHIFT |
++		      MIPFILTER_NONE << TM0S3_MIP_FILTER_SHIFT);
++	BATCH(0);	/* default color */
++	/* Texcoord set 0 is flagged TEXCOORDS_ARE_NORMAL and clamped on both U and V. */
++	BATCH(_3DSTATE_MAP_COORD_SET_CMD | TEXCOORD_SET(0) |
++	      ENABLE_TEXCOORD_PARAMS | TEXCOORDS_ARE_NORMAL | TEXCOORDTYPE_CARTESIAN |
++	      ENABLE_ADDR_V_CNTL | TEXCOORD_ADDR_V_MODE(TEXCOORDMODE_CLAMP) |
++	      ENABLE_ADDR_U_CNTL | TEXCOORD_ADDR_U_MODE(TEXCOORDMODE_CLAMP));
++	/* The vertex format is only emitted when it differs from the cached gen2.vft. */
++	v = _3DSTATE_VERTEX_FORMAT_2_CMD | TEXCOORDFMT_2D;
++	if (sna->render_state.gen2.vft != v) {
++		BATCH(v);
++		sna->render_state.gen2.vft = v;
++	}
++}
++
++static void
++gen2_video_get_batch(struct sna *sna, struct kgem_bo *bo)
++{
++	kgem_set_mode(&sna->kgem, KGEM_RENDER, bo);	/* enter KGEM_RENDER mode for bo */
++	/* If any of the batch/reloc/exec checks fails, submit what is queued and re-enter render mode. */
++	if (!kgem_check_batch(&sna->kgem, 120) ||
++	    !kgem_check_reloc(&sna->kgem, 4) ||
++	    !kgem_check_exec(&sna->kgem, 2)) {
++		_kgem_submit(&sna->kgem);
++		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
++	}
++	/* Emit the invariant state whenever render_state flags it as needed. */
++	if (sna->render_state.gen2.need_invariant)
++		gen2_emit_invariant(sna);
++}
++
++static int
++gen2_get_inline_rectangles(struct sna *sna, int want, int floats_per_vertex)
++{
++	/* A rectangle is emitted as three vertices of floats_per_vertex each. */
++	const int per_rect = 3 * floats_per_vertex;
++	/* batch_space(sna) less one entry, capped at MAX_INLINE. */
++	int avail = batch_space(sna) - 1;
++
++	if (avail > MAX_INLINE)
++		avail = MAX_INLINE;
++
++	/* Hand back 'want', or as many whole rectangles as still fit. */
++	return per_rect * want > avail ? avail / per_rect : want;
++}
++
++static bool
++gen2_render_video(struct sna *sna,
++		  struct sna_video *video,	/* only forwarded to gen2_emit_video_state */
++		  struct sna_video_frame *frame,	/* source frame: bo, id, size, src rect */
++		  RegionPtr dstRegion,	/* destination boxes to fill */
++		  PixmapPtr pixmap)	/* destination; its GPU bo must exist */
++{
++	struct sna_pixmap *priv = sna_pixmap(pixmap);
++	const BoxRec *pbox = region_rects(dstRegion);
++	int nbox = region_num_rects(dstRegion);
++	int dst_width = dstRegion->extents.x2 - dstRegion->extents.x1;
++	int dst_height = dstRegion->extents.y2 - dstRegion->extents.y1;
++	int src_width = frame->src.x2 - frame->src.x1;
++	int src_height = frame->src.y2 - frame->src.y1;
++	float src_offset_x, src_offset_y;
++	float src_scale_x, src_scale_y;
++	int pix_xoff, pix_yoff;
++	struct kgem_bo *dst_bo;
++	bool bilinear;
++	int copy = 0;
++	/* Draw 'frame' into the pixmap for each box of dstRegion; false only if the temporary-target path cannot be set up. */
++	DBG(("%s: src:%dx%d (frame:%dx%d) -> dst:%dx%d\n", __FUNCTION__,
++	     src_width, src_height, frame->width, frame->height, dst_width, dst_height));
++
++	assert(priv->gpu_bo);
++	dst_bo = priv->gpu_bo;
++
++	bilinear = src_width != dst_width || src_height != dst_height;	/* filter only when scaling */
++
++	src_scale_x = (float)src_width / dst_width / frame->width;
++	src_offset_x = (float)frame->src.x1 / frame->width - dstRegion->extents.x1 * src_scale_x;
++
++	src_scale_y = (float)src_height / dst_height / frame->height;
++	src_offset_y = (float)frame->src.y1 / frame->height - dstRegion->extents.y1 * src_scale_y;
++	DBG(("%s: src offset (%f, %f), scale (%f, %f)\n",
++	     __FUNCTION__, src_offset_x, src_offset_y, src_scale_x, src_scale_y));
++	/* A pixmap that is too large or too wide in pitch is rendered via a region-sized temporary bo and blitted back. */
++	if (too_large(pixmap->drawable.width, pixmap->drawable.height) ||
++	    dst_bo->pitch > MAX_3D_PITCH) {
++		int bpp = pixmap->drawable.bitsPerPixel;
++
++		if (too_large(dst_width, dst_height))
++			return false;
++
++		dst_bo = kgem_create_2d(&sna->kgem,
++					dst_width, dst_height, bpp,
++					kgem_choose_tiling(&sna->kgem,
++							   I915_TILING_X,
++							   dst_width, dst_height, bpp),
++					0);
++		if (!dst_bo)
++			return false;
++
++		pix_xoff = -dstRegion->extents.x1;
++		pix_yoff = -dstRegion->extents.y1;
++		copy = 1;
++	} else {
++		/* Set up the offset for translating from the given region
++		 * (in screen coordinates) to the backing pixmap.
++		 */
++#ifdef COMPOSITE
++		pix_xoff = -pixmap->screen_x + pixmap->drawable.x;
++		pix_yoff = -pixmap->screen_y + pixmap->drawable.y;
++#else
++		pix_xoff = 0;
++		pix_yoff = 0;
++#endif
++
++		dst_width  = pixmap->drawable.width;
++		dst_height = pixmap->drawable.height;
++	}
++
++	gen2_video_get_batch(sna, dst_bo);
++	gen2_emit_video_state(sna, video, frame, pixmap,
++			      dst_bo, dst_width, dst_height, bilinear);
++	while (nbox) {	/* pre-tested: an empty region must not emit a primitive or walk pbox */
++		int nbox_this_time = gen2_get_inline_rectangles(sna, nbox, 4);
++		if (nbox_this_time == 0) {
++			gen2_video_get_batch(sna, dst_bo);
++			gen2_emit_video_state(sna, video, frame, pixmap,
++					      dst_bo, dst_width, dst_height, bilinear);
++			nbox_this_time = gen2_get_inline_rectangles(sna, nbox, 4);
++			assert(nbox_this_time);
++		}
++		nbox -= nbox_this_time;
++		/* Each rectangle is 3 vertices of 4 floats (x, y, s, t), hence 12 per box. */
++		BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST |
++		      ((12 * nbox_this_time) - 1));
++		do {
++			int box_x1 = pbox->x1;
++			int box_y1 = pbox->y1;
++			int box_x2 = pbox->x2;
++			int box_y2 = pbox->y2;
++
++			pbox++;
++
++			DBG(("%s: dst (%d, %d), (%d, %d) + (%d, %d); src (%f, %f), (%f, %f)\n",
++			     __FUNCTION__, box_x1, box_y1, box_x2, box_y2, pix_xoff, pix_yoff,
++			     box_x1 * src_scale_x + src_offset_x,
++			     box_y1 * src_scale_y + src_offset_y,
++			     box_x2 * src_scale_x + src_offset_x,
++			     box_y2 * src_scale_y + src_offset_y));
++
++			/* bottom right */
++			BATCH_F(box_x2 + pix_xoff);
++			BATCH_F(box_y2 + pix_yoff);
++			BATCH_F(box_x2 * src_scale_x + src_offset_x);
++			BATCH_F(box_y2 * src_scale_y + src_offset_y);
++
++			/* bottom left */
++			BATCH_F(box_x1 + pix_xoff);
++			BATCH_F(box_y2 + pix_yoff);
++			BATCH_F(box_x1 * src_scale_x + src_offset_x);
++			BATCH_F(box_y2 * src_scale_y + src_offset_y);
++
++			/* top left */
++			BATCH_F(box_x1 + pix_xoff);
++			BATCH_F(box_y1 + pix_yoff);
++			BATCH_F(box_x1 * src_scale_x + src_offset_x);
++			BATCH_F(box_y1 * src_scale_y + src_offset_y);
++		} while (--nbox_this_time);
++	}
++
++	if (copy) {	/* blit the temporary bo into the pixmap's bo, then release it */
++#ifdef COMPOSITE
++		pix_xoff = -pixmap->screen_x + pixmap->drawable.x;
++		pix_yoff = -pixmap->screen_y + pixmap->drawable.y;
++#else
++		pix_xoff = 0;
++		pix_yoff = 0;
++#endif
++		sna_blt_copy_boxes(sna, GXcopy,
++				   dst_bo, -dstRegion->extents.x1, -dstRegion->extents.y1,
++				   priv->gpu_bo, pix_xoff, pix_yoff,
++				   pixmap->drawable.bitsPerPixel,
++				   region_rects(dstRegion),
++				   region_num_rects(dstRegion));
++
++		kgem_bo_destroy(&sna->kgem, dst_bo);
++	}
++	/* Record the written boxes as GPU damage unless the pixmap is already fully damaged. */
++	if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
++		if ((pix_xoff | pix_yoff) == 0) {
++			sna_damage_add(&priv->gpu_damage, dstRegion);
++		} else {
++			sna_damage_add_boxes(&priv->gpu_damage,
++					     region_rects(dstRegion),
++					     region_num_rects(dstRegion),
++					     pix_xoff, pix_yoff);
++		}
++	}
++
++	return true;
++}
++
++static void
+ gen2_render_copy_setup_source(struct sna_composite_channel *channel,
+ 			      const DrawableRec *draw,
+ 			      struct kgem_bo *bo)
+@@ -3176,7 +3476,11 @@ static void gen2_emit_copy_state(struct sna *sna, const struct sna_composite_op
+ 			      PIPELINE_FLUSH_TEXTURE_CACHE);
+ 		kgem_clear_dirty(&sna->kgem);
+ 	}
+-	gen2_emit_target(sna, op);
++	gen2_emit_target(sna,
++			 op->dst.bo,
++			 op->dst.width,
++			 op->dst.height,
++			 op->dst.format);
+ 
+ 	ls1 = sna->kgem.nbatch;
+ 	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
+@@ -3511,7 +3815,7 @@ const char *gen2_render_init(struct sna *sna, const char *backend)
+ 	render->copy = gen2_render_copy;
+ 	render->copy_boxes = gen2_render_copy_boxes;
+ 
+-	/* XXX YUV color space conversion for video? */
++	render->video = gen2_render_video;
+ 
+ 	render->reset = gen2_render_reset;
+ 	render->flush = gen2_render_flush;
+diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
+index 78289f00..4459a562 100644
+--- a/src/sna/gen3_render.c
++++ b/src/sna/gen3_render.c
+@@ -448,14 +448,14 @@ gen3_emit_composite_boxes_constant(const struct sna_composite_op *op,
+ 				   float *v)
+ {
+ 	do {
+-		v[0] = box->x2;
+-		v[1] = box->y2;
++		v[0] = box->x2 + op->dst.x;
++		v[1] = box->y2 + op->dst.y;
+ 
+-		v[2] = box->x1;
+-		v[3] = box->y2;
++		v[2] = box->x1 + op->dst.x;
++		v[3] = box->y2 + op->dst.y;
+ 
+-		v[4] = box->x1;
+-		v[5] = box->y1;
++		v[4] = box->x1 + op->dst.x;
++		v[5] = box->y1 + op->dst.y;
+ 
+ 		box++;
+ 		v += 6;
+@@ -494,18 +494,18 @@ gen3_emit_composite_boxes_identity_gradient(const struct sna_composite_op *op,
+ 					    float *v)
+ {
+ 	do {
+-		v[0] = box->x2;
+-		v[1] = box->y2;
++		v[0] = box->x2 + op->dst.x;
++		v[1] = box->y2 + op->dst.y;
+ 		v[2] = box->x2 + op->src.offset[0];
+ 		v[3] = box->y2 + op->src.offset[1];
+ 
+-		v[4] = box->x1;
+-		v[5] = box->y2;
++		v[4] = box->x1 + op->dst.x;
++		v[5] = box->y2 + op->dst.y;
+ 		v[6] = box->x1 + op->src.offset[0];
+ 		v[7] = box->y2 + op->src.offset[1];
+ 
+-		v[8] = box->x1;
+-		v[9] = box->y1;
++		v[8] = box->x1 + op->dst.x;
++		v[9] = box->y1 + op->dst.y;
+ 		v[10] = box->x1 + op->src.offset[0];
+ 		v[11] = box->y1 + op->src.offset[1];
+ 
+@@ -531,6 +531,7 @@ gen3_emit_composite_primitive_affine_gradient(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 12;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = dst_x + r->width;
+ 	v[1] = dst_y + r->height;
+@@ -559,22 +560,22 @@ gen3_emit_composite_boxes_affine_gradient(const struct sna_composite_op *op,
+ 	const PictTransform *transform = op->src.transform;
+ 
+ 	do {
+-		v[0] = box->x2;
+-		v[1] = box->y2;
++		v[0] = box->x2 + op->dst.x;
++		v[1] = box->y2 + op->dst.y;
+ 		_sna_get_transformed_scaled(box->x2 + op->src.offset[0],
+ 					    box->y2 + op->src.offset[1],
+ 					    transform, op->src.scale,
+ 					    &v[2], &v[3]);
+ 
+-		v[4] = box->x1;
+-		v[5] = box->y2;
++		v[4] = box->x1 + op->dst.x;
++		v[5] = box->y2 + op->dst.y;
+ 		_sna_get_transformed_scaled(box->x1 + op->src.offset[0],
+ 					    box->y2 + op->src.offset[1],
+ 					    transform, op->src.scale,
+ 					    &v[6], &v[7]);
+ 
+-		v[8] = box->x1;
+-		v[9] = box->y1;
++		v[8] = box->x1 + op->dst.x;
++		v[9] = box->y1 + op->dst.y;
+ 		_sna_get_transformed_scaled(box->x1 + op->src.offset[0],
+ 					    box->y1 + op->src.offset[1],
+ 					    transform, op->src.scale,
+@@ -596,6 +597,7 @@ gen3_emit_composite_primitive_identity_source(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 12;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[8] = v[4] = r->dst.x + op->dst.x;
+ 	v[0] = v[4] + w;
+@@ -643,6 +645,7 @@ gen3_emit_composite_primitive_identity_source_no_offset(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 12;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[8] = v[4] = r->dst.x;
+ 	v[9] = r->dst.y;
+@@ -693,6 +696,7 @@ gen3_emit_composite_primitive_affine_source(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 12;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = dst_x + r->width;
+ 	v[5] = v[1] = dst_y + r->height;
+@@ -720,10 +724,10 @@ gen3_emit_composite_boxes_affine_source(const struct sna_composite_op *op,
+ 	const PictTransform *transform = op->src.transform;
+ 
+ 	do {
+-		v[0] = box->x2;
+-		v[5] = v[1] = box->y2;
+-		v[8] = v[4] = box->x1;
+-		v[9] = box->y1;
++		v[0] = box->x2 + op->dst.x;
++		v[5] = v[1] = box->y2 + op->dst.y;
++		v[8] = v[4] = box->x1 + op->dst.x;
++		v[9] = box->y1 + op->dst.y;
+ 
+ 		_sna_get_transformed_scaled(box->x2 + op->src.offset[0],
+ 					    box->y2 + op->src.offset[1],
+@@ -756,6 +760,7 @@ gen3_emit_composite_primitive_constant_identity_mask(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 12;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[8] = v[4] = r->dst.x + op->dst.x;
+ 	v[0] = v[4] + w;
+@@ -781,6 +786,7 @@ gen3_emit_composite_primitive_constant_identity_mask_no_offset(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 12;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[8] = v[4] = r->dst.x;
+ 	v[9] = r->dst.y;
+@@ -817,6 +823,7 @@ gen3_emit_composite_primitive_identity_source_mask(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 18;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = dst_x + w;
+ 	v[1] = dst_y + h;
+@@ -862,6 +869,7 @@ gen3_emit_composite_primitive_affine_source_mask(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 18;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = dst_x + w;
+ 	v[1] = dst_y + h;
+@@ -978,6 +986,7 @@ gen3_emit_composite_primitive_constant__sse2(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 6;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[4] = v[2] = r->dst.x + op->dst.x;
+ 	v[5] = r->dst.y + op->dst.y;
+@@ -993,10 +1002,10 @@ gen3_emit_composite_boxes_constant__sse2(const struct sna_composite_op *op,
+ 					 float *v)
+ {
+ 	do {
+-		v[0] = box->x2;
+-		v[3] = v[1] = box->y2;
+-		v[4] = v[2] = box->x1;
+-		v[5] = box->y1;
++		v[0] = box->x2 + op->dst.x;
++		v[3] = v[1] = box->y2 + op->dst.y;
++		v[4] = v[2] = box->x1 + op->dst.x;
++		v[5] = box->y1 + op->dst.y;
+ 
+ 		box++;
+ 		v += 6;
+@@ -1013,6 +1022,7 @@ gen3_emit_composite_primitive_identity_gradient__sse2(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 12;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	x = r->dst.x + op->dst.x;
+ 	y = r->dst.y + op->dst.y;
+@@ -1035,10 +1045,10 @@ gen3_emit_composite_boxes_identity_gradient__sse2(const struct sna_composite_op
+ 						  float *v)
+ {
+ 	do {
+-		v[0] = box->x2;
+-		v[5] = v[1] = box->y2;
+-		v[8] = v[4] = box->x1;
+-		v[9] = box->y1;
++		v[0] = box->x2 + op->dst.x;
++		v[5] = v[1] = box->y2 + op->dst.y;
++		v[8] = v[4] = box->x1 + op->dst.x;
++		v[9] = box->y1 + op->dst.y;
+ 
+ 		v[2] = box->x2 + op->src.offset[0];
+ 		v[7] = v[3] = box->y2 + op->src.offset[1];
+@@ -1067,6 +1077,7 @@ gen3_emit_composite_primitive_affine_gradient__sse2(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 12;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = dst_x + r->width;
+ 	v[1] = dst_y + r->height;
+@@ -1095,22 +1106,22 @@ gen3_emit_composite_boxes_affine_gradient__sse2(const struct sna_composite_op *o
+ 	const PictTransform *transform = op->src.transform;
+ 
+ 	do {
+-		v[0] = box->x2;
+-		v[1] = box->y2;
++		v[0] = box->x2 + op->dst.x;
++		v[1] = box->y2 + op->dst.y;
+ 		_sna_get_transformed_scaled(box->x2 + op->src.offset[0],
+ 					    box->y2 + op->src.offset[1],
+ 					    transform, op->src.scale,
+ 					    &v[2], &v[3]);
+ 
+-		v[4] = box->x1;
+-		v[5] = box->y2;
++		v[4] = box->x1 + op->dst.x;
++		v[5] = box->y2 + op->dst.y;
+ 		_sna_get_transformed_scaled(box->x1 + op->src.offset[0],
+ 					    box->y2 + op->src.offset[1],
+ 					    transform, op->src.scale,
+ 					    &v[6], &v[7]);
+ 
+-		v[8] = box->x1;
+-		v[9] = box->y1;
++		v[8] = box->x1 + op->dst.x;
++		v[9] = box->y1 + op->dst.y;
+ 		_sna_get_transformed_scaled(box->x1 + op->src.offset[0],
+ 					    box->y1 + op->src.offset[1],
+ 					    transform, op->src.scale,
+@@ -1132,6 +1143,7 @@ gen3_emit_composite_primitive_identity_source__sse2(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 12;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[8] = v[4] = r->dst.x + op->dst.x;
+ 	v[0] = v[4] + w;
+@@ -1179,6 +1191,7 @@ gen3_emit_composite_primitive_identity_source_no_offset__sse2(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 12;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[8] = v[4] = r->dst.x;
+ 	v[9] = r->dst.y;
+@@ -1227,8 +1240,12 @@ gen3_emit_composite_primitive_affine_source__sse2(struct sna *sna,
+ 	int src_y = r->src.y + (int)op->src.offset[1];
+ 	float *v;
+ 
++	DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d\n",
++	     __FUNCTION__, src_x, src_y, dst_x, dst_y, r->width, r->height));
++
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 12;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = dst_x + r->width;
+ 	v[5] = v[1] = dst_y + r->height;
+@@ -1256,10 +1273,13 @@ gen3_emit_composite_boxes_affine_source__sse2(const struct sna_composite_op *op,
+ 	const PictTransform *transform = op->src.transform;
+ 
+ 	do {
+-		v[0] = box->x2;
+-		v[5] = v[1] = box->y2;
+-		v[8] = v[4] = box->x1;
+-		v[9] = box->y1;
++		DBG(("%s: box=(%d, %d), (%d, %d), src.offset=(%d, %d)\n",
++		     __FUNCTION__, box->x1, box->y1, box->x2, box->y2, op->src.offset[0], op->src.offset[1]));
++
++		v[0] = box->x2 + op->dst.x;
++		v[5] = v[1] = box->y2 + op->dst.y;
++		v[8] = v[4] = box->x1 + op->dst.x;
++		v[9] = box->y1 + op->dst.y;
+ 
+ 		_sna_get_transformed_scaled(box->x2 + op->src.offset[0],
+ 					    box->y2 + op->src.offset[1],
+@@ -1292,6 +1312,7 @@ gen3_emit_composite_primitive_constant_identity_mask__sse2(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 12;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[8] = v[4] = r->dst.x + op->dst.x;
+ 	v[0] = v[4] + w;
+@@ -1317,6 +1338,7 @@ gen3_emit_composite_primitive_constant_identity_mask_no_offset__sse2(struct sna
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 12;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[8] = v[4] = r->dst.x;
+ 	v[9] = r->dst.y;
+@@ -1353,6 +1375,7 @@ gen3_emit_composite_primitive_identity_source_mask__sse2(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 18;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = dst_x + w;
+ 	v[1] = dst_y + h;
+@@ -1398,6 +1421,7 @@ gen3_emit_composite_primitive_affine_source_mask__sse2(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 18;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = dst_x + w;
+ 	v[1] = dst_y + h;
+@@ -2233,6 +2257,7 @@ static void gen3_vertex_flush(struct sna *sna)
+ static int gen3_vertex_finish(struct sna *sna)
+ {
+ 	struct kgem_bo *bo;
++	unsigned hint, size;
+ 
+ 	DBG(("%s: used=%d/%d, vbo active? %d\n",
+ 	     __FUNCTION__, sna->render.vertex_used, sna->render.vertex_size,
+@@ -2243,6 +2268,7 @@ static int gen3_vertex_finish(struct sna *sna)
+ 
+ 	sna_vertex_wait__locked(&sna->render);
+ 
++	hint = CREATE_GTT_MAP;
+ 	bo = sna->render.vbo;
+ 	if (bo) {
+ 		DBG(("%s: reloc = %d\n", __FUNCTION__,
+@@ -2251,7 +2277,7 @@ static int gen3_vertex_finish(struct sna *sna)
+ 		if (sna->render.vertex_reloc[0]) {
+ 			sna->kgem.batch[sna->render.vertex_reloc[0]] =
+ 				kgem_add_reloc(&sna->kgem, sna->render.vertex_reloc[0],
+-					       bo, I915_GEM_DOMAIN_VERTEX << 16, 0);
++					       bo, I915_GEM_DOMAIN_VERTEX << 16 | KGEM_RELOC_FENCED, 0);
+ 
+ 			sna->render.vertex_reloc[0] = 0;
+ 		}
+@@ -2260,17 +2286,29 @@ static int gen3_vertex_finish(struct sna *sna)
+ 		sna->render.vbo = NULL;
+ 
+ 		kgem_bo_destroy(&sna->kgem, bo);
++		hint |= CREATE_CACHED | CREATE_NO_THROTTLE;
+ 	}
+ 
++	size = 256*1024;
+ 	sna->render.vertices = NULL;
+-	sna->render.vbo = kgem_create_linear(&sna->kgem,
+-					     256*1024, CREATE_GTT_MAP);
+-	if (sna->render.vbo)
++	sna->render.vbo = kgem_create_linear(&sna->kgem, size, hint);
++	while (sna->render.vbo == NULL && size > sizeof(sna->render.vertex_data)) {
++		size /= 2;
++		sna->render.vbo = kgem_create_linear(&sna->kgem, size, hint);
++	}
++	if (sna->render.vbo == NULL)
++		sna->render.vbo = kgem_create_linear(&sna->kgem,
++						     256*1024, CREATE_GTT_MAP);
++	if (sna->render.vbo &&
++	    kgem_check_bo(&sna->kgem, sna->render.vbo, NULL))
+ 		sna->render.vertices = kgem_bo_map(&sna->kgem, sna->render.vbo);
+ 	if (sna->render.vertices == NULL) {
+-		if (sna->render.vbo)
++		if (sna->render.vbo) {
+ 			kgem_bo_destroy(&sna->kgem, sna->render.vbo);
+-		sna->render.vbo = NULL;
++			sna->render.vbo = NULL;
++		}
++		sna->render.vertices = sna->render.vertex_data;
++		sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+ 		return 0;
+ 	}
+ 	assert(sna->render.vbo->snoop == false);
+@@ -2280,8 +2318,14 @@ static int gen3_vertex_finish(struct sna *sna)
+ 		       sna->render.vertex_data,
+ 		       sizeof(float)*sna->render.vertex_used);
+ 	}
+-	sna->render.vertex_size = 64 * 1024 - 1;
+-	return sna->render.vertex_size - sna->render.vertex_used;
++
++	size = __kgem_bo_size(sna->render.vbo)/4;
++	if (size >= UINT16_MAX)
++		size = UINT16_MAX - 1;
++	assert(size > sna->render.vertex_used);
++
++	sna->render.vertex_size = size;
++	return size - sna->render.vertex_used;
+ }
+ 
+ static void gen3_vertex_close(struct sna *sna)
+@@ -2345,7 +2389,7 @@ static void gen3_vertex_close(struct sna *sna)
+ 	DBG(("%s: reloc = %d\n", __FUNCTION__, sna->render.vertex_reloc[0]));
+ 	sna->kgem.batch[sna->render.vertex_reloc[0]] =
+ 		kgem_add_reloc(&sna->kgem, sna->render.vertex_reloc[0],
+-			       bo, I915_GEM_DOMAIN_VERTEX << 16, delta);
++			       bo, I915_GEM_DOMAIN_VERTEX << 16 | KGEM_RELOC_FENCED, delta);
+ 	sna->render.vertex_reloc[0] = 0;
+ 
+ 	if (sna->render.vbo == NULL) {
+@@ -2580,6 +2624,7 @@ gen3_render_composite_boxes(struct sna *sna,
+ 
+ 		v = sna->render.vertices + sna->render.vertex_used;
+ 		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
++		assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 		op->emit_boxes(op, box, nbox_this_time, v);
+ 		box += nbox_this_time;
+@@ -2604,6 +2649,7 @@ gen3_render_composite_boxes__thread(struct sna *sna,
+ 
+ 		v = sna->render.vertices + sna->render.vertex_used;
+ 		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
++		assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 		sna_vertex_acquire__locked(&sna->render);
+ 		sna_vertex_unlock(&sna->render);
+@@ -3065,7 +3111,7 @@ gen3_composite_picture(struct sna *sna,
+ 
+ 	if (sna_picture_is_clear(picture, x, y, w, h, &color)) {
+ 		DBG(("%s: clear drawable [%08x]\n", __FUNCTION__, color));
+-		return gen3_init_solid(channel, color_convert(color, picture->format, PICT_a8r8g8b8));
++		return gen3_init_solid(channel, solid_color(picture->format, color));
+ 	}
+ 
+ 	if (!gen3_check_repeat(picture))
+@@ -3097,12 +3143,12 @@ gen3_composite_picture(struct sna *sna,
+ 		if (channel->repeat ||
+ 		    (x >= 0 &&
+ 		     y >= 0 &&
+-		     x + w < pixmap->drawable.width &&
+-		     y + h < pixmap->drawable.height)) {
++		     x + w <= pixmap->drawable.width &&
++		     y + h <= pixmap->drawable.height)) {
+ 			struct sna_pixmap *priv = sna_pixmap(pixmap);
+ 			if (priv && priv->clear) {
+ 				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
+-				return gen3_init_solid(channel, priv->clear_color);
++				return gen3_init_solid(channel, solid_color(picture->format, priv->clear_color));
+ 			}
+ 		}
+ 	} else {
+@@ -3182,7 +3228,9 @@ gen3_composite_set_target(struct sna *sna,
+ 	} else
+ 		sna_render_picture_extents(dst, &box);
+ 
+-	hint = PREFER_GPU | FORCE_GPU | RENDER_GPU;
++	hint = PREFER_GPU | RENDER_GPU;
++	if (!need_tiling(sna, op->dst.width, op->dst.height))
++		hint |= FORCE_GPU;
+ 	if (!partial) {
+ 		hint |= IGNORE_DAMAGE;
+ 		if (w == op->dst.width && h == op->dst.height)
+@@ -3645,8 +3693,11 @@ gen3_render_composite(struct sna *sna,
+ 			}
+ 		}
+ 	}
+-	DBG(("%s: final src/mask type=%d/%d, affine=%d/%d\n", __FUNCTION__,
++	DBG(("%s: final src/mask type=%d/%d [constant? %d/%d], transform? %d/%d, affine=%d/%d\n", __FUNCTION__,
+ 	     tmp->src.u.gen3.type, tmp->mask.u.gen3.type,
++	     is_constant_ps(tmp->src.u.gen3.type),
++	     is_constant_ps(tmp->mask.u.gen3.type),
++	     !!tmp->src.transform, !!tmp->mask.transform,
+ 	     tmp->src.is_affine, tmp->mask.is_affine));
+ 
+ 	tmp->prim_emit = gen3_emit_composite_primitive;
+@@ -3862,6 +3913,7 @@ gen3_emit_composite_spans_primitive_zero(struct sna *sna,
+ {
+ 	float *v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 6;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = op->base.dst.x + box->x2;
+ 	v[1] = op->base.dst.y + box->y2;
+@@ -3901,6 +3953,7 @@ gen3_emit_composite_spans_primitive_zero_no_offset(struct sna *sna,
+ {
+ 	float *v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 6;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = box->x2;
+ 	v[3] = v[1] = box->y2;
+@@ -3932,6 +3985,7 @@ gen3_emit_composite_spans_primitive_constant(struct sna *sna,
+ {
+ 	float *v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 9;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = op->base.dst.x + box->x2;
+ 	v[6] = v[3] = op->base.dst.x + box->x1;
+@@ -3966,6 +4020,7 @@ gen3_emit_composite_spans_primitive_constant_no_offset(struct sna *sna,
+ {
+ 	float *v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 9;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = box->x2;
+ 	v[6] = v[3] = box->x1;
+@@ -3999,6 +4054,7 @@ gen3_emit_composite_spans_primitive_identity_source(struct sna *sna,
+ {
+ 	float *v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 15;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = op->base.dst.x + box->x2;
+ 	v[1] = op->base.dst.y + box->y2;
+@@ -4060,6 +4116,7 @@ gen3_emit_composite_spans_primitive_affine_source(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 15;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0]  = op->base.dst.x + box->x2;
+ 	v[6]  = v[1] = op->base.dst.y + box->y2;
+@@ -4125,6 +4182,7 @@ gen3_emit_composite_spans_primitive_identity_gradient(struct sna *sna,
+ {
+ 	float *v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 15;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = op->base.dst.x + box->x2;
+ 	v[1] = op->base.dst.y + box->y2;
+@@ -4184,6 +4242,7 @@ gen3_emit_composite_spans_primitive_constant__sse2(struct sna *sna,
+ {
+ 	float *v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 9;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = op->base.dst.x + box->x2;
+ 	v[6] = v[3] = op->base.dst.x + box->x1;
+@@ -4229,6 +4288,7 @@ gen3_render_composite_spans_constant_box__sse2(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 9;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = box->x2;
+ 	v[6] = v[3] = box->x1;
+@@ -4259,6 +4319,7 @@ gen3_render_composite_spans_constant_thread__sse2__boxes(struct sna *sna,
+ 
+ 		v = sna->render.vertices + sna->render.vertex_used;
+ 		sna->render.vertex_used += nbox_this_time * 9;
++		assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 		sna_vertex_acquire__locked(&sna->render);
+ 		sna_vertex_unlock(&sna->render);
+@@ -4287,6 +4348,7 @@ gen3_emit_composite_spans_primitive_constant__sse2__no_offset(struct sna *sna,
+ {
+ 	float *v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 9;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = box->x2;
+ 	v[6] = v[3] = box->x1;
+@@ -4320,6 +4382,7 @@ gen3_emit_composite_spans_primitive_identity_source__sse2(struct sna *sna,
+ {
+ 	float *v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 15;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = op->base.dst.x + box->x2;
+ 	v[1] = op->base.dst.y + box->y2;
+@@ -4380,6 +4443,7 @@ gen3_emit_composite_spans_primitive_affine_source__sse2(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 15;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0]  = op->base.dst.x + box->x2;
+ 	v[6]  = v[1] = op->base.dst.y + box->y2;
+@@ -4445,6 +4509,7 @@ gen3_emit_composite_spans_primitive_identity_gradient__sse2(struct sna *sna,
+ {
+ 	float *v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 15;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = op->base.dst.x + box->x2;
+ 	v[1] = op->base.dst.y + box->y2;
+@@ -4504,6 +4569,7 @@ gen3_emit_composite_spans_primitive_affine_gradient__sse2(struct sna *sna,
+ 	PictTransform *transform = op->base.src.transform;
+ 	float *v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 15;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = op->base.dst.x + box->x2;
+ 	v[1] = op->base.dst.y + box->y2;
+@@ -4577,6 +4643,7 @@ gen3_emit_composite_spans_primitive_affine_gradient(struct sna *sna,
+ 	PictTransform *transform = op->base.src.transform;
+ 	float *v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 15;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = op->base.dst.x + box->x2;
+ 	v[1] = op->base.dst.y + box->y2;
+@@ -4676,6 +4743,7 @@ gen3_render_composite_spans_constant_box(struct sna *sna,
+ 
+ 	v = sna->render.vertices + sna->render.vertex_used;
+ 	sna->render.vertex_used += 9;
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 	v[0] = box->x2;
+ 	v[6] = v[3] = box->x1;
+@@ -4706,6 +4774,7 @@ gen3_render_composite_spans_constant_thread_boxes(struct sna *sna,
+ 
+ 		v = sna->render.vertices + sna->render.vertex_used;
+ 		sna->render.vertex_used += nbox_this_time * 9;
++		assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 		sna_vertex_acquire__locked(&sna->render);
+ 		sna_vertex_unlock(&sna->render);
+@@ -4795,6 +4864,7 @@ gen3_render_composite_spans_boxes__thread(struct sna *sna,
+ 
+ 		v = sna->render.vertices + sna->render.vertex_used;
+ 		sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;
++		assert(sna->render.vertex_used <= sna->render.vertex_size);
+ 
+ 		sna_vertex_acquire__locked(&sna->render);
+ 		sna_vertex_unlock(&sna->render);
+@@ -5436,17 +5506,7 @@ gen3_render_video(struct sna *sna,
+ 		pix_yoff = -dstRegion->extents.y1;
+ 		copy = 1;
+ 	} else {
+-		/* Set up the offset for translating from the given region
+-		 * (in screen coordinates) to the backing pixmap.
+-		 */
+-#ifdef COMPOSITE
+-		pix_xoff = -pixmap->screen_x + pixmap->drawable.x;
+-		pix_yoff = -pixmap->screen_y + pixmap->drawable.y;
+-#else
+-		pix_xoff = 0;
+-		pix_yoff = 0;
+-#endif
+-
++		pix_xoff = pix_yoff = 0;
+ 		dst_width  = pixmap->drawable.width;
+ 		dst_height = pixmap->drawable.height;
+ 	}
+@@ -5502,16 +5562,9 @@ gen3_render_video(struct sna *sna,
+ 	} while (nbox);
+ 
+ 	if (copy) {
+-#ifdef COMPOSITE
+-		pix_xoff = -pixmap->screen_x + pixmap->drawable.x;
+-		pix_yoff = -pixmap->screen_y + pixmap->drawable.y;
+-#else
+-		pix_xoff = 0;
+-		pix_yoff = 0;
+-#endif
+ 		sna_blt_copy_boxes(sna, GXcopy,
+ 				   dst_bo, -dstRegion->extents.x1, -dstRegion->extents.y1,
+-				   priv->gpu_bo, pix_xoff, pix_yoff,
++				   priv->gpu_bo, 0, 0,
+ 				   pixmap->drawable.bitsPerPixel,
+ 				   region_rects(dstRegion),
+ 				   region_num_rects(dstRegion));
+@@ -5519,21 +5572,8 @@ gen3_render_video(struct sna *sna,
+ 		kgem_bo_destroy(&sna->kgem, dst_bo);
+ 	}
+ 
+-	if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
+-		if ((pix_xoff | pix_yoff) == 0) {
+-			sna_damage_add(&priv->gpu_damage, dstRegion);
+-			sna_damage_subtract(&priv->cpu_damage, dstRegion);
+-		} else {
+-			sna_damage_add_boxes(&priv->gpu_damage,
+-					     region_rects(dstRegion),
+-					     region_num_rects(dstRegion),
+-					     pix_xoff, pix_yoff);
+-			sna_damage_subtract_boxes(&priv->cpu_damage,
+-						  region_rects(dstRegion),
+-						  region_num_rects(dstRegion),
+-						  pix_xoff, pix_yoff);
+-		}
+-	}
++	if (!DAMAGE_IS_ALL(priv->gpu_damage))
++		sna_damage_add(&priv->gpu_damage, dstRegion);
+ 
+ 	return true;
+ }
+diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
+index 6c2d3808..72a98aee 100644
+--- a/src/sna/gen4_render.c
++++ b/src/sna/gen4_render.c
+@@ -1405,8 +1405,8 @@ gen4_render_video(struct sna *sna,
+ 	int src_height = frame->src.y2 - frame->src.y1;
+ 	float src_offset_x, src_offset_y;
+ 	float src_scale_x, src_scale_y;
+-	int nbox, pix_xoff, pix_yoff;
+ 	const BoxRec *box;
++	int nbox;
+ 
+ 	DBG(("%s: %dx%d -> %dx%d\n", __FUNCTION__,
+ 	     src_width, src_height, dst_width, dst_height));
+@@ -1445,17 +1445,6 @@ gen4_render_video(struct sna *sna,
+ 	gen4_align_vertex(sna, &tmp);
+ 	gen4_video_bind_surfaces(sna, &tmp);
+ 
+-	/* Set up the offset for translating from the given region (in screen
+-	 * coordinates) to the backing pixmap.
+-	 */
+-#ifdef COMPOSITE
+-	pix_xoff = -pixmap->screen_x + pixmap->drawable.x;
+-	pix_yoff = -pixmap->screen_y + pixmap->drawable.y;
+-#else
+-	pix_xoff = 0;
+-	pix_yoff = 0;
+-#endif
+-
+ 	src_scale_x = (float)src_width / dst_width / frame->width;
+ 	src_offset_x = (float)frame->src.x1 / frame->width - dstRegion->extents.x1 * src_scale_x;
+ 
+@@ -1473,34 +1462,26 @@ gen4_render_video(struct sna *sna,
+ 		nbox -= n;
+ 
+ 		do {
+-			BoxRec r;
+-
+-			r.x1 = box->x1 + pix_xoff;
+-			r.x2 = box->x2 + pix_xoff;
+-			r.y1 = box->y1 + pix_yoff;
+-			r.y2 = box->y2 + pix_yoff;
+-
+-			OUT_VERTEX(r.x2, r.y2);
++			OUT_VERTEX(box->x2, box->y2);
+ 			OUT_VERTEX_F(box->x2 * src_scale_x + src_offset_x);
+ 			OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
+ 
+-			OUT_VERTEX(r.x1, r.y2);
++			OUT_VERTEX(box->x1, box->y2);
+ 			OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
+ 			OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
+ 
+-			OUT_VERTEX(r.x1, r.y1);
++			OUT_VERTEX(box->x1, box->y1);
+ 			OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
+ 			OUT_VERTEX_F(box->y1 * src_scale_y + src_offset_y);
+ 
+-			if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
+-				sna_damage_add_box(&priv->gpu_damage, &r);
+-				sna_damage_subtract_box(&priv->cpu_damage, &r);
+-			}
+ 			box++;
+ 		} while (--n);
+ 	} while (nbox);
+ 	gen4_vertex_flush(sna);
+ 
++	if (!DAMAGE_IS_ALL(priv->gpu_damage))
++		sna_damage_add(&priv->gpu_damage, dstRegion);
++
+ 	return true;
+ }
+ 
+@@ -1585,12 +1566,14 @@ gen4_composite_picture(struct sna *sna,
+ 		if (channel->repeat &&
+ 		    (x >= 0 &&
+ 		     y >= 0 &&
+-		     x + w < pixmap->drawable.width &&
+-		     y + h < pixmap->drawable.height)) {
++		     x + w <= pixmap->drawable.width &&
++		     y + h <= pixmap->drawable.height)) {
+ 			struct sna_pixmap *priv = sna_pixmap(pixmap);
+ 			if (priv && priv->clear) {
+ 				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
+-				return gen4_channel_init_solid(sna, channel, priv->clear_color);
++				return gen4_channel_init_solid(sna, channel,
++							       solid_color(picture->format,
++									   priv->clear_color));
+ 			}
+ 		}
+ 	} else
+@@ -1664,7 +1647,9 @@ gen4_composite_set_target(struct sna *sna,
+ 	} else
+ 		sna_render_picture_extents(dst, &box);
+ 
+-	hint = PREFER_GPU | FORCE_GPU | RENDER_GPU;
++	hint = PREFER_GPU | RENDER_GPU;
++	if (!need_tiling(sna, op->dst.width, op->dst.height))
++		hint |= FORCE_GPU;
+ 	if (!partial) {
+ 		hint |= IGNORE_DAMAGE;
+ 		if (w == op->dst.width && h == op->dst.height)
+@@ -2738,6 +2723,20 @@ gen4_render_fill_boxes(struct sna *sna,
+ 	tmp.dst.format = format;
+ 	tmp.dst.bo = dst_bo;
+ 
++	sna_render_composite_redirect_init(&tmp);
++	if (too_large(dst->width, dst->height)) {
++		BoxRec extents;
++
++		boxes_extents(box, n, &extents);
++		if (!sna_render_composite_redirect(sna, &tmp,
++						   extents.x1, extents.y1,
++						   extents.x2 - extents.x1,
++						   extents.y2 - extents.y1,
++						   n > 1))
++			return sna_tiling_fill_boxes(sna, op, format, color,
++						     dst, dst_bo, box, n);
++	}
++
+ 	gen4_channel_init_solid(sna, &tmp.src, pixel);
+ 
+ 	tmp.is_affine = true;
+@@ -2748,8 +2747,10 @@ gen4_render_fill_boxes(struct sna *sna,
+ 
+ 	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
+ 		kgem_submit(&sna->kgem);
+-		if (!kgem_check_bo(&sna->kgem, dst_bo, NULL))
++		if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
++			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
+ 			return false;
++		}
+ 	}
+ 
+ 	gen4_align_vertex(sna, &tmp);
+@@ -2765,6 +2766,7 @@ gen4_render_fill_boxes(struct sna *sna,
+ 
+ 	gen4_vertex_flush(sna);
+ 	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
++	sna_render_composite_redirect_done(sna, &tmp);
+ 	return true;
+ }
+ 
+diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
+index 37cf1ff9..fb3e79bf 100644
+--- a/src/sna/gen5_render.c
++++ b/src/sna/gen5_render.c
+@@ -1355,8 +1355,8 @@ gen5_render_video(struct sna *sna,
+ 	int src_height = frame->src.y2 - frame->src.y1;
+ 	float src_offset_x, src_offset_y;
+ 	float src_scale_x, src_scale_y;
+-	int nbox, pix_xoff, pix_yoff;
+ 	const BoxRec *box;
++	int nbox;
+ 
+ 	DBG(("%s: %dx%d -> %dx%d\n", __FUNCTION__,
+ 	     src_width, src_height, dst_width, dst_height));
+@@ -1395,17 +1395,6 @@ gen5_render_video(struct sna *sna,
+ 	gen5_align_vertex(sna, &tmp);
+ 	gen5_video_bind_surfaces(sna, &tmp);
+ 
+-	/* Set up the offset for translating from the given region (in screen
+-	 * coordinates) to the backing pixmap.
+-	 */
+-#ifdef COMPOSITE
+-	pix_xoff = -pixmap->screen_x + pixmap->drawable.x;
+-	pix_yoff = -pixmap->screen_y + pixmap->drawable.y;
+-#else
+-	pix_xoff = 0;
+-	pix_yoff = 0;
+-#endif
+-
+ 	src_scale_x = (float)src_width / dst_width / frame->width;
+ 	src_offset_x = (float)frame->src.x1 / frame->width - dstRegion->extents.x1 * src_scale_x;
+ 
+@@ -1415,35 +1404,27 @@ gen5_render_video(struct sna *sna,
+ 	box = region_rects(dstRegion);
+ 	nbox = region_num_rects(dstRegion);
+ 	while (nbox--) {
+-		BoxRec r;
+-
+-		r.x1 = box->x1 + pix_xoff;
+-		r.x2 = box->x2 + pix_xoff;
+-		r.y1 = box->y1 + pix_yoff;
+-		r.y2 = box->y2 + pix_yoff;
+-
+ 		gen5_get_rectangles(sna, &tmp, 1, gen5_video_bind_surfaces);
+ 
+-		OUT_VERTEX(r.x2, r.y2);
++		OUT_VERTEX(box->x2, box->y2);
+ 		OUT_VERTEX_F(box->x2 * src_scale_x + src_offset_x);
+ 		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
+ 
+-		OUT_VERTEX(r.x1, r.y2);
++		OUT_VERTEX(box->x1, box->y2);
+ 		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
+ 		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
+ 
+-		OUT_VERTEX(r.x1, r.y1);
++		OUT_VERTEX(box->x1, box->y1);
+ 		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
+ 		OUT_VERTEX_F(box->y1 * src_scale_y + src_offset_y);
+ 
+-		if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
+-			sna_damage_add_box(&priv->gpu_damage, &r);
+-			sna_damage_subtract_box(&priv->cpu_damage, &r);
+-		}
+ 		box++;
+ 	}
+-
+ 	gen4_vertex_flush(sna);
++
++	if (!DAMAGE_IS_ALL(priv->gpu_damage))
++		sna_damage_add(&priv->gpu_damage, dstRegion);
++
+ 	return true;
+ }
+ 
+@@ -1524,12 +1505,12 @@ gen5_composite_picture(struct sna *sna,
+ 		if (channel->repeat ||
+ 		    (x >= 0 &&
+ 		     y >= 0 &&
+-		     x + w < pixmap->drawable.width &&
+-		     y + h < pixmap->drawable.height)) {
++		     x + w <= pixmap->drawable.width &&
++		     y + h <= pixmap->drawable.height)) {
+ 			struct sna_pixmap *priv = sna_pixmap(pixmap);
+ 			if (priv && priv->clear) {
+ 				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
+-				return gen4_channel_init_solid(sna, channel, priv->clear_color);
++				return gen4_channel_init_solid(sna, channel, solid_color(picture->format, priv->clear_color));
+ 			}
+ 		}
+ 	} else
+@@ -1618,7 +1599,9 @@ gen5_composite_set_target(struct sna *sna,
+ 	} else
+ 		sna_render_picture_extents(dst, &box);
+ 
+-	hint = PREFER_GPU | FORCE_GPU | RENDER_GPU;
++	hint = PREFER_GPU | RENDER_GPU;
++	if (!need_tiling(sna, op->dst.width, op->dst.height))
++		hint |= FORCE_GPU;
+ 	if (!partial) {
+ 		hint |= IGNORE_DAMAGE;
+ 		if (w == op->dst.width && h == op->dst.height)
+@@ -2734,6 +2717,19 @@ gen5_render_fill_boxes(struct sna *sna,
+ 	tmp.dst.format = format;
+ 	tmp.dst.bo = dst_bo;
+ 
++	if (too_large(dst->width, dst->height)) {
++		BoxRec extents;
++
++		boxes_extents(box, n, &extents);
++		if (!sna_render_composite_redirect(sna, &tmp,
++						   extents.x1, extents.y1,
++						   extents.x2 - extents.x1,
++						   extents.y2 - extents.y1,
++						   n > 1))
++			return sna_tiling_fill_boxes(sna, op, format, color,
++						     dst, dst_bo, box, n);
++	}
++
+ 	tmp.src.bo = sna_render_get_solid(sna, pixel);
+ 	tmp.src.filter = SAMPLER_FILTER_NEAREST;
+ 	tmp.src.repeat = SAMPLER_EXTEND_REPEAT;
+@@ -2780,6 +2776,7 @@ gen5_render_fill_boxes(struct sna *sna,
+ 
+ 	gen4_vertex_flush(sna);
+ 	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
++	sna_render_composite_redirect_done(sna, &tmp);
+ 	return true;
+ }
+ 
+diff --git a/src/sna/gen6_common.h b/src/sna/gen6_common.h
+index 6668620b..b53ec0c9 100644
+--- a/src/sna/gen6_common.h
++++ b/src/sna/gen6_common.h
+@@ -30,8 +30,8 @@
+ 
+ #include "sna.h"
+ 
+-#define NO_RING_SWITCH 0
+-#define PREFER_RENDER 0
++#define NO_RING_SWITCH(sna) (!(sna)->kgem.has_semaphores)
++#define PREFER_RENDER 0 /* -1 -> BLT, 1 -> RENDER */
+ 
+ static inline bool is_uncached(struct sna *sna,
+ 			       struct kgem_bo *bo)
+@@ -46,40 +46,28 @@ inline static bool can_switch_to_blt(struct sna *sna,
+ 	if (sna->kgem.ring != KGEM_RENDER)
+ 		return true;
+ 
+-	if (NO_RING_SWITCH)
+-		return false;
+-
+-	if (!sna->kgem.has_semaphores)
+-		return false;
+-
+-	if (flags & COPY_LAST)
+-		return true;
+-
+ 	if (bo && RQ_IS_BLT(bo->rq))
+ 		return true;
+ 
+-	if (sna->render_state.gt < 2)
+-		return true;
++	if (bo && bo->tiling == I915_TILING_Y)
++		return false;
+ 
+-	return kgem_ring_is_idle(&sna->kgem, KGEM_BLT);
+-}
++	if (bo && !kgem_bo_can_blt(&sna->kgem, bo))
++		return false;
+ 
+-inline static bool can_switch_to_render(struct sna *sna,
+-					struct kgem_bo *bo)
+-{
+-	if (sna->kgem.ring == KGEM_RENDER)
++	if (sna->render_state.gt < 2)
+ 		return true;
+ 
+-	if (NO_RING_SWITCH)
++	if (bo && RQ_IS_RENDER(bo->rq))
+ 		return false;
+ 
+-	if (!sna->kgem.has_semaphores)
++	if (NO_RING_SWITCH(sna))
+ 		return false;
+ 
+-	if (bo && !RQ_IS_BLT(bo->rq) && !is_uncached(sna, bo))
++	if (flags & COPY_LAST)
+ 		return true;
+ 
+-	return !kgem_ring_is_idle(&sna->kgem, KGEM_RENDER);
++	return kgem_ring_is_idle(&sna->kgem, KGEM_BLT);
+ }
+ 
+ static inline bool untiled_tlb_miss(struct kgem_bo *bo)
+@@ -90,57 +78,95 @@ static inline bool untiled_tlb_miss(struct kgem_bo *bo)
+ 	return bo->tiling == I915_TILING_NONE && bo->pitch >= 4096;
+ }
+ 
+-static int prefer_blt_bo(struct sna *sna, struct kgem_bo *bo)
++static int prefer_blt_bo(struct sna *sna,
++			 struct kgem_bo *src,
++			 struct kgem_bo *dst)
+ {
++	assert(dst != NULL);
++
+ 	if (PREFER_RENDER)
+ 		return PREFER_RENDER < 0;
+ 
+-	if (bo->rq)
+-		return RQ_IS_BLT(bo->rq);
++	if (dst->rq)
++		return RQ_IS_BLT(dst->rq);
+ 
+ 	if (sna->flags & SNA_POWERSAVE)
+ 		return true;
+ 
+-	return bo->tiling == I915_TILING_NONE || is_uncached(sna, bo);
+-}
++	if (src) {
++		if (sna->render_state.gt > 1)
++			return false;
+ 
+-inline static bool force_blt_ring(struct sna *sna)
+-{
+-	if (sna->flags & SNA_POWERSAVE)
++		if (src->rq)
++			return RQ_IS_BLT(src->rq);
++
++		if (src->tiling == I915_TILING_Y)
++			return false;
++        } else {
++                if (sna->render_state.gt > 2)
++                        return false;
++        }
++
++	if (sna->render_state.gt < 2)
+ 		return true;
+ 
++	return dst->tiling == I915_TILING_NONE || is_uncached(sna, dst);
++}
++
++inline static bool force_blt_ring(struct sna *sna, struct kgem_bo *bo)
++{
+ 	if (sna->kgem.mode == KGEM_RENDER)
+ 		return false;
+ 
++	if (NO_RING_SWITCH(sna))
++		return sna->kgem.ring == KGEM_BLT;
++
++	if (bo->tiling == I915_TILING_Y)
++		return false;
++
++	if (sna->flags & SNA_POWERSAVE)
++		return true;
++
+ 	if (sna->render_state.gt < 2)
+ 		return true;
+ 
+ 	return false;
+ }
+ 
+-inline static bool prefer_blt_ring(struct sna *sna,
+-				   struct kgem_bo *bo,
+-				   unsigned flags)
++nonnull inline static bool
++prefer_blt_ring(struct sna *sna, struct kgem_bo *bo, unsigned flags)
+ {
+ 	if (PREFER_RENDER)
+ 		return PREFER_RENDER < 0;
+ 
+-	assert(!force_blt_ring(sna));
+-	assert(!kgem_bo_is_render(bo));
++	assert(!force_blt_ring(sna, bo));
++	assert(!kgem_bo_is_render(bo) || NO_RING_SWITCH(sna));
++
++	if (kgem_bo_is_blt(bo))
++		return true;
+ 
+ 	return can_switch_to_blt(sna, bo, flags);
+ }
+ 
+-inline static bool prefer_render_ring(struct sna *sna,
+-				      struct kgem_bo *bo)
++nonnull inline static bool
++prefer_render_ring(struct sna *sna, struct kgem_bo *bo)
+ {
++	if (sna->kgem.ring == KGEM_RENDER)
++		return true;
++
++	if (sna->kgem.ring != KGEM_NONE && NO_RING_SWITCH(sna))
++                return false;
++
++	if (kgem_bo_is_render(bo))
++		return true;
++
+ 	if (sna->flags & SNA_POWERSAVE)
+ 		return false;
+ 
+-	if (sna->render_state.gt < 2)
+-		return false;
++	if (!prefer_blt_bo(sna, NULL, bo))
++		return true;
+ 
+-	return can_switch_to_render(sna, bo);
++	return !kgem_ring_is_idle(&sna->kgem, KGEM_RENDER);
+ }
+ 
+ inline static bool
+@@ -153,25 +179,20 @@ prefer_blt_composite(struct sna *sna, struct sna_composite_op *tmp)
+ 	    untiled_tlb_miss(tmp->src.bo))
+ 		return true;
+ 
+-	if (force_blt_ring(sna))
++	if (force_blt_ring(sna, tmp->dst.bo))
+ 		return true;
+ 
+-	if (kgem_bo_is_render(tmp->dst.bo) ||
+-	    kgem_bo_is_render(tmp->src.bo))
+-		return false;
+-
+ 	if (prefer_render_ring(sna, tmp->dst.bo))
+ 		return false;
+ 
+ 	if (!prefer_blt_ring(sna, tmp->dst.bo, 0))
+ 		return false;
+ 
+-	return prefer_blt_bo(sna, tmp->dst.bo) || prefer_blt_bo(sna, tmp->src.bo);
++	return prefer_blt_bo(sna, tmp->src.bo, tmp->dst.bo);
+ }
+ 
+-static inline bool prefer_blt_fill(struct sna *sna,
+-				   struct kgem_bo *bo,
+-				   unsigned flags)
++nonnull static inline bool
++prefer_blt_fill(struct sna *sna, struct kgem_bo *bo, unsigned flags)
+ {
+ 	if (PREFER_RENDER)
+ 		return PREFER_RENDER < 0;
+@@ -179,24 +200,21 @@ static inline bool prefer_blt_fill(struct sna *sna,
+ 	if (untiled_tlb_miss(bo))
+ 		return true;
+ 
+-	if (force_blt_ring(sna))
++	if (force_blt_ring(sna, bo))
+ 		return true;
+ 
+ 	if ((flags & (FILL_POINTS | FILL_SPANS)) == 0) {
+-		if (kgem_bo_is_render(bo))
+-			return false;
+-
+ 		if (prefer_render_ring(sna, bo))
+ 			return false;
+ 
+ 		if (!prefer_blt_ring(sna, bo, 0))
+ 			return false;
+ 	} else {
+-	    if (can_switch_to_blt(sna, bo, 0))
++	    if (can_switch_to_blt(sna, bo, COPY_LAST))
+ 		    return true;
+ 	}
+ 
+-	return prefer_blt_bo(sna, bo);
++	return prefer_blt_bo(sna, NULL, bo);
+ }
+ 
+ void gen6_render_context_switch(struct kgem *kgem, int new_mode);
+diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
+index 25044685..6b69f216 100644
+--- a/src/sna/gen6_render.c
++++ b/src/sna/gen6_render.c
+@@ -1633,9 +1633,9 @@ gen6_render_video(struct sna *sna,
+ 	int src_height = frame->src.y2 - frame->src.y1;
+ 	float src_offset_x, src_offset_y;
+ 	float src_scale_x, src_scale_y;
+-	int nbox, pix_xoff, pix_yoff;
+ 	unsigned filter;
+ 	const BoxRec *box;
++	int nbox;
+ 
+ 	DBG(("%s: src=(%d, %d), dst=(%d, %d), %dx[(%d, %d), (%d, %d)...]\n",
+ 	     __FUNCTION__,
+@@ -1686,17 +1686,6 @@ gen6_render_video(struct sna *sna,
+ 	gen6_align_vertex(sna, &tmp);
+ 	gen6_emit_video_state(sna, &tmp);
+ 
+-	/* Set up the offset for translating from the given region (in screen
+-	 * coordinates) to the backing pixmap.
+-	 */
+-#ifdef COMPOSITE
+-	pix_xoff = -pixmap->screen_x + pixmap->drawable.x;
+-	pix_yoff = -pixmap->screen_y + pixmap->drawable.y;
+-#else
+-	pix_xoff = 0;
+-	pix_yoff = 0;
+-#endif
+-
+ 	src_scale_x = (float)src_width / dst_width / frame->width;
+ 	src_offset_x = (float)frame->src.x1 / frame->width - dstRegion->extents.x1 * src_scale_x;
+ 
+@@ -1706,35 +1695,27 @@ gen6_render_video(struct sna *sna,
+ 	box = region_rects(dstRegion);
+ 	nbox = region_num_rects(dstRegion);
+ 	while (nbox--) {
+-		BoxRec r;
+-
+-		r.x1 = box->x1 + pix_xoff;
+-		r.x2 = box->x2 + pix_xoff;
+-		r.y1 = box->y1 + pix_yoff;
+-		r.y2 = box->y2 + pix_yoff;
+-
+ 		gen6_get_rectangles(sna, &tmp, 1, gen6_emit_video_state);
+ 
+-		OUT_VERTEX(r.x2, r.y2);
++		OUT_VERTEX(box->x2, box->y2);
+ 		OUT_VERTEX_F(box->x2 * src_scale_x + src_offset_x);
+ 		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
+ 
+-		OUT_VERTEX(r.x1, r.y2);
++		OUT_VERTEX(box->x1, box->y2);
+ 		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
+ 		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
+ 
+-		OUT_VERTEX(r.x1, r.y1);
++		OUT_VERTEX(box->x1, box->y1);
+ 		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
+ 		OUT_VERTEX_F(box->y1 * src_scale_y + src_offset_y);
+ 
+-		if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
+-			sna_damage_add_box(&priv->gpu_damage, &r);
+-			sna_damage_subtract_box(&priv->cpu_damage, &r);
+-		}
+ 		box++;
+ 	}
+-
+ 	gen4_vertex_flush(sna);
++
++	if (!DAMAGE_IS_ALL(priv->gpu_damage))
++		sna_damage_add(&priv->gpu_damage, dstRegion);
++
+ 	return true;
+ }
+ 
+@@ -1815,12 +1796,12 @@ gen6_composite_picture(struct sna *sna,
+ 		if (channel->repeat &&
+ 		    (x >= 0 &&
+ 		     y >= 0 &&
+-		     x + w < pixmap->drawable.width &&
+-		     y + h < pixmap->drawable.height)) {
++		     x + w <= pixmap->drawable.width &&
++		     y + h <= pixmap->drawable.height)) {
+ 			struct sna_pixmap *priv = sna_pixmap(pixmap);
+ 			if (priv && priv->clear) {
+ 				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
+-				return gen4_channel_init_solid(sna, channel, priv->clear_color);
++				return gen4_channel_init_solid(sna, channel, solid_color(picture->format, priv->clear_color));
+ 			}
+ 		}
+ 	} else
+@@ -1927,7 +1908,9 @@ gen6_composite_set_target(struct sna *sna,
+ 	} else
+ 		sna_render_picture_extents(dst, &box);
+ 
+-	hint = PREFER_GPU | FORCE_GPU | RENDER_GPU;
++	hint = PREFER_GPU | RENDER_GPU;
++	if (!need_tiling(sna, op->dst.width, op->dst.height))
++		hint |= FORCE_GPU;
+ 	if (!partial) {
+ 		hint |= IGNORE_DAMAGE;
+ 		if (w == op->dst.width && h == op->dst.height)
+@@ -1965,46 +1948,77 @@ gen6_composite_set_target(struct sna *sna,
+ 
+ static bool
+ try_blt(struct sna *sna,
+-	PicturePtr dst, PicturePtr src,
+-	int width, int height)
++	uint8_t op,
++	PicturePtr src,
++	PicturePtr mask,
++	PicturePtr dst,
++	int16_t src_x, int16_t src_y,
++	int16_t msk_x, int16_t msk_y,
++	int16_t dst_x, int16_t dst_y,
++	int16_t width, int16_t height,
++	unsigned flags,
++	struct sna_composite_op *tmp)
+ {
+ 	struct kgem_bo *bo;
+ 
+ 	if (sna->kgem.mode == KGEM_BLT) {
+ 		DBG(("%s: already performing BLT\n", __FUNCTION__));
+-		return true;
++		goto execute;
+ 	}
+ 
+ 	if (too_large(width, height)) {
+ 		DBG(("%s: operation too large for 3D pipe (%d, %d)\n",
+ 		     __FUNCTION__, width, height));
+-		return true;
++		goto execute;
+ 	}
+ 
+ 	bo = __sna_drawable_peek_bo(dst->pDrawable);
+ 	if (bo == NULL)
+-		return true;
+-	if (bo->rq)
+-		return RQ_IS_BLT(bo->rq);
++		goto execute;
++
++	if (untiled_tlb_miss(bo))
++		goto execute;
++
++	if (bo->rq) {
++		if (RQ_IS_BLT(bo->rq))
++			goto execute;
++
++		return false;
++	}
++
++	if (bo->tiling == I915_TILING_Y)
++		goto upload;
++
++	if (src->pDrawable == dst->pDrawable &&
++	    can_switch_to_blt(sna, bo, 0))
++		goto execute;
+ 
+ 	if (sna_picture_is_solid(src, NULL) && can_switch_to_blt(sna, bo, 0))
+-		return true;
++		goto execute;
+ 
+ 	if (src->pDrawable) {
+-		bo = __sna_drawable_peek_bo(src->pDrawable);
+-		if (bo == NULL)
+-			return true;
++		struct kgem_bo *s = __sna_drawable_peek_bo(src->pDrawable);
++		if (s == NULL)
++			goto execute;
+ 
+-		if (prefer_blt_bo(sna, bo))
+-			return true;
++		if (prefer_blt_bo(sna, s, bo))
++			goto execute;
+ 	}
+ 
+ 	if (sna->kgem.ring == KGEM_BLT) {
+ 		DBG(("%s: already performing BLT\n", __FUNCTION__));
+-		return true;
++		goto execute;
+ 	}
+ 
+-	return false;
++upload:
++	flags |= COMPOSITE_UPLOAD;
++execute:
++	return sna_blt_composite(sna, op,
++				 src, dst,
++				 src_x, src_y,
++				 dst_x, dst_y,
++				 width, height,
++				 flags, tmp);
+ }
+ 
+ static bool
+@@ -2234,13 +2248,13 @@ gen6_render_composite(struct sna *sna,
+ 	     width, height, sna->kgem.ring));
+ 
+ 	if (mask == NULL &&
+-	    try_blt(sna, dst, src, width, height) &&
+-	    sna_blt_composite(sna, op,
+-			      src, dst,
+-			      src_x, src_y,
+-			      dst_x, dst_y,
+-			      width, height,
+-			      flags, tmp))
++	    try_blt(sna, op,
++		    src, mask, dst,
++		    src_x, src_y,
++		    msk_x, msk_y,
++		    dst_x, dst_y,
++		    width, height,
++		    flags, tmp))
+ 		return true;
+ 
+ 	if (gen6_composite_fallback(sna, src, mask, dst))
+@@ -2676,27 +2690,35 @@ static inline bool prefer_blt_copy(struct sna *sna,
+ 	if (sna->kgem.ring == KGEM_BLT)
+ 		return true;
+ 
+-	if (src_bo == dst_bo && can_switch_to_blt(sna, dst_bo, flags))
++	if (flags & COPY_DRI && !sna->kgem.has_semaphores)
++		return false;
++
++	if ((flags & COPY_SMALL || src_bo == dst_bo) &&
++	    can_switch_to_blt(sna, dst_bo, flags))
+ 		return true;
+ 
+ 	if (untiled_tlb_miss(src_bo) ||
+ 	    untiled_tlb_miss(dst_bo))
+ 		return true;
+ 
+-	if (force_blt_ring(sna))
++	if (force_blt_ring(sna, dst_bo))
+ 		return true;
+ 
+ 	if (kgem_bo_is_render(dst_bo) ||
+ 	    kgem_bo_is_render(src_bo))
+ 		return false;
+ 
++	if (flags & COPY_LAST &&
++            can_switch_to_blt(sna, dst_bo, flags))
++		return true;
++
+ 	if (prefer_render_ring(sna, dst_bo))
+ 		return false;
+ 
+ 	if (!prefer_blt_ring(sna, dst_bo, flags))
+ 		return false;
+ 
+-	return prefer_blt_bo(sna, src_bo) || prefer_blt_bo(sna, dst_bo);
++	return prefer_blt_bo(sna, src_bo, dst_bo);
+ }
+ 
+ static bool
+@@ -2758,8 +2780,7 @@ fallback_blt:
+ 		assert(src->depth == dst->depth);
+ 		assert(src->width == dst->width);
+ 		assert(src->height == dst->height);
+-		return sna_render_copy_boxes__overlap(sna, alu,
+-						      src, src_bo,
++		return sna_render_copy_boxes__overlap(sna, alu, dst, dst_bo,
+ 						      src_dx, src_dy,
+ 						      dst_dx, dst_dy,
+ 						      box, n, &extents);
+diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
+index 2ecfd641..aabb8693 100644
+--- a/src/sna/gen7_render.c
++++ b/src/sna/gen7_render.c
+@@ -60,8 +60,6 @@
+ #define NO_FILL_ONE 0
+ #define NO_FILL_CLEAR 0
+ 
+-#define NO_RING_SWITCH 0
+-
+ #define USE_8_PIXEL_DISPATCH 1
+ #define USE_16_PIXEL_DISPATCH 1
+ #define USE_32_PIXEL_DISPATCH 0
+@@ -149,7 +147,7 @@ static const struct gt_info hsw_gt1_info = {
+ 	.max_vs_threads = 70,
+ 	.max_gs_threads = 70,
+ 	.max_wm_threads =
+-		(102 - 1) << HSW_PS_MAX_THREADS_SHIFT |
++		(70 - 1) << HSW_PS_MAX_THREADS_SHIFT |
+ 		1 << HSW_PS_SAMPLE_MASK_SHIFT,
+ 	.urb = { 128, 640, 256, 8 },
+ 	.gt = 1,
+@@ -209,6 +207,12 @@ static const uint32_t ps_kernel_planar[][4] = {
+ #include "exa_wm_write.g7b"
+ };
+ 
++static const uint32_t ps_kernel_rgb[][4] = {
++#include "exa_wm_src_affine.g7b"
++#include "exa_wm_src_sample_argb.g7b"
++#include "exa_wm_write.g7b"
++};
++
+ #define KERNEL(kernel_enum, kernel, num_surfaces) \
+     [GEN7_WM_KERNEL_##kernel_enum] = {#kernel_enum, kernel, sizeof(kernel), num_surfaces}
+ #define NOKERNEL(kernel_enum, func, num_surfaces) \
+@@ -218,7 +222,7 @@ static const struct wm_kernel_info {
+ 	const void *data;
+ 	unsigned int size;
+ 	int num_surfaces;
+-} wm_kernels[] = {
++} wm_kernels[GEN7_WM_KERNEL_COUNT] = {
+ 	NOKERNEL(NOMASK, brw_wm_kernel__affine, 2),
+ 	NOKERNEL(NOMASK_P, brw_wm_kernel__projective, 2),
+ 
+@@ -236,6 +240,7 @@ static const struct wm_kernel_info {
+ 
+ 	KERNEL(VIDEO_PLANAR, ps_kernel_planar, 7),
+ 	KERNEL(VIDEO_PACKED, ps_kernel_packed, 2),
++	KERNEL(VIDEO_RGB, ps_kernel_rgb, 2),
+ };
+ #undef KERNEL
+ 
+@@ -810,7 +815,7 @@ gen7_emit_cc(struct sna *sna, uint32_t blend_offset)
+ 
+ 	DBG(("%s: blend = %x\n", __FUNCTION__, blend_offset));
+ 
+-	/* XXX can have upto 8 blend states preload, selectable via
++	/* XXX can have up to 8 blend states preload, selectable via
+ 	 * Render Target Index. What other side-effects of Render Target Index?
+ 	 */
+ 
+@@ -1792,7 +1797,9 @@ static void gen7_emit_video_state(struct sna *sna,
+ 			frame->pitch[0];
+ 		n_src = 6;
+ 	} else {
+-		if (frame->id == FOURCC_UYVY)
++		if (frame->id == FOURCC_RGB888)
++			src_surf_format = GEN7_SURFACEFORMAT_B8G8R8X8_UNORM;
++		else if (frame->id == FOURCC_UYVY)
+ 			src_surf_format = GEN7_SURFACEFORMAT_YCRCB_SWAPY;
+ 		else
+ 			src_surf_format = GEN7_SURFACEFORMAT_YCRCB_NORMAL;
+@@ -1826,6 +1833,23 @@ static void gen7_emit_video_state(struct sna *sna,
+ 	gen7_emit_state(sna, op, offset | dirty);
+ }
+ 
++static unsigned select_video_kernel(const struct sna_video_frame *frame)
++{
++	switch (frame->id) {
++	case FOURCC_YV12:
++	case FOURCC_I420:
++	case FOURCC_XVMC:
++		return GEN7_WM_KERNEL_VIDEO_PLANAR;
++
++	case FOURCC_RGB888:
++	case FOURCC_RGB565:
++		return GEN7_WM_KERNEL_VIDEO_RGB;
++
++	default:
++		return GEN7_WM_KERNEL_VIDEO_PACKED;
++	}
++}
++
+ static bool
+ gen7_render_video(struct sna *sna,
+ 		  struct sna_video *video,
+@@ -1841,9 +1865,9 @@ gen7_render_video(struct sna *sna,
+ 	int src_height = frame->src.y2 - frame->src.y1;
+ 	float src_offset_x, src_offset_y;
+ 	float src_scale_x, src_scale_y;
+-	int nbox, pix_xoff, pix_yoff;
+ 	unsigned filter;
+ 	const BoxRec *box;
++	int nbox;
+ 
+ 	DBG(("%s: src=(%d, %d), dst=(%d, %d), %dx[(%d, %d), (%d, %d)...]\n",
+ 	     __FUNCTION__,
+@@ -1878,9 +1902,7 @@ gen7_render_video(struct sna *sna,
+ 		GEN7_SET_FLAGS(SAMPLER_OFFSET(filter, SAMPLER_EXTEND_PAD,
+ 					      SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE),
+ 			       NO_BLEND,
+-			       is_planar_fourcc(frame->id) ?
+-			       GEN7_WM_KERNEL_VIDEO_PLANAR :
+-			       GEN7_WM_KERNEL_VIDEO_PACKED,
++			       select_video_kernel(frame),
+ 			       2);
+ 	tmp.priv = frame;
+ 
+@@ -1896,17 +1918,6 @@ gen7_render_video(struct sna *sna,
+ 	gen7_align_vertex(sna, &tmp);
+ 	gen7_emit_video_state(sna, &tmp);
+ 
+-	/* Set up the offset for translating from the given region (in screen
+-	 * coordinates) to the backing pixmap.
+-	 */
+-#ifdef COMPOSITE
+-	pix_xoff = -pixmap->screen_x + pixmap->drawable.x;
+-	pix_yoff = -pixmap->screen_y + pixmap->drawable.y;
+-#else
+-	pix_xoff = 0;
+-	pix_yoff = 0;
+-#endif
+-
+ 	DBG(("%s: src=(%d, %d)x(%d, %d); frame=(%dx%d), dst=(%dx%d)\n",
+ 	     __FUNCTION__,
+ 	     frame->src.x1, frame->src.y1,
+@@ -1928,45 +1939,36 @@ gen7_render_video(struct sna *sna,
+ 	box = region_rects(dstRegion);
+ 	nbox = region_num_rects(dstRegion);
+ 	while (nbox--) {
+-		BoxRec r;
+-
+-		DBG(("%s: dst=(%d, %d), (%d, %d) + (%d, %d); src=(%f, %f), (%f, %f)\n",
++		DBG(("%s: dst=(%d, %d), (%d, %d); src=(%f, %f), (%f, %f)\n",
+ 		     __FUNCTION__,
+ 		     box->x1, box->y1,
+ 		     box->x2, box->y2,
+-		     pix_xoff, pix_yoff,
+ 		     box->x1 * src_scale_x + src_offset_x,
+ 		     box->y1 * src_scale_y + src_offset_y,
+ 		     box->x2 * src_scale_x + src_offset_x,
+ 		     box->y2 * src_scale_y + src_offset_y));
+ 
+-		r.x1 = box->x1 + pix_xoff;
+-		r.x2 = box->x2 + pix_xoff;
+-		r.y1 = box->y1 + pix_yoff;
+-		r.y2 = box->y2 + pix_yoff;
+-
+ 		gen7_get_rectangles(sna, &tmp, 1, gen7_emit_video_state);
+ 
+-		OUT_VERTEX(r.x2, r.y2);
++		OUT_VERTEX(box->x2, box->y2);
+ 		OUT_VERTEX_F(box->x2 * src_scale_x + src_offset_x);
+ 		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
+ 
+-		OUT_VERTEX(r.x1, r.y2);
++		OUT_VERTEX(box->x1, box->y2);
+ 		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
+ 		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
+ 
+-		OUT_VERTEX(r.x1, r.y1);
++		OUT_VERTEX(box->x1, box->y1);
+ 		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
+ 		OUT_VERTEX_F(box->y1 * src_scale_y + src_offset_y);
+ 
+-		if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
+-			sna_damage_add_box(&priv->gpu_damage, &r);
+-			sna_damage_subtract_box(&priv->cpu_damage, &r);
+-		}
+ 		box++;
+ 	}
+-
+ 	gen4_vertex_flush(sna);
++
++	if (!DAMAGE_IS_ALL(priv->gpu_damage))
++		sna_damage_add(&priv->gpu_damage, dstRegion);
++
+ 	return true;
+ }
+ 
+@@ -2048,12 +2050,13 @@ gen7_composite_picture(struct sna *sna,
+ 		if (channel->repeat ||
+ 		    (x >= 0 &&
+ 		     y >= 0 &&
+-		     x + w < pixmap->drawable.width &&
+-		     y + h < pixmap->drawable.height)) {
++		     x + w <= pixmap->drawable.width &&
++		     y + h <= pixmap->drawable.height)) {
+ 			struct sna_pixmap *priv = sna_pixmap(pixmap);
+ 			if (priv && priv->clear) {
+ 				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
+-				return gen4_channel_init_solid(sna, channel, priv->clear_color);
++				return gen4_channel_init_solid(sna, channel,
++							       solid_color(picture->format, priv->clear_color));
+ 			}
+ 		}
+ 	} else
+@@ -2147,7 +2150,9 @@ gen7_composite_set_target(struct sna *sna,
+ 	} else
+ 		sna_render_picture_extents(dst, &box);
+ 
+-	hint = PREFER_GPU | FORCE_GPU | RENDER_GPU;
++	hint = PREFER_GPU | RENDER_GPU;
++	if (!need_tiling(sna, op->dst.width, op->dst.height))
++		hint |= FORCE_GPU;
+ 	if (!partial) {
+ 		hint |= IGNORE_DAMAGE;
+ 		if (w == op->dst.width && h == op->dst.height)
+@@ -2185,46 +2190,78 @@ gen7_composite_set_target(struct sna *sna,
+ 
+ static bool
+ try_blt(struct sna *sna,
+-	PicturePtr dst, PicturePtr src,
+-	int width, int height)
++	uint8_t op,
++	PicturePtr src,
++	PicturePtr mask,
++	PicturePtr dst,
++	int16_t src_x, int16_t src_y,
++	int16_t msk_x, int16_t msk_y,
++	int16_t dst_x, int16_t dst_y,
++	int16_t width, int16_t height,
++	unsigned flags,
++	struct sna_composite_op *tmp)
+ {
+ 	struct kgem_bo *bo;
+ 
+ 	if (sna->kgem.mode == KGEM_BLT) {
+ 		DBG(("%s: already performing BLT\n", __FUNCTION__));
+-		return true;
++		goto execute;
+ 	}
+ 
+ 	if (too_large(width, height)) {
+ 		DBG(("%s: operation too large for 3D pipe (%d, %d)\n",
+ 		     __FUNCTION__, width, height));
+-		return true;
++		goto execute;
+ 	}
+ 
+ 	bo = __sna_drawable_peek_bo(dst->pDrawable);
+ 	if (bo == NULL)
+-		return true;
+-	if (bo->rq)
+-		return RQ_IS_BLT(bo->rq);
++		goto execute;
++
++	if (untiled_tlb_miss(bo))
++		goto execute;
++
++	if (bo->rq) {
++		if (RQ_IS_BLT(bo->rq))
++			goto execute;
++
++		return false;
++	}
++
++	if (bo->tiling == I915_TILING_Y)
++		goto upload;
++
++	if (src->pDrawable == dst->pDrawable &&
++	    (sna->render_state.gt < 3 || width*height < 1024) &&
++	    can_switch_to_blt(sna, bo, 0))
++		goto execute;
+ 
+ 	if (sna_picture_is_solid(src, NULL) && can_switch_to_blt(sna, bo, 0))
+-		return true;
++		goto execute;
+ 
+ 	if (src->pDrawable) {
+-		bo = __sna_drawable_peek_bo(src->pDrawable);
+-		if (bo == NULL)
+-			return true;
++		struct kgem_bo *s = __sna_drawable_peek_bo(src->pDrawable);
++		if (s == NULL)
++			goto upload;
+ 
+-		if (prefer_blt_bo(sna, bo))
+-			return true;
++		if (prefer_blt_bo(sna, s, bo))
++			goto execute;
+ 	}
+ 
+ 	if (sna->kgem.ring == KGEM_BLT) {
+ 		DBG(("%s: already performing BLT\n", __FUNCTION__));
+-		return true;
++		goto execute;
+ 	}
+ 
+-	return false;
++upload:
++	flags |= COMPOSITE_UPLOAD;
++execute:
++	return sna_blt_composite(sna, op,
++				 src, dst,
++				 src_x, src_y,
++				 dst_x, dst_y,
++				 width, height,
++				 flags, tmp);
+ }
+ 
+ static bool
+@@ -2454,13 +2491,13 @@ gen7_render_composite(struct sna *sna,
+ 	     width, height, sna->kgem.mode, sna->kgem.ring));
+ 
+ 	if (mask == NULL &&
+-	    try_blt(sna, dst, src, width, height) &&
+-	    sna_blt_composite(sna, op,
+-			      src, dst,
+-			      src_x, src_y,
+-			      dst_x, dst_y,
+-			      width, height,
+-			      flags, tmp))
++	    try_blt(sna, op,
++		    src, mask, dst,
++		    src_x, src_y,
++		    msk_x, msk_y,
++		    dst_x, dst_y,
++		    width, height,
++		    flags, tmp))
+ 		return true;
+ 
+ 	if (gen7_composite_fallback(sna, src, mask, dst))
+@@ -2878,27 +2915,37 @@ prefer_blt_copy(struct sna *sna,
+ 
+ 	assert((flags & COPY_SYNC) == 0);
+ 
+-	if (src_bo == dst_bo && can_switch_to_blt(sna, dst_bo, flags))
+-		return true;
+-
+ 	if (untiled_tlb_miss(src_bo) ||
+ 	    untiled_tlb_miss(dst_bo))
+ 		return true;
+ 
+-	if (force_blt_ring(sna))
++	if (flags & COPY_DRI && !sna->kgem.has_semaphores)
++		return false;
++
++	if (force_blt_ring(sna, dst_bo))
++		return true;
++
++	if ((flags & COPY_SMALL ||
++	     (sna->render_state.gt < 3 && src_bo == dst_bo)) &&
++            can_switch_to_blt(sna, dst_bo, flags))
+ 		return true;
+ 
+ 	if (kgem_bo_is_render(dst_bo) ||
+ 	    kgem_bo_is_render(src_bo))
+ 		return false;
+ 
++	if (flags & COPY_LAST &&
++	    sna->render_state.gt < 3 &&
++            can_switch_to_blt(sna, dst_bo, flags))
++		return true;
++
+ 	if (prefer_render_ring(sna, dst_bo))
+ 		return false;
+ 
+ 	if (!prefer_blt_ring(sna, dst_bo, flags))
+ 		return false;
+ 
+-	return prefer_blt_bo(sna, src_bo) || prefer_blt_bo(sna, dst_bo);
++	return prefer_blt_bo(sna, src_bo, dst_bo);
+ }
+ 
+ static bool
+@@ -2946,7 +2993,7 @@ fallback_blt:
+ 		     &extents)) {
+ 		bool big = too_large(extents.x2-extents.x1, extents.y2-extents.y1);
+ 
+-		if ((big || can_switch_to_blt(sna, dst_bo, flags)) &&
++		if ((big || !prefer_render_ring(sna, dst_bo)) &&
+ 		    sna_blt_copy_boxes(sna, alu,
+ 				       src_bo, src_dx, src_dy,
+ 				       dst_bo, dst_dx, dst_dy,
+@@ -2961,8 +3008,7 @@ fallback_blt:
+ 		assert(src->depth == dst->depth);
+ 		assert(src->width == dst->width);
+ 		assert(src->height == dst->height);
+-		return sna_render_copy_boxes__overlap(sna, alu,
+-						      src, src_bo,
++		return sna_render_copy_boxes__overlap(sna, alu, dst, dst_bo,
+ 						      src_dx, src_dy,
+ 						      dst_dx, dst_dy,
+ 						      box, n, &extents);
+diff --git a/src/sna/gen8_render.c b/src/sna/gen8_render.c
+index 6eb11452..445983b1 100644
+--- a/src/sna/gen8_render.c
++++ b/src/sna/gen8_render.c
+@@ -106,6 +106,12 @@ static const uint32_t ps_kernel_planar[][4] = {
+ #include "exa_wm_yuv_rgb.g8b"
+ #include "exa_wm_write.g8b"
+ };
++
++static const uint32_t ps_kernel_rgb[][4] = {
++#include "exa_wm_src_affine.g8b"
++#include "exa_wm_src_sample_argb.g8b"
++#include "exa_wm_write.g8b"
++};
+ #endif
+ 
+ #define SURFACE_DW (64 / sizeof(uint32_t));
+@@ -119,7 +125,7 @@ static const struct wm_kernel_info {
+ 	const void *data;
+ 	unsigned int size;
+ 	int num_surfaces;
+-} wm_kernels[] = {
++} wm_kernels[GEN8_WM_KERNEL_COUNT] = {
+ 	NOKERNEL(NOMASK, gen8_wm_kernel__affine, 2),
+ 	NOKERNEL(NOMASK_P, gen8_wm_kernel__projective, 2),
+ 
+@@ -138,6 +144,7 @@ static const struct wm_kernel_info {
+ #if !NO_VIDEO
+ 	KERNEL(VIDEO_PLANAR, ps_kernel_planar, 7),
+ 	KERNEL(VIDEO_PACKED, ps_kernel_packed, 2),
++	KERNEL(VIDEO_RGB, ps_kernel_rgb, 2),
+ #endif
+ };
+ #undef KERNEL
+@@ -205,6 +212,33 @@ static const struct blendinfo {
+ #define OUT_VERTEX(x,y) vertex_emit_2s(sna, x,y)
+ #define OUT_VERTEX_F(v) vertex_emit(sna, v)
+ 
++struct gt_info {
++	const char *name;
++	struct {
++		int max_vs_entries;
++	} urb;
++};
++
++static const struct gt_info bdw_gt_info = {
++	.name = "Broadwell (gen8)",
++	.urb = { .max_vs_entries = 960 },
++};
++
++static bool is_bdw(struct sna *sna)
++{
++	return sna->kgem.gen == 0100;
++}
++
++static const struct gt_info chv_gt_info = {
++	.name = "Cherryview (gen8)",
++	.urb = { .max_vs_entries = 640 },
++};
++
++static bool is_chv(struct sna *sna)
++{
++	return sna->kgem.gen == 0101;
++}
++
+ static inline bool too_large(int width, int height)
+ {
+ 	return width > GEN8_MAX_SIZE || height > GEN8_MAX_SIZE;
+@@ -462,7 +496,7 @@ gen8_emit_urb(struct sna *sna)
+ {
+ 	/* num of VS entries must be divisible by 8 if size < 9 */
+ 	OUT_BATCH(GEN8_3DSTATE_URB_VS | (2 - 2));
+-	OUT_BATCH(960 << URB_ENTRY_NUMBER_SHIFT |
++	OUT_BATCH(sna->render_state.gen8.info->urb.max_vs_entries << URB_ENTRY_NUMBER_SHIFT |
+ 		  (2 - 1) << URB_ENTRY_SIZE_SHIFT |
+ 		  4 << URB_STARTING_ADDRESS_SHIFT);
+ 
+@@ -873,7 +907,7 @@ gen8_emit_cc(struct sna *sna, uint32_t blend)
+ 	assert(blend / GEN8_BLENDFACTOR_COUNT > 0);
+ 	assert(blend % GEN8_BLENDFACTOR_COUNT > 0);
+ 
+-	/* XXX can have upto 8 blend states preload, selectable via
++	/* XXX can have up to 8 blend states preload, selectable via
+ 	 * Render Target Index. What other side-effects of Render Target Index?
+ 	 */
+ 
+@@ -1167,6 +1201,7 @@ gen8_emit_pipe_stall(struct sna *sna)
+ {
+ 	OUT_BATCH(GEN8_PIPE_CONTROL | (6 - 2));
+ 	OUT_BATCH(PIPE_CONTROL_CS_STALL |
++		  PIPE_CONTROL_FLUSH |
+ 		  PIPE_CONTROL_STALL_AT_SCOREBOARD);
+ 	OUT_BATCH64(0);
+ 	OUT_BATCH64(0);
+@@ -1876,12 +1911,12 @@ gen8_composite_picture(struct sna *sna,
+ 		if (channel->repeat ||
+ 		    (x >= 0 &&
+ 		     y >= 0 &&
+-		     x + w < pixmap->drawable.width &&
+-		     y + h < pixmap->drawable.height)) {
++		     x + w <= pixmap->drawable.width &&
++		     y + h <= pixmap->drawable.height)) {
+ 			struct sna_pixmap *priv = sna_pixmap(pixmap);
+ 			if (priv && priv->clear) {
+ 				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
+-				return gen4_channel_init_solid(sna, channel, priv->clear_color);
++				return gen4_channel_init_solid(sna, channel, solid_color(picture->format, priv->clear_color));
+ 			}
+ 		}
+ 	} else
+@@ -1961,7 +1996,9 @@ gen8_composite_set_target(struct sna *sna,
+ 	} else
+ 		sna_render_picture_extents(dst, &box);
+ 
+-	hint = PREFER_GPU | FORCE_GPU | RENDER_GPU;
++	hint = PREFER_GPU | RENDER_GPU;
++	if (!need_tiling(sna, op->dst.width, op->dst.height))
++		hint |= FORCE_GPU;
+ 	if (!partial) {
+ 		hint |= IGNORE_DAMAGE;
+ 		if (w == op->dst.width && h == op->dst.height)
+@@ -2002,46 +2039,78 @@ gen8_composite_set_target(struct sna *sna,
+ 
+ static bool
+ try_blt(struct sna *sna,
+-	PicturePtr dst, PicturePtr src,
+-	int width, int height)
++	uint8_t op,
++	PicturePtr src,
++	PicturePtr mask,
++	PicturePtr dst,
++	int16_t src_x, int16_t src_y,
++	int16_t msk_x, int16_t msk_y,
++	int16_t dst_x, int16_t dst_y,
++	int16_t width, int16_t height,
++	unsigned flags,
++	struct sna_composite_op *tmp)
+ {
+ 	struct kgem_bo *bo;
+ 
+ 	if (sna->kgem.mode == KGEM_BLT) {
+ 		DBG(("%s: already performing BLT\n", __FUNCTION__));
+-		return true;
++		goto execute;
+ 	}
+ 
+ 	if (too_large(width, height)) {
+ 		DBG(("%s: operation too large for 3D pipe (%d, %d)\n",
+ 		     __FUNCTION__, width, height));
+-		return true;
++		goto execute;
+ 	}
+ 
+ 	bo = __sna_drawable_peek_bo(dst->pDrawable);
+ 	if (bo == NULL)
+-		return true;
+-	if (bo->rq)
+-		return RQ_IS_BLT(bo->rq);
++		goto execute;
++
++	if (untiled_tlb_miss(bo))
++		goto execute;
++
++	if (bo->rq) {
++		if (RQ_IS_BLT(bo->rq))
++			goto execute;
++
++		return false;
++	}
++
++	if (bo->tiling == I915_TILING_Y)
++		goto upload;
+ 
+ 	if (sna_picture_is_solid(src, NULL) && can_switch_to_blt(sna, bo, 0))
+-		return true;
++		goto execute;
++
++	if (src->pDrawable == dst->pDrawable &&
++	    (sna->render_state.gt < 3 || width*height < 1024) &&
++	    can_switch_to_blt(sna, bo, 0))
++		goto execute;
+ 
+ 	if (src->pDrawable) {
+-		bo = __sna_drawable_peek_bo(src->pDrawable);
+-		if (bo == NULL)
+-			return true;
++		struct kgem_bo *s = __sna_drawable_peek_bo(src->pDrawable);
++		if (s == NULL)
++			goto upload;
+ 
+-		if (prefer_blt_bo(sna, bo))
+-			return RQ_IS_BLT(bo->rq);
++		if (prefer_blt_bo(sna, s, bo))
++			goto execute;
+ 	}
+ 
+ 	if (sna->kgem.ring == KGEM_BLT) {
+ 		DBG(("%s: already performing BLT\n", __FUNCTION__));
+-		return true;
++		goto execute;
+ 	}
+ 
+-	return false;
++upload:
++	flags |= COMPOSITE_UPLOAD;
++execute:
++	return sna_blt_composite(sna, op,
++				 src, dst,
++				 src_x, src_y,
++				 dst_x, dst_y,
++				 width, height,
++				 flags, tmp);
+ }
+ 
+ static bool
+@@ -2271,13 +2340,13 @@ gen8_render_composite(struct sna *sna,
+ 	     width, height, sna->kgem.mode, sna->kgem.ring));
+ 
+ 	if (mask == NULL &&
+-	    try_blt(sna, dst, src, width, height) &&
+-	    sna_blt_composite(sna, op,
+-			      src, dst,
+-			      src_x, src_y,
+-			      dst_x, dst_y,
+-			      width, height,
+-			      flags, tmp))
++	    try_blt(sna, op,
++		    src, mask, dst,
++		    src_x, src_y,
++		    msk_x, msk_y,
++		    dst_x, dst_y,
++		    width, height,
++		    flags, tmp))
+ 		return true;
+ 
+ 	if (gen8_composite_fallback(sna, src, mask, dst))
+@@ -2700,27 +2769,37 @@ prefer_blt_copy(struct sna *sna,
+ 
+ 	assert((flags & COPY_SYNC) == 0);
+ 
+-	if (src_bo == dst_bo && can_switch_to_blt(sna, dst_bo, flags))
+-		return true;
+-
+ 	if (untiled_tlb_miss(src_bo) ||
+ 	    untiled_tlb_miss(dst_bo))
+ 		return true;
+ 
+-	if (force_blt_ring(sna))
++	if (flags & COPY_DRI && !sna->kgem.has_semaphores)
++		return false;
++
++	if (force_blt_ring(sna, dst_bo))
++		return true;
++
++	if ((flags & COPY_SMALL ||
++	     (sna->render_state.gt < 3 && src_bo == dst_bo)) &&
++	    can_switch_to_blt(sna, dst_bo, flags))
+ 		return true;
+ 
+ 	if (kgem_bo_is_render(dst_bo) ||
+ 	    kgem_bo_is_render(src_bo))
+ 		return false;
+ 
++	if (flags & COPY_LAST &&
++	    sna->render_state.gt < 3 &&
++            can_switch_to_blt(sna, dst_bo, flags))
++		return true;
++
+ 	if (prefer_render_ring(sna, dst_bo))
+ 		return false;
+ 
+ 	if (!prefer_blt_ring(sna, dst_bo, flags))
+ 		return false;
+ 
+-	return prefer_blt_bo(sna, src_bo) || prefer_blt_bo(sna, dst_bo);
++	return prefer_blt_bo(sna, src_bo, dst_bo);
+ }
+ 
+ static bool
+@@ -2770,7 +2849,7 @@ fallback_blt:
+ 		     &extents)) {
+ 		bool big = too_large(extents.x2-extents.x1, extents.y2-extents.y1);
+ 
+-		if ((big || can_switch_to_blt(sna, dst_bo, flags)) &&
++		if ((big || !prefer_render_ring(sna, dst_bo)) &&
+ 		    sna_blt_copy_boxes(sna, alu,
+ 				       src_bo, src_dx, src_dy,
+ 				       dst_bo, dst_dx, dst_dy,
+@@ -2785,8 +2864,7 @@ fallback_blt:
+ 		assert(src->depth == dst->depth);
+ 		assert(src->width == dst->width);
+ 		assert(src->height == dst->height);
+-		return sna_render_copy_boxes__overlap(sna, alu,
+-						      src, src_bo,
++		return sna_render_copy_boxes__overlap(sna, alu, dst, dst_bo,
+ 						      src_dx, src_dy,
+ 						      dst_dx, dst_dy,
+ 						      box, n, &extents);
+@@ -3665,7 +3743,9 @@ static void gen8_emit_video_state(struct sna *sna,
+ 			frame->pitch[0];
+ 		n_src = 6;
+ 	} else {
+-		if (frame->id == FOURCC_UYVY)
++		if (frame->id == FOURCC_RGB888)
++			src_surf_format = SURFACEFORMAT_B8G8R8X8_UNORM;
++		else if (frame->id == FOURCC_UYVY)
+ 			src_surf_format = SURFACEFORMAT_YCRCB_SWAPY;
+ 		else
+ 			src_surf_format = SURFACEFORMAT_YCRCB_NORMAL;
+@@ -3697,6 +3777,23 @@ static void gen8_emit_video_state(struct sna *sna,
+ 	gen8_emit_state(sna, op, offset);
+ }
+ 
++static unsigned select_video_kernel(const struct sna_video_frame *frame)
++{
++	switch (frame->id) {
++	case FOURCC_YV12:
++	case FOURCC_I420:
++	case FOURCC_XVMC:
++		return GEN8_WM_KERNEL_VIDEO_PLANAR;
++
++	case FOURCC_RGB888:
++	case FOURCC_RGB565:
++		return GEN8_WM_KERNEL_VIDEO_RGB;
++
++	default:
++		return GEN8_WM_KERNEL_VIDEO_PACKED;
++	}
++}
++
+ static bool
+ gen8_render_video(struct sna *sna,
+ 		  struct sna_video *video,
+@@ -3712,9 +3809,9 @@ gen8_render_video(struct sna *sna,
+ 	int src_height = frame->src.y2 - frame->src.y1;
+ 	float src_offset_x, src_offset_y;
+ 	float src_scale_x, src_scale_y;
+-	int nbox, pix_xoff, pix_yoff;
+ 	unsigned filter;
+ 	const BoxRec *box;
++	int nbox;
+ 
+ 	DBG(("%s: src=(%d, %d), dst=(%d, %d), %dx[(%d, %d), (%d, %d)...]\n",
+ 	     __FUNCTION__,
+@@ -3743,6 +3840,11 @@ gen8_render_video(struct sna *sna,
+ 	tmp.floats_per_vertex = 3;
+ 	tmp.floats_per_rect = 9;
+ 
++	DBG(("%s: scaling?=%d, planar?=%d [%x]\n",
++	     __FUNCTION__,
++	     src_width != dst_width || src_height != dst_height,
++	     is_planar_fourcc(frame->id), frame->id));
++
+ 	if (src_width == dst_width && src_height == dst_height)
+ 		filter = SAMPLER_FILTER_NEAREST;
+ 	else
+@@ -3752,9 +3854,7 @@ gen8_render_video(struct sna *sna,
+ 		GEN8_SET_FLAGS(SAMPLER_OFFSET(filter, SAMPLER_EXTEND_PAD,
+ 					      SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE),
+ 			       NO_BLEND,
+-			       is_planar_fourcc(frame->id) ?
+-			       GEN8_WM_KERNEL_VIDEO_PLANAR :
+-			       GEN8_WM_KERNEL_VIDEO_PACKED,
++			       select_video_kernel(frame),
+ 			       2);
+ 	tmp.priv = frame;
+ 
+@@ -3770,17 +3870,6 @@ gen8_render_video(struct sna *sna,
+ 	gen8_align_vertex(sna, &tmp);
+ 	gen8_emit_video_state(sna, &tmp);
+ 
+-	/* Set up the offset for translating from the given region (in screen
+-	 * coordinates) to the backing pixmap.
+-	 */
+-#ifdef COMPOSITE
+-	pix_xoff = -pixmap->screen_x + pixmap->drawable.x;
+-	pix_yoff = -pixmap->screen_y + pixmap->drawable.y;
+-#else
+-	pix_xoff = 0;
+-	pix_yoff = 0;
+-#endif
+-
+ 	DBG(("%s: src=(%d, %d)x(%d, %d); frame=(%dx%d), dst=(%dx%d)\n",
+ 	     __FUNCTION__,
+ 	     frame->src.x1, frame->src.y1,
+@@ -3802,45 +3891,36 @@ gen8_render_video(struct sna *sna,
+ 	box = region_rects(dstRegion);
+ 	nbox = region_num_rects(dstRegion);
+ 	while (nbox--) {
+-		BoxRec r;
+-
+ 		DBG(("%s: dst=(%d, %d), (%d, %d) + (%d, %d); src=(%f, %f), (%f, %f)\n",
+ 		     __FUNCTION__,
+ 		     box->x1, box->y1,
+ 		     box->x2, box->y2,
+-		     pix_xoff, pix_yoff,
+ 		     box->x1 * src_scale_x + src_offset_x,
+ 		     box->y1 * src_scale_y + src_offset_y,
+ 		     box->x2 * src_scale_x + src_offset_x,
+ 		     box->y2 * src_scale_y + src_offset_y));
+ 
+-		r.x1 = box->x1 + pix_xoff;
+-		r.x2 = box->x2 + pix_xoff;
+-		r.y1 = box->y1 + pix_yoff;
+-		r.y2 = box->y2 + pix_yoff;
+-
+ 		gen8_get_rectangles(sna, &tmp, 1, gen8_emit_video_state);
+ 
+-		OUT_VERTEX(r.x2, r.y2);
++		OUT_VERTEX(box->x2, box->y2);
+ 		OUT_VERTEX_F(box->x2 * src_scale_x + src_offset_x);
+ 		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
+ 
+-		OUT_VERTEX(r.x1, r.y2);
++		OUT_VERTEX(box->x1, box->y2);
+ 		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
+ 		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
+ 
+-		OUT_VERTEX(r.x1, r.y1);
++		OUT_VERTEX(box->x1, box->y1);
+ 		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
+ 		OUT_VERTEX_F(box->y1 * src_scale_y + src_offset_y);
+ 
+-		if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
+-			sna_damage_add_box(&priv->gpu_damage, &r);
+-			sna_damage_subtract_box(&priv->cpu_damage, &r);
+-		}
+ 		box++;
+ 	}
+-
+ 	gen8_vertex_flush(sna);
++
++	if (!DAMAGE_IS_ALL(priv->gpu_damage))
++		sna_damage_add(&priv->gpu_damage, dstRegion);
++
+ 	return true;
+ }
+ #endif
+@@ -3896,6 +3976,13 @@ static bool gen8_render_setup(struct sna *sna)
+ 		state->gt = ((devid >> 4) & 0xf) + 1;
+ 	DBG(("%s: gt=%d\n", __FUNCTION__, state->gt));
+ 
++	if (is_bdw(sna))
++		state->info = &bdw_gt_info;
++	else if (is_chv(sna))
++		state->info = &chv_gt_info;
++	else
++		return false;
++
+ 	sna_static_stream_init(&general);
+ 
+ 	/* Zero pad the start. If you see an offset of 0x0 in the batchbuffer
+@@ -4007,5 +4094,5 @@ const char *gen8_render_init(struct sna *sna, const char *backend)
+ 
+ 	sna->render.max_3d_size = GEN8_MAX_SIZE;
+ 	sna->render.max_3d_pitch = 1 << 18;
+-	return "Broadwell";
++	return sna->render_state.gen8.info->name;
+ }
+diff --git a/src/sna/gen8_render.h b/src/sna/gen8_render.h
+index eb4928e7..e6a8dc55 100644
+--- a/src/sna/gen8_render.h
++++ b/src/sna/gen8_render.h
+@@ -335,6 +335,7 @@
+ #define PIPE_CONTROL_IS_FLUSH      (1 << 11)
+ #define PIPE_CONTROL_TC_FLUSH      (1 << 10)
+ #define PIPE_CONTROL_NOTIFY_ENABLE (1 << 8)
++#define PIPE_CONTROL_FLUSH         (1 << 7)
+ #define PIPE_CONTROL_GLOBAL_GTT    (1 << 2)
+ #define PIPE_CONTROL_LOCAL_PGTT    (0 << 2)
+ #define PIPE_CONTROL_STALL_AT_SCOREBOARD   (1 << 1)
+diff --git a/src/sna/gen9_render.c b/src/sna/gen9_render.c
+new file mode 100644
+index 00000000..e5f12c72
+--- /dev/null
++++ b/src/sna/gen9_render.c
+@@ -0,0 +1,4156 @@
++/*
++ * Copyright © 2012,2013 Intel Corporation
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ *
++ * Authors:
++ *    Chris Wilson <chris@chris-wilson.co.uk>
++ *
++ */
++
++#ifdef HAVE_CONFIG_H
++#include "config.h"
++#endif
++
++#include "sna.h"
++#include "sna_reg.h"
++#include "sna_render.h"
++#include "sna_render_inline.h"
++#include "sna_video.h"
++
++#include "gen9_render.h"
++#include "gen8_eu.h"
++#include "gen4_common.h"
++#include "gen4_source.h"
++#include "gen4_vertex.h"
++#include "gen6_common.h"
++#include "gen8_vertex.h"
++
++#define SIM 1
++
++#define ALWAYS_INVALIDATE 0
++#define ALWAYS_FLUSH 0
++#define ALWAYS_STALL 0
++
++#define NO_COMPOSITE 0
++#define NO_COMPOSITE_SPANS 0
++#define NO_COPY 0
++#define NO_COPY_BOXES 0
++#define NO_FILL 0
++#define NO_FILL_BOXES 0
++#define NO_FILL_ONE 0
++#define NO_FILL_CLEAR 0
++#define NO_VIDEO 0
++
++#define USE_8_PIXEL_DISPATCH 1
++#define USE_16_PIXEL_DISPATCH 1
++#define USE_32_PIXEL_DISPATCH 0
++
++#if !USE_8_PIXEL_DISPATCH && !USE_16_PIXEL_DISPATCH && !USE_32_PIXEL_DISPATCH
++#error "Must select at least 8, 16 or 32 pixel dispatch"
++#endif
++
++#define GEN9_MAX_SIZE 16384
++#define GEN9_GT_BIAS 1 /* Each GT is bigger than previous gen */
++
++/* XXX Todo
++ *
++ * STR (software tiled rendering) mode. No, really.
++ * 64x32 pixel blocks align with the rendering cache. Worth considering.
++ */
++
++#define is_aligned(x, y) (((x) & ((y) - 1)) == 0)
++
++/* Pipeline stages:
++ *  1. Command Streamer (CS)
++ *  2. Vertex Fetch (VF)
++ *  3. Vertex Shader (VS)
++ *  4. Hull Shader (HS)
++ *  5. Tesselation Engine (TE)
++ *  6. Domain Shader (DS)
++ *  7. Geometry Shader (GS)
++ *  8. Stream Output Logic (SOL)
++ *  9. Clipper (CLIP)
++ * 10. Strip/Fan (SF)
++ * 11. Windower/Masker (WM)
++ * 12. Color Calculator (CC)
++ */
++
++#if !NO_VIDEO
++static const uint32_t ps_kernel_packed[][4] = {
++#include "exa_wm_src_affine.g8b"
++#include "exa_wm_src_sample_argb.g8b"
++#include "exa_wm_yuv_rgb.g8b"
++#include "exa_wm_write.g8b"
++};
++
++static const uint32_t ps_kernel_planar[][4] = {
++#include "exa_wm_src_affine.g8b"
++#include "exa_wm_src_sample_planar.g8b"
++#include "exa_wm_yuv_rgb.g8b"
++#include "exa_wm_write.g8b"
++};
++
++static const uint32_t ps_kernel_rgb[][4] = {
++#include "exa_wm_src_affine.g8b"
++#include "exa_wm_src_sample_argb.g8b"
++#include "exa_wm_write.g8b"
++};
++#endif
++
++#define SURFACE_DW (64 / sizeof(uint32_t));
++
++#define KERNEL(kernel_enum, kernel, num_surfaces) \
++    [GEN9_WM_KERNEL_##kernel_enum] = {#kernel_enum, kernel, sizeof(kernel), num_surfaces}
++#define NOKERNEL(kernel_enum, func, num_surfaces) \
++    [GEN9_WM_KERNEL_##kernel_enum] = {#kernel_enum, (void *)func, 0, num_surfaces}
++static const struct wm_kernel_info {
++	const char *name;
++	const void *data;
++	unsigned int size;
++	int num_surfaces;
++} wm_kernels[] = {
++	NOKERNEL(NOMASK, gen8_wm_kernel__affine, 2),
++	NOKERNEL(NOMASK_P, gen8_wm_kernel__projective, 2),
++
++	NOKERNEL(MASK, gen8_wm_kernel__affine_mask, 3),
++	NOKERNEL(MASK_P, gen8_wm_kernel__projective_mask, 3),
++
++	NOKERNEL(MASKCA, gen8_wm_kernel__affine_mask_ca, 3),
++	NOKERNEL(MASKCA_P, gen8_wm_kernel__projective_mask_ca, 3),
++
++	NOKERNEL(MASKSA, gen8_wm_kernel__affine_mask_sa, 3),
++	NOKERNEL(MASKSA_P, gen8_wm_kernel__projective_mask_sa, 3),
++
++	NOKERNEL(OPACITY, gen8_wm_kernel__affine_opacity, 2),
++	NOKERNEL(OPACITY_P, gen8_wm_kernel__projective_opacity, 2),
++
++#if !NO_VIDEO
++	KERNEL(VIDEO_PLANAR, ps_kernel_planar, 7),
++	KERNEL(VIDEO_PACKED, ps_kernel_packed, 2),
++	KERNEL(VIDEO_RGB, ps_kernel_rgb, 2),
++#endif
++};
++#undef KERNEL
++
++static const struct blendinfo {
++	uint8_t src_alpha;
++	uint8_t src_blend;
++	uint8_t dst_blend;
++} gen9_blend_op[] = {
++	/* Clear */	{0, BLENDFACTOR_ZERO, BLENDFACTOR_ZERO},
++	/* Src */	{0, BLENDFACTOR_ONE, BLENDFACTOR_ZERO},
++	/* Dst */	{0, BLENDFACTOR_ZERO, BLENDFACTOR_ONE},
++	/* Over */	{1, BLENDFACTOR_ONE, BLENDFACTOR_INV_SRC_ALPHA},
++	/* OverReverse */ {0, BLENDFACTOR_INV_DST_ALPHA, BLENDFACTOR_ONE},
++	/* In */	{0, BLENDFACTOR_DST_ALPHA, BLENDFACTOR_ZERO},
++	/* InReverse */	{1, BLENDFACTOR_ZERO, BLENDFACTOR_SRC_ALPHA},
++	/* Out */	{0, BLENDFACTOR_INV_DST_ALPHA, BLENDFACTOR_ZERO},
++	/* OutReverse */ {1, BLENDFACTOR_ZERO, BLENDFACTOR_INV_SRC_ALPHA},
++	/* Atop */	{1, BLENDFACTOR_DST_ALPHA, BLENDFACTOR_INV_SRC_ALPHA},
++	/* AtopReverse */ {1, BLENDFACTOR_INV_DST_ALPHA, BLENDFACTOR_SRC_ALPHA},
++	/* Xor */	{1, BLENDFACTOR_INV_DST_ALPHA, BLENDFACTOR_INV_SRC_ALPHA},
++	/* Add */	{0, BLENDFACTOR_ONE, BLENDFACTOR_ONE},
++};
++
++/**
++ * Highest-valued BLENDFACTOR used in gen9_blend_op.
++ *
++ * This leaves out GEN9_BLENDFACTOR_INV_DST_COLOR,
++ * GEN9_BLENDFACTOR_INV_CONST_{COLOR,ALPHA},
++ * GEN9_BLENDFACTOR_INV_SRC1_{COLOR,ALPHA}
++ */
++#define GEN9_BLENDFACTOR_COUNT (BLENDFACTOR_INV_DST_ALPHA + 1)
++
++#define GEN9_BLEND_STATE_PADDED_SIZE	ALIGN(sizeof(struct gen9_blend_state), 64)
++
++#define BLEND_OFFSET(s, d) \
++	((d != BLENDFACTOR_ZERO) << 15 | ((s) * GEN9_BLENDFACTOR_COUNT + (d)) << 4)
++
++#define NO_BLEND BLEND_OFFSET(BLENDFACTOR_ONE, BLENDFACTOR_ZERO)
++#define CLEAR BLEND_OFFSET(BLENDFACTOR_ZERO, BLENDFACTOR_ZERO)
++
++#define SAMPLER_OFFSET(sf, se, mf, me) \
++	(((((sf) * EXTEND_COUNT + (se)) * FILTER_COUNT + (mf)) * EXTEND_COUNT + (me)) + 2)
++
++#define VERTEX_2s2s 0
++
++#define COPY_SAMPLER 0
++#define COPY_VERTEX VERTEX_2s2s
++#define COPY_FLAGS(a) GEN9_SET_FLAGS(COPY_SAMPLER, (a) == GXcopy ? NO_BLEND : CLEAR, GEN9_WM_KERNEL_NOMASK, COPY_VERTEX)
++
++#define FILL_SAMPLER 1
++#define FILL_VERTEX VERTEX_2s2s
++#define FILL_FLAGS(op, format) GEN9_SET_FLAGS(FILL_SAMPLER, gen9_get_blend((op), false, (format)), GEN9_WM_KERNEL_NOMASK, FILL_VERTEX)
++#define FILL_FLAGS_NOBLEND GEN9_SET_FLAGS(FILL_SAMPLER, NO_BLEND, GEN9_WM_KERNEL_NOMASK, FILL_VERTEX)
++
++#define GEN9_SAMPLER(f) (((f) >> 20) & 0xfff)
++#define GEN9_BLEND(f) (((f) >> 4) & 0x7ff)
++#define GEN9_READS_DST(f) (((f) >> 15) & 1)
++#define GEN9_KERNEL(f) (((f) >> 16) & 0xf)
++#define GEN9_VERTEX(f) (((f) >> 0) & 0xf)
++#define GEN9_SET_FLAGS(S, B, K, V)  ((S) << 20 | (K) << 16 | (B) | (V))
++
++#define OUT_BATCH(v) batch_emit(sna, v)
++#define OUT_BATCH64(v) batch_emit64(sna, v)
++#define OUT_VERTEX(x,y) vertex_emit_2s(sna, x,y)
++#define OUT_VERTEX_F(v) vertex_emit(sna, v)
++
++struct gt_info {
++	const char *name;
++	struct {
++		int max_vs_entries;
++	} urb;
++};
++
++static const struct gt_info min_gt_info = {
++	.name = "Skylake (gen9)",
++	.urb = { .max_vs_entries = 240 },
++};
++
++static const struct gt_info skl_gt_info = {
++	.name = "Skylake (gen9)",
++	.urb = { .max_vs_entries = 960 },
++};
++
++static const struct gt_info bxt_gt_info = {
++	.name = "Broxton (gen9)",
++	.urb = { .max_vs_entries = 320 },
++};
++
++static const struct gt_info kbl_gt_info = {
++	.name = "Kabylake (gen9)",
++	.urb = { .max_vs_entries = 960 },
++};
++
++static const struct gt_info glk_gt_info = {
++	.name = "Geminilake (gen9)",
++	.urb = { .max_vs_entries = 320 },
++};
++
++static bool is_skl(struct sna *sna)
++{
++	return sna->kgem.gen == 0110;
++}
++
++static bool is_bxt(struct sna *sna)
++{
++	return sna->kgem.gen == 0111;
++}
++
++static bool is_kbl(struct sna *sna)
++{
++	return sna->kgem.gen == 0112;
++}
++
++static bool is_glk(struct sna *sna)
++{
++	return sna->kgem.gen == 0113;
++}
++
++
++static inline bool too_large(int width, int height)
++{
++	return width > GEN9_MAX_SIZE || height > GEN9_MAX_SIZE;
++}
++
++static inline bool unaligned(struct kgem_bo *bo, int bpp)
++{
++	/* XXX What exactly do we need to meet H_ALIGN and V_ALIGN? */
++#if 0
++	int x, y;
++
++	if (bo->proxy == NULL)
++		return false;
++
++	/* Assume that all tiled proxies are constructed correctly. */
++	if (bo->tiling)
++		return false;
++
++	DBG(("%s: checking alignment of a linear proxy, offset=%d, pitch=%d, bpp=%d: => (%d, %d)\n",
++	     __FUNCTION__, bo->delta, bo->pitch, bpp,
++	     8 * (bo->delta % bo->pitch) / bpp, bo->delta / bo->pitch));
++
++	/* This may be a random userptr map, check that it meets the
++	 * render alignment of SURFACE_VALIGN_4 | SURFACE_HALIGN_4.
++	 */
++	y = bo->delta / bo->pitch;
++	if (y & 3)
++		return true;
++
++	x = 8 * (bo->delta - y * bo->pitch);
++	if (x & (4*bpp - 1))
++	    return true;
++
++	return false;
++#else
++	return false;
++#endif
++}
++
++static uint32_t gen9_get_blend(int op,
++			       bool has_component_alpha,
++			       uint32_t dst_format)
++{
++	uint32_t src, dst;
++
++	COMPILE_TIME_ASSERT(BLENDFACTOR_INV_DST_ALPHA*GEN9_BLENDFACTOR_COUNT + BLENDFACTOR_INV_DST_ALPHA <= 0x7ff);
++
++	src = gen9_blend_op[op].src_blend;
++	dst = gen9_blend_op[op].dst_blend;
++
++	/* If there's no dst alpha channel, adjust the blend op so that
++	 * we'll treat it always as 1.
++	 */
++	if (PICT_FORMAT_A(dst_format) == 0) {
++		if (src == BLENDFACTOR_DST_ALPHA)
++			src = BLENDFACTOR_ONE;
++		else if (src == BLENDFACTOR_INV_DST_ALPHA)
++			src = BLENDFACTOR_ZERO;
++	}
++
++	/* If the source alpha is being used, then we should only be in a
++	 * case where the source blend factor is 0, and the source blend
++	 * value is the mask channels multiplied by the source picture's alpha.
++	 */
++	if (has_component_alpha && gen9_blend_op[op].src_alpha) {
++		if (dst == BLENDFACTOR_SRC_ALPHA)
++			dst = BLENDFACTOR_SRC_COLOR;
++		else if (dst == BLENDFACTOR_INV_SRC_ALPHA)
++			dst = BLENDFACTOR_INV_SRC_COLOR;
++	}
++
++	DBG(("blend op=%d, dst=%x [A=%d] => src=%d, dst=%d => offset=%x\n",
++	     op, dst_format, PICT_FORMAT_A(dst_format),
++	     src, dst, (int)(BLEND_OFFSET(src, dst)>>4)));
++	assert(BLEND_OFFSET(src, dst) >> 4 <= 0xfff);
++	return BLEND_OFFSET(src, dst);
++}
++
++static uint32_t gen9_get_card_format(PictFormat format)
++{	/* Map a Render picture format to its SURFACEFORMAT_* value, or -1 if there is none */
++	switch (format) {
++	case PICT_a8:
++		return SURFACEFORMAT_A8_UNORM;
++	case PICT_a4r4g4b4:
++		return SURFACEFORMAT_B4G4R4A4_UNORM;
++	case PICT_a1r5g5b5:
++		return SURFACEFORMAT_B5G5R5A1_UNORM;
++	case PICT_r5g6b5:
++		return SURFACEFORMAT_B5G6R5_UNORM;
++	case PICT_r8g8b8:
++		return SURFACEFORMAT_R8G8B8_UNORM;
++	case PICT_x8b8g8r8:
++		return SURFACEFORMAT_R8G8B8X8_UNORM;
++	case PICT_a8b8g8r8:
++		return SURFACEFORMAT_R8G8B8A8_UNORM;
++	case PICT_x8r8g8b8:
++		return SURFACEFORMAT_B8G8R8X8_UNORM;
++	case PICT_a8r8g8b8:
++		return SURFACEFORMAT_B8G8R8A8_UNORM;
++#ifdef PICT_a2r10g10b10
++	case PICT_x2r10g10b10:
++		return SURFACEFORMAT_B10G10R10X2_UNORM;
++	case PICT_a2r10g10b10:
++		return SURFACEFORMAT_B10G10R10A2_UNORM;
++#endif
++	default: /* every other picture format is reported as unsupported */
++		return -1;
++	}
++}
++
++static uint32_t gen9_get_dest_format(PictFormat format)
++{	/* Like gen9_get_card_format(), but x-formats share the surface format of their a-variant */
++	switch (format) {
++	case PICT_a8:
++		return SURFACEFORMAT_A8_UNORM;
++	case PICT_x4r4g4b4:
++	case PICT_a4r4g4b4:
++		return SURFACEFORMAT_B4G4R4A4_UNORM;
++	case PICT_a1r5g5b5:
++	case PICT_x1r5g5b5:
++		return SURFACEFORMAT_B5G5R5A1_UNORM;
++	case PICT_r5g6b5:
++		return SURFACEFORMAT_B5G6R5_UNORM;
++	case PICT_x8b8g8r8:
++	case PICT_a8b8g8r8:
++		return SURFACEFORMAT_R8G8B8A8_UNORM;
++	case PICT_x8r8g8b8:
++	case PICT_a8r8g8b8:
++		return SURFACEFORMAT_B8G8R8A8_UNORM;
++#ifdef PICT_a2r10g10b10
++	case PICT_x2r10g10b10:
++	case PICT_a2r10g10b10:
++		return SURFACEFORMAT_B10G10R10A2_UNORM;
++#endif
++	default: /* every other picture format is reported as unsupported */
++		return -1;
++	}
++}
++
++static bool gen9_check_dst_format(PictFormat format)
++{
++	const bool usable = gen9_get_dest_format(format) != -1; /* -1 means no mapping */
++	if (!usable) {
++		DBG(("%s: unhandled format: %x\n", __FUNCTION__, (int)format));
++	}
++	return usable;
++}
++
++static bool gen9_check_format(uint32_t format)
++{
++	if (gen9_get_card_format(format) == -1) { /* -1 means no mapping */
++		DBG(("%s: unhandled format: %x\n", __FUNCTION__, (int)format));
++		return false;
++	}
++	return true;
++}
++
++static uint32_t gen9_filter(uint32_t filter)
++{
++	switch (filter) {
++	case PictFilterBilinear:
++		return SAMPLER_FILTER_BILINEAR;
++	default:
++		assert(0); /* fallthrough: with asserts compiled out, unknown filters map to nearest */
++	case PictFilterNearest:
++		return SAMPLER_FILTER_NEAREST;
++	}
++}
++
++static bool gen9_check_filter(PicturePtr picture)
++{	/* true only for the filters gen9_filter() knows how to translate */
++	switch (picture->filter) {
++	case PictFilterNearest:
++	case PictFilterBilinear:
++		return true;
++	default: /* any other filter is rejected */
++		return false;
++	}
++}
++
++static uint32_t gen9_repeat(uint32_t repeat)
++{
++	switch (repeat) {
++	case RepeatReflect:
++		return SAMPLER_EXTEND_REFLECT;
++	case RepeatPad:
++		return SAMPLER_EXTEND_PAD;
++	case RepeatNormal:
++		return SAMPLER_EXTEND_REPEAT;
++	default:
++		assert(0); /* fallthrough: with asserts compiled out, unknown modes map to none */
++	case RepeatNone:
++		return SAMPLER_EXTEND_NONE;
++	}
++}
++
++static bool gen9_check_repeat(PicturePtr picture)
++{
++	if (!picture->repeat) /* repeat disabled: repeatType is not inspected */
++		return true;
++
++	switch (picture->repeatType) { /* accept exactly the modes gen9_repeat() translates */
++	default:
++		return false;
++	case RepeatReflect:
++	case RepeatPad:
++	case RepeatNormal:
++	case RepeatNone:
++		return true;
++	}
++}
++
++/*
++ * Pick the WM kernel for a composite: no mask, plain mask, or one of the two
++ * component-alpha mask kernels; a non-affine op selects the kernel id + 1.
++ */
++static int
++gen9_choose_composite_kernel(int op, bool has_mask, bool is_ca, bool is_affine)
++{
++	int kernel = GEN9_WM_KERNEL_NOMASK;
++
++	if (has_mask) {
++		kernel = GEN9_WM_KERNEL_MASK;
++		if (is_ca)
++			kernel = gen9_blend_op[op].src_alpha ?
++				GEN9_WM_KERNEL_MASKSA : GEN9_WM_KERNEL_MASKCA;
++	}
++
++	return kernel + !is_affine;
++}
++
++static void
++gen9_emit_push_constants(struct sna *sna)
++{
++#if SIM /* the whole body is compiled only when SIM is non-zero; otherwise nothing is emitted */
++	OUT_BATCH(GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_VS | (2 - 2)); /* each stage: header + one zero dword */
++	OUT_BATCH(0);
++
++	OUT_BATCH(GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_HS | (2 - 2));
++	OUT_BATCH(0);
++
++	OUT_BATCH(GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_DS | (2 - 2));
++	OUT_BATCH(0);
++
++	OUT_BATCH(GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_GS | (2 - 2));
++	OUT_BATCH(0);
++
++	OUT_BATCH(GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_PS | (2 - 2));
++	OUT_BATCH(0);
++#endif
++}
++
++static void
++gen9_emit_urb(struct sna *sna)
++{
++	/* num of VS entries must be divisible by 8 if size < 9 */
++	OUT_BATCH(GEN9_3DSTATE_URB_VS | (2 - 2));
++	OUT_BATCH(sna->render_state.gen9.info->urb.max_vs_entries << URB_ENTRY_NUMBER_SHIFT | /* VS gets the per-device maximum */
++		  (2 - 1) << URB_ENTRY_SIZE_SHIFT |
++		  4 << URB_STARTING_ADDRESS_SHIFT);
++
++	OUT_BATCH(GEN9_3DSTATE_URB_HS | (2 - 2)); /* HS, DS and GS: no entry count, same starting address as VS */
++	OUT_BATCH(0 << URB_ENTRY_SIZE_SHIFT |
++		  4 << URB_STARTING_ADDRESS_SHIFT);
++
++	OUT_BATCH(GEN9_3DSTATE_URB_DS | (2 - 2));
++	OUT_BATCH(0 << URB_ENTRY_SIZE_SHIFT |
++		  4 << URB_STARTING_ADDRESS_SHIFT);
++
++	OUT_BATCH(GEN9_3DSTATE_URB_GS | (2 - 2));
++	OUT_BATCH(0 << URB_ENTRY_SIZE_SHIFT |
++		  4 << URB_STARTING_ADDRESS_SHIFT);
++}
++
++static void
++gen9_emit_state_base_address(struct sna *sna)
++{
++	uint32_t num_pages;
++
++	assert(sna->kgem.surface - sna->kgem.nbatch <= 16384);
++
++	/* WaBindlessSurfaceStateModifyEnable:skl,bxt */
++	OUT_BATCH(GEN9_STATE_BASE_ADDRESS | (19 - 1 - 2)); /* NOTE(review): 19 dwords follow but the length is one short, presumably the workaround above; confirm */
++	OUT_BATCH64(0); /* general */
++	OUT_BATCH(0); /* stateless dataport */
++	OUT_BATCH64(kgem_add_reloc64(&sna->kgem, /* surface */
++				     sna->kgem.nbatch,
++				     NULL,
++				     I915_GEM_DOMAIN_INSTRUCTION << 16,
++				     BASE_ADDRESS_MODIFY));
++	OUT_BATCH64(kgem_add_reloc64(&sna->kgem, /* dynamic */
++				     sna->kgem.nbatch,
++				     sna->render_state.gen9.general_bo,
++				     I915_GEM_DOMAIN_INSTRUCTION << 16,
++				     BASE_ADDRESS_MODIFY));
++	OUT_BATCH64(0); /* indirect */
++	OUT_BATCH64(kgem_add_reloc64(&sna->kgem, /* instruction */
++				     sna->kgem.nbatch,
++				     sna->render_state.gen9.general_bo,
++				     I915_GEM_DOMAIN_INSTRUCTION << 16,
++				     BASE_ADDRESS_MODIFY));
++	/* upper bounds */
++	num_pages = sna->render_state.gen9.general_bo->size.pages.count; /* dynamic and instruction both live in general_bo */
++	OUT_BATCH(0); /* general */
++	OUT_BATCH(num_pages << 12 | 1); /* dynamic */
++	OUT_BATCH(0); /* indirect */
++	OUT_BATCH(num_pages << 12 | 1); /* instruction */
++
++	/* Bindless */
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++}
++
++static void
++gen9_emit_vs_invariant(struct sna *sna)
++{
++	OUT_BATCH(GEN9_3DSTATE_VS | (9 - 2)); /* 9-dword packet with no kernel and no scratch space */
++	OUT_BATCH64(0); /* no VS kernel */
++	OUT_BATCH(0);
++	OUT_BATCH64(0); /* scratch */
++	OUT_BATCH(0);
++	OUT_BATCH(1 << 1); /* pass-through */
++	OUT_BATCH(1 << 16 | 1 << 21); /* urb write to SBE */
++
++#if SIM /* the remaining packets are emitted only when SIM is non-zero */
++	OUT_BATCH(GEN9_3DSTATE_CONSTANT_VS | (11 - 2));
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH64(0);
++	OUT_BATCH64(0);
++	OUT_BATCH64(0);
++	OUT_BATCH64(0);
++
++	OUT_BATCH(GEN9_3DSTATE_BINDING_TABLE_POINTERS_VS | (2 - 2));
++	OUT_BATCH(0);
++
++	OUT_BATCH(GEN9_3DSTATE_SAMPLER_STATE_POINTERS_VS | (2 - 2));
++	OUT_BATCH(0);
++#endif
++}
++
++static void
++gen9_emit_hs_invariant(struct sna *sna)
++{
++	OUT_BATCH(GEN9_3DSTATE_HS | (9 - 2)); /* 9-dword packet, every payload dword zero */
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH64(0); /* no HS kernel */
++	OUT_BATCH64(0); /* scratch */
++	OUT_BATCH(0);
++	OUT_BATCH(0); /* pass-through */
++
++#if SIM /* the remaining packets are emitted only when SIM is non-zero */
++	OUT_BATCH(GEN9_3DSTATE_CONSTANT_HS | (11 - 2));
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH64(0);
++	OUT_BATCH64(0);
++	OUT_BATCH64(0);
++	OUT_BATCH64(0);
++
++#if 1
++	OUT_BATCH(GEN9_3DSTATE_BINDING_TABLE_POINTERS_HS | (2 - 2));
++	OUT_BATCH(0);
++
++	OUT_BATCH(GEN9_3DSTATE_SAMPLER_STATE_POINTERS_HS | (2 - 2));
++	OUT_BATCH(0);
++#endif
++#endif
++}
++
++static void
++gen9_emit_te_invariant(struct sna *sna)
++{
++	OUT_BATCH(GEN9_3DSTATE_TE | (4 - 2)); /* header followed by three zero dwords */
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++}
++
++static void
++gen9_emit_ds_invariant(struct sna *sna)
++{
++	OUT_BATCH(GEN9_3DSTATE_DS | (11 - 2)); /* 11-dword packet, every payload dword zero */
++	OUT_BATCH64(0); /* no kernel */
++	OUT_BATCH(0);
++	OUT_BATCH64(0); /* scratch */
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++
++#if SIM /* the remaining packets are emitted only when SIM is non-zero */
++	OUT_BATCH(GEN9_3DSTATE_CONSTANT_DS | (11 - 2));
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH64(0);
++	OUT_BATCH64(0);
++	OUT_BATCH64(0);
++	OUT_BATCH64(0);
++
++#if 1
++	OUT_BATCH(GEN9_3DSTATE_BINDING_TABLE_POINTERS_DS | (2 - 2));
++	OUT_BATCH(0);
++
++	OUT_BATCH(GEN9_3DSTATE_SAMPLER_STATE_POINTERS_DS | (2 - 2));
++	OUT_BATCH(0);
++#endif
++#endif
++}
++
++static void
++gen9_emit_gs_invariant(struct sna *sna)
++{
++	OUT_BATCH(GEN9_3DSTATE_GS | (10 - 2)); /* 10-dword packet, every payload dword zero */
++	OUT_BATCH64(0); /* no GS kernel */
++	OUT_BATCH(0);
++	OUT_BATCH64(0); /* scratch */
++	OUT_BATCH(0);
++	OUT_BATCH(0); /* pass-through */
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++
++#if SIM /* the remaining packets are emitted only when SIM is non-zero */
++	OUT_BATCH(GEN9_3DSTATE_CONSTANT_GS | (11 - 2));
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH64(0);
++	OUT_BATCH64(0);
++	OUT_BATCH64(0);
++	OUT_BATCH64(0);
++
++#if 1
++	OUT_BATCH(GEN9_3DSTATE_BINDING_TABLE_POINTERS_GS | (2 - 2));
++	OUT_BATCH(0);
++
++	OUT_BATCH(GEN9_3DSTATE_SAMPLER_STATE_POINTERS_GS | (2 - 2));
++	OUT_BATCH(0);
++#endif
++#endif
++}
++
++static void
++gen9_emit_sol_invariant(struct sna *sna)
++{
++	OUT_BATCH(GEN9_3DSTATE_STREAMOUT | (5 - 2)); /* header followed by four zero dwords */
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++}
++
++static void
++gen9_emit_sf_invariant(struct sna *sna)
++{
++	OUT_BATCH(GEN9_3DSTATE_SF | (4 - 2)); /* header followed by three zero dwords */
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++}
++
++static void
++gen9_emit_clip_invariant(struct sna *sna)
++{
++	OUT_BATCH(GEN9_3DSTATE_CLIP | (4 - 2)); /* header followed by three zero dwords */
++	OUT_BATCH(0);
++	OUT_BATCH(0); /* pass-through */
++	OUT_BATCH(0);
++
++	OUT_BATCH(GEN9_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP | (2 - 2)); /* both viewport pointers are written as 0 */
++	OUT_BATCH(0);
++
++	OUT_BATCH(GEN9_3DSTATE_VIEWPORT_STATE_POINTERS_CC | (2 - 2));
++	OUT_BATCH(0);
++}
++
++static void
++gen9_emit_null_depth_buffer(struct sna *sna)
++{
++	OUT_BATCH(GEN9_3DSTATE_DEPTH_BUFFER | (8 - 2));
++#if 1 /* active branch: SURFACE_NULL depth buffer; the #else variant is compiled out */
++	OUT_BATCH(SURFACE_NULL << DEPTH_BUFFER_TYPE_SHIFT |
++		  DEPTHFORMAT_D32_FLOAT << DEPTH_BUFFER_FORMAT_SHIFT);
++#else
++	OUT_BATCH(SURFACE_2D << DEPTH_BUFFER_TYPE_SHIFT |
++		  DEPTHFORMAT_D16_UNORM << DEPTH_BUFFER_FORMAT_SHIFT);
++#endif
++	OUT_BATCH64(0); /* remaining six dwords of the packet are zero */
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++
++#if SIM /* hierarchical depth, stencil, depth-stencil and clear params: SIM builds only */
++	OUT_BATCH(GEN9_3DSTATE_HIER_DEPTH_BUFFER | (5 - 2));
++	OUT_BATCH(0);
++	OUT_BATCH64(0);
++	OUT_BATCH(0);
++#endif
++
++#if SIM
++	OUT_BATCH(GEN9_3DSTATE_STENCIL_BUFFER | (5 - 2));
++	OUT_BATCH(0);
++	OUT_BATCH64(0);
++	OUT_BATCH(0);
++#endif
++
++#if SIM
++	OUT_BATCH(GEN9_3DSTATE_WM_DEPTH_STENCIL | (4 - 2));
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++#endif
++
++#if SIM
++	OUT_BATCH(GEN9_3DSTATE_CLEAR_PARAMS | (3 - 2));
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++#endif
++}
++
++static void
++gen9_emit_wm_invariant(struct sna *sna)
++{
++	gen9_emit_null_depth_buffer(sna); /* depth buffer state is emitted first */
++
++#if SIM
++	OUT_BATCH(GEN9_3DSTATE_SCISSOR_STATE_POINTERS | (2 - 2));
++	OUT_BATCH(0);
++#endif
++
++	OUT_BATCH(GEN9_3DSTATE_WM | (2 - 2));
++	//OUT_BATCH(WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC); /* XXX */
++	OUT_BATCH(WM_PERSPECTIVE_PIXEL_BARYCENTRIC);
++
++#if SIM
++	OUT_BATCH(GEN9_3DSTATE_WM_CHROMAKEY | (2 - 2));
++	OUT_BATCH(0);
++#endif
++
++#if 0 /* WM_HZ_OP is never emitted */
++	OUT_BATCH(GEN9_3DSTATE_WM_HZ_OP | (5 - 2));
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++#endif
++
++	OUT_BATCH(GEN9_3DSTATE_PS_EXTRA | (2 - 2));
++	OUT_BATCH(PSX_PIXEL_SHADER_VALID |
++		  PSX_ATTRIBUTE_ENABLE);
++
++	OUT_BATCH(GEN9_3DSTATE_RASTER | (5 - 2));
++	OUT_BATCH(RASTER_FRONT_WINDING_CCW |
++		  RASTER_CULL_NONE);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++
++	OUT_BATCH(GEN9_3DSTATE_SBE_SWIZ | (11 - 2)); /* header followed by ten zero dwords */
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++
++#if SIM
++	OUT_BATCH(GEN9_3DSTATE_CONSTANT_PS | (11 - 2));
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH64(0);
++	OUT_BATCH64(0);
++	OUT_BATCH64(0);
++	OUT_BATCH64(0);
++#endif
++}
++
++static void
++gen9_emit_cc_invariant(struct sna *sna)
++{ /* empty: emits nothing, but is still called from gen9_emit_invariant() */
++}
++
++static void
++gen9_emit_vf_invariant(struct sna *sna)
++{
++	int n;
++
++#if 1
++	OUT_BATCH(GEN9_3DSTATE_VF | (2 - 2));
++	OUT_BATCH(0);
++#endif
++
++	OUT_BATCH(GEN9_3DSTATE_VF_SGVS | (2 - 2));
++	OUT_BATCH(0);
++
++	OUT_BATCH(GEN9_3DSTATE_VF_TOPOLOGY | (2 - 2)); /* topology is fixed to RECTLIST here */
++	OUT_BATCH(RECTLIST);
++
++	OUT_BATCH(GEN9_3DSTATE_VF_STATISTICS | 0);
++
++	for (n = 1; n <= 3; n++) { /* one VF_INSTANCING packet for each of n = 1, 2, 3 */
++		OUT_BATCH(GEN9_3DSTATE_VF_INSTANCING | (3 - 2));
++		OUT_BATCH(n);
++		OUT_BATCH(0);
++	}
++}
++
++static void
++gen9_emit_invariant(struct sna *sna)
++{
++	OUT_BATCH(GEN9_PIPELINE_SELECT | /* select the 3D pipeline before any 3DSTATE packet */
++		  PIPELINE_SELECTION_MASK |
++		  PIPELINE_SELECT_3D);
++
++#if SIM
++	OUT_BATCH(GEN9_STATE_SIP | (3 - 2));
++	OUT_BATCH64(0);
++#endif
++
++	OUT_BATCH(GEN9_3DSTATE_MULTISAMPLE | (2 - 2));
++	OUT_BATCH(MULTISAMPLE_PIXEL_LOCATION_CENTER |
++		  MULTISAMPLE_NUMSAMPLES_1); /* 1 sample/pixel */
++
++	OUT_BATCH(GEN9_3DSTATE_SAMPLE_MASK | (2 - 2));
++	OUT_BATCH(1);
++
++#if SIM
++	OUT_BATCH(GEN9_3DSTATE_SAMPLE_PATTERN | (5 - 2));
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++	//OUT_BATCH(8<<20 | 8<<16);
++	OUT_BATCH(0);
++#endif
++
++	gen9_emit_push_constants(sna);
++	gen9_emit_urb(sna);
++
++	gen9_emit_state_base_address(sna);
++
++	gen9_emit_vf_invariant(sna); /* then one helper per fixed-function stage, in this order */
++	gen9_emit_vs_invariant(sna);
++	gen9_emit_hs_invariant(sna);
++	gen9_emit_te_invariant(sna);
++	gen9_emit_ds_invariant(sna);
++	gen9_emit_gs_invariant(sna);
++	gen9_emit_sol_invariant(sna);
++	gen9_emit_clip_invariant(sna);
++	gen9_emit_sf_invariant(sna);
++	gen9_emit_wm_invariant(sna);
++	gen9_emit_cc_invariant(sna);
++
++	sna->render_state.gen9.needs_invariant = false; /* record that the invariant state has been emitted */
++}
++
++static void
++gen9_emit_cc(struct sna *sna, uint32_t blend)
++{
++	struct gen9_render_state *render = &sna->render_state.gen9;
++
++	if (render->blend == blend) /* same blend as last emitted: nothing to do */
++		return;
++
++	DBG(("%s: blend=%x (current=%x), src=%d, dst=%d\n",
++	     __FUNCTION__, blend, render->blend,
++	     blend / GEN9_BLENDFACTOR_COUNT,
++	     blend % GEN9_BLENDFACTOR_COUNT));
++
++	assert(blend < GEN9_BLENDFACTOR_COUNT * GEN9_BLENDFACTOR_COUNT); /* blend encodes src * COUNT + dst */
++	assert(blend / GEN9_BLENDFACTOR_COUNT > 0);
++	assert(blend % GEN9_BLENDFACTOR_COUNT > 0);
++
++	/* XXX can have up to 8 blend states preload, selectable via
++	 * Render Target Index. What other side-effects of Render Target Index?
++	 */
++
++	OUT_BATCH(GEN9_3DSTATE_PS_BLEND | (2 - 2));
++	if (blend != GEN9_BLEND(NO_BLEND)) {
++		uint32_t src = blend / GEN9_BLENDFACTOR_COUNT;
++		uint32_t dst = blend % GEN9_BLENDFACTOR_COUNT;
++		OUT_BATCH(PS_BLEND_HAS_WRITEABLE_RT | /* the same factors are used for colour and alpha */
++			  PS_BLEND_COLOR_BLEND_ENABLE |
++			  src << PS_BLEND_SRC_ALPHA_SHIFT |
++			  dst << PS_BLEND_DST_ALPHA_SHIFT |
++			  src << PS_BLEND_SRC_SHIFT |
++			  dst << PS_BLEND_DST_SHIFT);
++	} else
++		OUT_BATCH(PS_BLEND_HAS_WRITEABLE_RT);
++
++	assert(is_aligned(render->cc_blend + blend * GEN9_BLEND_STATE_PADDED_SIZE, 64));
++	OUT_BATCH(GEN9_3DSTATE_BLEND_STATE_POINTERS | (2 - 2));
++	OUT_BATCH((render->cc_blend + blend * GEN9_BLEND_STATE_PADDED_SIZE) | 1); /* blend states are indexed by the blend value from cc_blend */
++
++	/* Force a CC_STATE pointer change to improve blend performance */
++	OUT_BATCH(GEN9_3DSTATE_CC_STATE_POINTERS | (2 - 2));
++	OUT_BATCH(0);
++
++	render->blend = blend; /* remember for the early-out above */
++}
++
++static void
++gen9_emit_sampler(struct sna *sna, uint32_t state)
++{
++	if (sna->render_state.gen9.samplers == state) /* same sampler pair as last emitted: nothing to do */
++		return;
++
++	sna->render_state.gen9.samplers = state;
++
++	DBG(("%s: sampler = %x\n", __FUNCTION__, state));
++
++	assert(2 * sizeof(struct gen9_sampler_state) == 32);
++	OUT_BATCH(GEN9_3DSTATE_SAMPLER_STATE_POINTERS_PS | (2 - 2));
++	OUT_BATCH(sna->render_state.gen9.wm_state + state * 2 * sizeof(struct gen9_sampler_state)); /* state indexes pairs of sampler states from wm_state */
++}
++
++static void
++gen9_emit_sf(struct sna *sna, bool has_mask)
++{
++	int num_sf_outputs = has_mask ? 2 : 1; /* two outputs with a mask, one without */
++
++	if (sna->render_state.gen9.num_sf_outputs == num_sf_outputs) /* unchanged since last emit */
++		return;
++
++	DBG(("%s: num_sf_outputs=%d\n", __FUNCTION__, num_sf_outputs));
++
++	sna->render_state.gen9.num_sf_outputs = num_sf_outputs;
++
++	OUT_BATCH(GEN9_3DSTATE_SBE | (6 - 2));
++	OUT_BATCH(num_sf_outputs << SBE_NUM_OUTPUTS_SHIFT |
++		  SBE_FORCE_VERTEX_URB_READ_LENGTH | /* forced is faster */
++		  SBE_FORCE_VERTEX_URB_READ_OFFSET |
++		  1 << SBE_URB_ENTRY_READ_LENGTH_SHIFT |
++		  1 << SBE_URB_ENTRY_READ_OFFSET_SHIFT);
++	OUT_BATCH(0);
++	OUT_BATCH(0);
++        OUT_BATCH(SBE_ACTIVE_COMPONENT_XYZW << 0 |
++		  SBE_ACTIVE_COMPONENT_XYZW << 1); /* NOTE(review): if the per-attribute field is wider than 1 bit these two terms overlap; confirm the shift */
++        OUT_BATCH(0);
++}
++
++static void
++gen9_emit_wm(struct sna *sna, int kernel)
++{
++	const uint32_t *kernels;
++
++	assert(kernel < ARRAY_SIZE(wm_kernels));
++	if (sna->render_state.gen9.kernel == kernel) /* same kernel as last emitted: nothing to do */
++		return;
++
++	sna->render_state.gen9.kernel = kernel;
++	kernels = sna->render_state.gen9.wm_kernel[kernel]; /* three entries per kernel, labelled 8/16/32-wide in the DBG below; zero means absent */
++
++	DBG(("%s: switching to %s, num_surfaces=%d (8-wide? %d, 16-wide? %d, 32-wide? %d)\n",
++	     __FUNCTION__,
++	     wm_kernels[kernel].name,
++	     wm_kernels[kernel].num_surfaces,
++	     kernels[0], kernels[1], kernels[2]));
++	assert(is_aligned(kernels[0], 64));
++	assert(is_aligned(kernels[1], 64));
++	assert(is_aligned(kernels[2], 64));
++
++	OUT_BATCH(GEN9_3DSTATE_PS | (12 - 2));
++	OUT_BATCH64(kernels[0] ?: kernels[1] ?: kernels[2]); /* first slot takes the first non-zero kernel (GNU ?: extension) */
++	OUT_BATCH(1 << PS_SAMPLER_COUNT_SHIFT |
++		  PS_VECTOR_MASK_ENABLE |
++		  wm_kernels[kernel].num_surfaces << PS_BINDING_TABLE_ENTRY_COUNT_SHIFT);
++	OUT_BATCH64(0); /* scratch address */
++	OUT_BATCH(PS_MAX_THREADS |
++		  (kernels[0] ? PS_8_DISPATCH_ENABLE : 0) | /* enable a dispatch width only if its kernel exists */
++		  (kernels[1] ? PS_16_DISPATCH_ENABLE : 0) |
++		  (kernels[2] ? PS_32_DISPATCH_ENABLE : 0));
++	OUT_BATCH((kernels[0] ? 4 : kernels[1] ? 6 : 8) << PS_DISPATCH_START_GRF_SHIFT_0 | /* start GRF follows whichever kernel landed in the first slot */
++		  8 << PS_DISPATCH_START_GRF_SHIFT_1 |
++		  6 << PS_DISPATCH_START_GRF_SHIFT_2);
++	OUT_BATCH64(kernels[2]); /* note the order: kernels[2] is written before kernels[1] */
++	OUT_BATCH64(kernels[1]);
++}
++
++static bool
++gen9_emit_binding_table(struct sna *sna, uint16_t offset)
++{
++	if (sna->render_state.gen9.surface_table == offset) /* unchanged: return false without emitting */
++		return false;
++
++	/* Binding table pointers */
++	assert(is_aligned(4*offset, 32));
++	OUT_BATCH(GEN9_3DSTATE_BINDING_TABLE_POINTERS_PS | (2 - 2));
++	OUT_BATCH(offset*4); /* offset is multiplied by 4 for the packet */
++
++	sna->render_state.gen9.surface_table = offset;
++	return true; /* true means a new pointer was emitted */
++}
++
++static bool
++gen9_emit_drawing_rectangle(struct sna *sna,
++			    const struct sna_composite_op *op)
++{
++	uint32_t limit = (op->dst.height - 1) << 16 | (op->dst.width - 1); /* height-1 in the high half, width-1 in the low half */
++	uint32_t offset = (uint16_t)op->dst.y << 16 | (uint16_t)op->dst.x; /* y in the high half, x in the low half, each truncated to 16 bits */
++
++	assert(!too_large(abs(op->dst.x), abs(op->dst.y)));
++	assert(!too_large(op->dst.width, op->dst.height));
++
++	if (sna->render_state.gen9.drawrect_limit == limit &&
++	    sna->render_state.gen9.drawrect_offset == offset)
++		return true; /* true means unchanged, no packet emitted */
++
++	sna->render_state.gen9.drawrect_offset = offset;
++	sna->render_state.gen9.drawrect_limit = limit;
++
++	OUT_BATCH(GEN9_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
++	OUT_BATCH(0);
++	OUT_BATCH(limit);
++	OUT_BATCH(offset);
++	return false; /* false means a new rectangle was emitted */
++}
++
++static void
++gen9_emit_vertex_elements(struct sna *sna,
++			  const struct sna_composite_op *op)
++{
++	/*
++	 * vertex data in vertex buffer
++	 *    position: (x, y)
++	 *    texture coordinate 0: (u0, v0) if (is_affine is true) else (u0, v0, w0)
++	 *    texture coordinate 1 if (has_mask is true): same as above
++	 */
++	struct gen9_render_state *render = &sna->render_state.gen9;
++	uint32_t src_format, dw;
++	int id = GEN9_VERTEX(op->u.gen9.flags); /* low 2 bits: first channel layout; bits above: second (mask) channel */
++	bool has_mask;
++
++	DBG(("%s: setup id=%d\n", __FUNCTION__, id));
++
++	if (render->ve_id == id) /* same layout as last emitted: nothing to do */
++		return;
++	render->ve_id = id;
++
++	if (render->ve_dirty) { /* set at the end of this function, cleared when a real primitive is emitted */
++		/* dummy primitive to flush vertex before change? */
++		OUT_BATCH(GEN9_3DPRIMITIVE | (7 - 2));
++		OUT_BATCH(0); /* ignored, see VF_TOPOLOGY */
++		OUT_BATCH(0);
++		OUT_BATCH(0);
++		OUT_BATCH(1);	/* single instance */
++		OUT_BATCH(0);	/* start instance location */
++		OUT_BATCH(0);	/* index buffer offset, ignored */
++	}
++
++	/* The VUE layout
++	 *    dword 0-3: pad (0.0, 0.0, 0.0, 0.0)
++	 *    dword 4-7: position (x, y, 1.0, 1.0),
++	 *    dword 8-11: texture coordinate 0 (u0, v0, w0, 1.0)
++	 *    dword 12-15: texture coordinate 1 (u1, v1, w1, 1.0)
++	 *
++	 * dword 4-15 are fetched from vertex buffer
++	 */
++	has_mask = (id >> 2) != 0;
++	OUT_BATCH(GEN9_3DSTATE_VERTEX_ELEMENTS |
++		((2 * (3 + has_mask)) + 1 - 2)); /* two dwords per element: pad, position, channel 0 and optionally channel 1 */
++
++	OUT_BATCH(id << VE_INDEX_SHIFT | VE_VALID | /* pad element: all four components stored as 0 */
++		  SURFACEFORMAT_R32G32B32A32_FLOAT << VE_FORMAT_SHIFT |
++		  0 << VE_OFFSET_SHIFT);
++	OUT_BATCH(COMPONENT_STORE_0 << VE_COMPONENT_0_SHIFT |
++		  COMPONENT_STORE_0 << VE_COMPONENT_1_SHIFT |
++		  COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT |
++		  COMPONENT_STORE_0 << VE_COMPONENT_3_SHIFT);
++
++	/* x,y */
++	OUT_BATCH(id << VE_INDEX_SHIFT | VE_VALID |
++		  SURFACEFORMAT_R16G16_SSCALED << VE_FORMAT_SHIFT |
++		  0 << VE_OFFSET_SHIFT);
++	OUT_BATCH(COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT |
++		  COMPONENT_STORE_SRC << VE_COMPONENT_1_SHIFT |
++		  COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT |
++		  COMPONENT_STORE_1_FLT << VE_COMPONENT_3_SHIFT);
++
++	/* u0, v0, w0 */
++	DBG(("%s: first channel %d floats, offset=4\n", __FUNCTION__, id & 3));
++	dw = COMPONENT_STORE_1_FLT << VE_COMPONENT_3_SHIFT; /* fourth component is always 1.0 */
++	switch (id & 3) {
++	default:
++		assert(0);
++	case 0: /* no floats: a second pair of 16-bit scaled integers */
++		src_format = SURFACEFORMAT_R16G16_SSCALED;
++		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
++		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_1_SHIFT;
++		dw |= COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT;
++		break;
++	case 1:
++		src_format = SURFACEFORMAT_R32_FLOAT;
++		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
++		dw |= COMPONENT_STORE_0 << VE_COMPONENT_1_SHIFT;
++		dw |= COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT;
++		break;
++	case 2:
++		src_format = SURFACEFORMAT_R32G32_FLOAT;
++		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
++		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_1_SHIFT;
++		dw |= COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT;
++		break;
++	case 3:
++		src_format = SURFACEFORMAT_R32G32B32_FLOAT;
++		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
++		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_1_SHIFT;
++		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_2_SHIFT;
++		break;
++	}
++	OUT_BATCH(id << VE_INDEX_SHIFT | VE_VALID |
++		  src_format << VE_FORMAT_SHIFT |
++		  4 << VE_OFFSET_SHIFT); /* first channel starts 4 bytes in, right after the x,y pair */
++	OUT_BATCH(dw);
++
++	/* u1, v1, w1 */
++	if (has_mask) {
++		unsigned offset = 4 + ((id & 3) ?: 1) * sizeof(float); /* skip x,y plus the first channel (4 bytes when id & 3 is 0) */
++		DBG(("%s: second channel %d floats, offset=%d\n", __FUNCTION__, (id >> 2) & 3, offset));
++		dw = COMPONENT_STORE_1_FLT << VE_COMPONENT_3_SHIFT;
++		switch (id >> 2) {
++		case 1:
++			src_format = SURFACEFORMAT_R32_FLOAT;
++			dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
++			dw |= COMPONENT_STORE_0 << VE_COMPONENT_1_SHIFT;
++			dw |= COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT;
++			break;
++		default:
++			assert(0); /* with asserts compiled out, falls through to the two-float layout */
++		case 2:
++			src_format = SURFACEFORMAT_R32G32_FLOAT;
++			dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
++			dw |= COMPONENT_STORE_SRC << VE_COMPONENT_1_SHIFT;
++			dw |= COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT;
++			break;
++		case 3:
++			src_format = SURFACEFORMAT_R32G32B32_FLOAT;
++			dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
++			dw |= COMPONENT_STORE_SRC << VE_COMPONENT_1_SHIFT;
++			dw |= COMPONENT_STORE_SRC << VE_COMPONENT_2_SHIFT;
++			break;
++		}
++		OUT_BATCH(id << VE_INDEX_SHIFT | VE_VALID |
++			  src_format << VE_FORMAT_SHIFT |
++			  offset << VE_OFFSET_SHIFT);
++		OUT_BATCH(dw);
++	}
++
++	render->ve_dirty = true; /* a later layout change will first emit the dummy primitive above */
++}
++
++inline static void
++gen9_emit_pipe_invalidate(struct sna *sna)
++{
++	OUT_BATCH(GEN9_PIPE_CONTROL | (6 - 2)); /* 6-dword PIPE_CONTROL: flags plus two zero qwords */
++	OUT_BATCH(PIPE_CONTROL_WC_FLUSH |
++		  PIPE_CONTROL_TC_FLUSH |
++		  PIPE_CONTROL_CS_STALL);
++	OUT_BATCH64(0);
++	OUT_BATCH64(0);
++}
++
++/*
++ * Emit a PIPE_CONTROL carrying PIPE_CONTROL_WC_FLUSH; when need_stall is set
++ * the CS-stall and stall-at-scoreboard flags are added to the same packet.
++ */
++inline static void
++gen9_emit_pipe_flush(struct sna *sna, bool need_stall)
++{
++	const unsigned stall_bits = need_stall ?
++		(PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD) : 0;
++
++	OUT_BATCH(GEN9_PIPE_CONTROL | (6 - 2));
++	OUT_BATCH(PIPE_CONTROL_WC_FLUSH | stall_bits);
++	OUT_BATCH64(0);
++	OUT_BATCH64(0);
++}
++
++inline static void
++gen9_emit_pipe_stall(struct sna *sna)
++{
++	OUT_BATCH(GEN9_PIPE_CONTROL | (6 - 2)); /* 6-dword PIPE_CONTROL: flags plus two zero qwords */
++	OUT_BATCH(PIPE_CONTROL_CS_STALL |
++		  PIPE_CONTROL_FLUSH |
++		  PIPE_CONTROL_STALL_AT_SCOREBOARD);
++	OUT_BATCH64(0);
++	OUT_BATCH64(0);
++}
++
++static void
++gen9_emit_state(struct sna *sna,
++		const struct sna_composite_op *op,
++		uint16_t wm_binding_table)
++{
++	bool need_invalidate;
++	bool need_flush;
++	bool need_stall;
++
++	assert(op->dst.bo->exec);
++
++	need_flush = wm_binding_table & 1 || /* bit 0 of wm_binding_table is a flush request from the caller */
++		(sna->render_state.gen9.emit_flush && GEN9_READS_DST(op->u.gen9.flags));
++	if (ALWAYS_FLUSH)
++		need_flush = true;
++
++	wm_binding_table &= ~1; /* strip the flag so only the table offset remains */
++
++	need_stall = sna->render_state.gen9.surface_table != wm_binding_table;
++
++	need_invalidate = kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo);
++	if (ALWAYS_INVALIDATE)
++		need_invalidate = true;
++
++	need_stall &= gen9_emit_drawing_rectangle(sna, op); /* returns false when it emitted a new rectangle, which cancels the stall */
++	if (ALWAYS_STALL)
++		need_stall = true;
++
++	if (need_invalidate) { /* an invalidate replaces both the flush and the stall */
++		gen9_emit_pipe_invalidate(sna);
++		kgem_clear_dirty(&sna->kgem);
++		assert(op->dst.bo->exec);
++		kgem_bo_mark_dirty(op->dst.bo);
++
++		need_flush = false;
++		need_stall = false;
++	}
++	if (need_flush) { /* the flush packet carries the stall bits when one is needed */
++		gen9_emit_pipe_flush(sna, need_stall);
++		need_stall = false;
++	}
++	if (need_stall)
++		gen9_emit_pipe_stall(sna);
++
++	gen9_emit_cc(sna, GEN9_BLEND(op->u.gen9.flags)); /* each emitter below skips its packet when its cached state matches */
++	gen9_emit_sampler(sna, GEN9_SAMPLER(op->u.gen9.flags));
++	gen9_emit_sf(sna, GEN9_VERTEX(op->u.gen9.flags) >> 2);
++	gen9_emit_wm(sna, GEN9_KERNEL(op->u.gen9.flags));
++	gen9_emit_vertex_elements(sna, op);
++	gen9_emit_binding_table(sna, wm_binding_table);
++
++	sna->render_state.gen9.emit_flush = GEN9_READS_DST(op->u.gen9.flags); /* consulted by the next call's need_flush */
++}
++
++static bool gen9_magic_ca_pass(struct sna *sna,
++			       const struct sna_composite_op *op)
++{
++	struct gen9_render_state *state = &sna->render_state.gen9;
++
++	if (!op->need_magic_ca_pass) /* returns false without emitting anything */
++		return false;
++
++	DBG(("%s: CA fixup (%d -> %d)\n", __FUNCTION__,
++	     sna->render.vertex_start, sna->render.vertex_index));
++
++	gen9_emit_pipe_stall(sna);
++
++	gen9_emit_cc(sna, /* switch blend and kernel to the PictOpAdd component-alpha variants */
++		     GEN9_BLEND(gen9_get_blend(PictOpAdd, true,
++					       op->dst.format)));
++	gen9_emit_wm(sna,
++		     gen9_choose_composite_kernel(PictOpAdd,
++						  true, true,
++						  op->is_affine));
++
++	OUT_BATCH(GEN9_3DPRIMITIVE | (7 - 2)); /* draw the vertices from vertex_start to vertex_index a second time */
++	OUT_BATCH(0); /* ignored, see VF_TOPOLOGY */
++	OUT_BATCH(sna->render.vertex_index - sna->render.vertex_start);
++	OUT_BATCH(sna->render.vertex_start);
++	OUT_BATCH(1);	/* single instance */
++	OUT_BATCH(0);	/* start instance location */
++	OUT_BATCH(0);	/* index buffer offset, ignored */
++
++	state->last_primitive = sna->kgem.nbatch;
++	state->ve_dirty = false;
++	return true; /* true means the extra pass was emitted; blend and kernel state are left changed */
++}
++
++static void null_create(struct sna_static_stream *stream)
++{
++	/* A bunch of zeros useful for legacy border color and depth-stencil; the returned mapping is not used here */
++	sna_static_stream_map(stream, 64, 64);
++}
++
++/*
++ * Program one sampler state entry: the min/mag filter from @filter and the
++ * same wrap mode on all three coordinates from @extend.  Unrecognised
++ * filters are set up as nearest and unrecognised extend modes as
++ * clamp-to-border, exactly like SAMPLER_FILTER_NEAREST/SAMPLER_EXTEND_NONE.
++ */
++static void
++sampler_state_init(struct gen9_sampler_state *sampler_state,
++		   sampler_filter_t filter,
++		   sampler_extend_t extend)
++{
++	unsigned map_filter, wrap_mode;
++
++	COMPILE_TIME_ASSERT(sizeof(*sampler_state) == 4*sizeof(uint32_t));
++
++	/* only bilinear selects linear filtering */
++	map_filter = MAPFILTER_NEAREST;
++	if (filter == SAMPLER_FILTER_BILINEAR)
++		map_filter = MAPFILTER_LINEAR;
++
++	/* XXX bicubic filter using MAPFILTER_FLEXIBLE */
++
++	switch (extend) {
++	case SAMPLER_EXTEND_REPEAT:
++		wrap_mode = TEXCOORDMODE_WRAP;
++		break;
++	case SAMPLER_EXTEND_PAD:
++		wrap_mode = TEXCOORDMODE_CLAMP;
++		break;
++	case SAMPLER_EXTEND_REFLECT:
++		wrap_mode = TEXCOORDMODE_MIRROR;
++		break;
++	case SAMPLER_EXTEND_NONE:
++	default:
++		wrap_mode = TEXCOORDMODE_CLAMP_BORDER;
++		break;
++	}
++
++	sampler_state->ss0.lod_preclamp = 2;	/* GL mode */
++	sampler_state->ss0.default_color_mode = 1;
++	sampler_state->ss0.min_filter = map_filter;
++	sampler_state->ss0.mag_filter = map_filter;
++
++	/* r, s and t always share one wrap mode */
++	sampler_state->ss3.r_wrap_mode = wrap_mode;
++	sampler_state->ss3.s_wrap_mode = wrap_mode;
++	sampler_state->ss3.t_wrap_mode = wrap_mode;
++}
++
++static void
++sampler_copy_init(struct gen9_sampler_state *ss)
++{
++	/* two nearest, non-repeating samplers; only the first uses non-normalized coordinates */
++	sampler_state_init(&ss[0], SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE);
++	sampler_state_init(&ss[1], SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE);
++	ss[0].ss3.non_normalized_coord = 1;
++}
++
++static void
++sampler_fill_init(struct gen9_sampler_state *ss)
++{
++	/* first sampler repeats and uses non-normalized coordinates; the second is plain nearest */
++	sampler_state_init(&ss[0], SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_REPEAT);
++	sampler_state_init(&ss[1], SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE);
++	ss[0].ss3.non_normalized_coord = 1;
++}
++
++static uint32_t
++gen9_tiling_bits(uint32_t tiling)
++{
++	switch (tiling) { /* translate an I915_TILING_* mode into surface state tiling bits */
++	case I915_TILING_Y: return SURFACE_TILED | SURFACE_TILED_Y;
++	case I915_TILING_X: return SURFACE_TILED;
++	default: assert(0); /* fallthrough: unknown modes get no tiling bits */
++	case I915_TILING_NONE: return 0;
++	}
++}
++
++#define MOCS_PTE (1 << 1) /* gen9_bind_bo() uses this for scanout or uncached destination surfaces */
++#define MOCS_WB (2 << 1) /* gen9_bind_bo() uses this for every other surface */
++
++/**
++ * Returns the byte offset of a surface state entry describing @bo as a 2D
++ * surface of @width x @height in @format.  If kgem_bo_get_binding() already
++ * has a non-zero offset for the format/is_dst/scanout key it is returned
++ * as-is (marking the bo dirty when @is_dst); otherwise SURFACE_DW dwords are
++ * taken below sna->kgem.surface, filled in, and recorded on the bo.  @is_dst
++ * selects render read/write domains instead of the sampler domain.
++ */
++static uint32_t
++gen9_bind_bo(struct sna *sna,
++	     struct kgem_bo *bo,
++	     uint32_t width,
++	     uint32_t height,
++	     uint32_t format,
++	     bool is_dst)
++{
++	uint32_t *ss;
++	uint32_t domains;
++	int offset;
++	uint32_t is_scanout = is_dst && bo->scanout;
++
++	/* After the first bind, we manage the cache domains within the batch */
++	offset = kgem_bo_get_binding(bo, format | is_dst << 30 | is_scanout << 31);
++	if (offset) {
++		if (is_dst)
++			kgem_bo_mark_dirty(bo);
++		assert(offset >= sna->kgem.surface);
++		return offset * sizeof(uint32_t);
++	}
++
++	/* surface state grows downwards from the end of the batch */
++	offset = sna->kgem.surface -= SURFACE_DW;
++	ss = sna->kgem.batch + offset;
++	ss[0] = (SURFACE_2D << SURFACE_TYPE_SHIFT |
++		 gen9_tiling_bits(bo->tiling) |
++		 format << SURFACE_FORMAT_SHIFT |
++		 SURFACE_VALIGN_4 | SURFACE_HALIGN_4);
++	if (is_dst) {
++		ss[0] |= SURFACE_RC_READ_WRITE;
++		domains = I915_GEM_DOMAIN_RENDER << 16 |I915_GEM_DOMAIN_RENDER;
++	} else
++		domains = I915_GEM_DOMAIN_SAMPLER << 16;
++	ss[1] = (is_scanout || (is_dst && is_uncached(sna, bo))) ? MOCS_PTE << 24 : MOCS_WB << 24;
++	ss[2] = ((width - 1)  << SURFACE_WIDTH_SHIFT |
++		 (height - 1) << SURFACE_HEIGHT_SHIFT);
++	ss[3] = (bo->pitch - 1) << SURFACE_PITCH_SHIFT;
++	ss[4] = ss[5] = ss[6] = 0;
++	ss[7] = SURFACE_SWIZZLE(RED, GREEN, BLUE, ALPHA);
++	/* dwords 8-9 hold the 64-bit bo address supplied by the relocation */
++	*(uint64_t *)(ss+8) = kgem_add_reloc64(&sna->kgem, offset + 8, bo, domains, 0);
++	ss[10] = ss[11] = ss[12] = ss[13] = ss[14] = ss[15] = 0;
++
++	kgem_bo_set_binding(bo, format | is_dst << 30 | is_scanout << 31, offset);
++
++	/* uint64_t is not 'long' on 32-bit builds, so print it as unsigned long long */
++	DBG(("[%x] bind bo(handle=%d, addr=%llx), format=%d, width=%d, height=%d, pitch=%d, tiling=%d -> %s\n",
++	     offset, bo->handle, (unsigned long long)*(uint64_t *)(ss+8),
++	     format, width, height, bo->pitch, bo->tiling,
++	     domains & 0xffff ? "render" : "sampler"));
++
++	return offset * sizeof(uint32_t);
++}
++
++static void gen9_emit_vertex_buffer(struct sna *sna,
++				    const struct sna_composite_op *op)
++{
++	int id = GEN9_VERTEX(op->u.gen9.flags); /* the vertex layout id doubles as the vertex buffer index */
++
++	OUT_BATCH(GEN9_3DSTATE_VERTEX_BUFFERS | (5 - 2));
++	OUT_BATCH(id << VB_INDEX_SHIFT | VB_MODIFY_ENABLE |
++		  4*op->floats_per_vertex); /* 4 bytes per float in a vertex */
++	sna->render.vertex_reloc[sna->render.nvertex_reloc++] = sna->kgem.nbatch; /* remember where the address goes; written as 0 for now */
++	OUT_BATCH64(0);
++	OUT_BATCH(~0); /* buffer size: disabled */
++
++	sna->render.vb_id |= 1 << id; /* mark this layout's buffer as emitted */
++}
++
++static void gen9_emit_primitive(struct sna *sna)
++{
++	if (sna->kgem.nbatch == sna->render_state.gen9.last_primitive) { /* nothing emitted since the last 3DPRIMITIVE: reuse it */
++		sna->render.vertex_offset = sna->kgem.nbatch - 5; /* its vertex count dword sits 5 dwords before the end */
++		return;
++	}
++
++	OUT_BATCH(GEN9_3DPRIMITIVE | (7 - 2));
++	OUT_BATCH(0); /* ignored, see VF_TOPOLOGY */
++	sna->render.vertex_offset = sna->kgem.nbatch; /* batch index of the vertex count dword emitted next */
++	OUT_BATCH(0);	/* vertex count, to be filled in later */
++	OUT_BATCH(sna->render.vertex_index);
++	OUT_BATCH(1);	/* single instance */
++	OUT_BATCH(0);	/* start instance location */
++	OUT_BATCH(0);	/* index buffer offset, ignored */
++	sna->render.vertex_start = sna->render.vertex_index;
++
++	sna->render_state.gen9.last_primitive = sna->kgem.nbatch;
++	sna->render_state.gen9.ve_dirty = false;
++}
++
++static bool gen9_rectangle_begin(struct sna *sna,
++				 const struct sna_composite_op *op)
++{
++	int id = 1 << GEN9_VERTEX(op->u.gen9.flags); /* bit in vb_id for this op's vertex layout */
++	int ndwords;
++
++	if (sna_vertex_wait__locked(&sna->render) && sna->render.vertex_offset) /* a primitive is already open after the wait: nothing to emit */
++		return true;
++
++	ndwords = op->need_magic_ca_pass ? 60 : 6; /* batch space to reserve; much more when the CA fixup pass may follow */
++	if ((sna->render.vb_id & id) == 0)
++		ndwords += 5; /* plus the 5-dword vertex buffer packet */
++	if (!kgem_check_batch(&sna->kgem, ndwords))
++		return false; /* false means not enough batch space */
++
++	if ((sna->render.vb_id & id) == 0)
++		gen9_emit_vertex_buffer(sna, op);
++
++	gen9_emit_primitive(sna);
++	return true;
++}
++
++static int gen9_get_rectangles__flush(struct sna *sna,
++				      const struct sna_composite_op *op)
++{
++	/* Prevent discarding the new vbo after lock contention */
++	if (sna_vertex_wait__locked(&sna->render)) {
++		int rem = vertex_space(sna);
++		if (rem > op->floats_per_rect)
++			return rem;
++	}
++
++	if (!kgem_check_batch(&sna->kgem, op->need_magic_ca_pass ? 65 : 6))
++		return 0;
++	if (!kgem_check_reloc_and_exec(&sna->kgem, 2))
++		return 0;
++
++	if (sna->render.vertex_offset) {
++		gen8_vertex_flush(sna);
++		if (gen9_magic_ca_pass(sna, op)) {
++			gen9_emit_pipe_invalidate(sna);
++			gen9_emit_cc(sna, GEN9_BLEND(op->u.gen9.flags));
++			gen9_emit_wm(sna, GEN9_KERNEL(op->u.gen9.flags));
++		}
++	}
++
++	return gen8_vertex_finish(sna);
++}
++
++inline static int gen9_get_rectangles(struct sna *sna,
++				      const struct sna_composite_op *op,
++				      int want,
++				      void (*emit_state)(struct sna *sna, const struct sna_composite_op *op))
++{
++	int rem;
++
++	assert(want);
++
++start:
++	rem = vertex_space(sna);
++	if (unlikely(rem < op->floats_per_rect)) {
++		DBG(("flushing vbo for %s: %d < %d\n",
++		     __FUNCTION__, rem, op->floats_per_rect));
++		rem = gen9_get_rectangles__flush(sna, op);
++		if (unlikely(rem == 0))
++			goto flush;
++	}
++
++	if (unlikely(sna->render.vertex_offset == 0)) {
++		if (!gen9_rectangle_begin(sna, op))
++			goto flush;
++		else
++			goto start;
++	}
++
++	assert(rem <= vertex_space(sna));
++	assert(op->floats_per_rect <= rem);
++	if (want > 1 && want * op->floats_per_rect > rem)
++		want = rem / op->floats_per_rect;
++
++	assert(want > 0);
++	sna->render.vertex_index += 3*want;
++	return want;
++
++flush:
++	if (sna->render.vertex_offset) {
++		gen8_vertex_flush(sna);
++		gen9_magic_ca_pass(sna, op);
++	}
++	sna_vertex_wait__locked(&sna->render);
++	_kgem_submit(&sna->kgem);
++	emit_state(sna, op);
++	goto start;
++}
++
++inline static uint32_t *gen9_composite_get_binding_table(struct sna *sna,
++							 uint16_t *offset)
++{
++	uint32_t *table;
++
++	assert(sna->kgem.surface <= 16384);
++	sna->kgem.surface -= SURFACE_DW;
++	/* Clear all surplus entries to zero in case of prefetch */
++	table = memset(sna->kgem.batch + sna->kgem.surface, 0, 64);
++
++	DBG(("%s(%x)\n", __FUNCTION__, 4*sna->kgem.surface));
++
++	*offset = sna->kgem.surface;
++	return table;
++}
++
++static void
++gen9_get_batch(struct sna *sna, const struct sna_composite_op *op)
++{
++	kgem_set_mode(&sna->kgem, KGEM_RENDER, op->dst.bo);
++
++	if (!kgem_check_batch_with_surfaces(&sna->kgem, 150, 2*(1+3))) {
++		DBG(("%s: flushing batch: %d < %d+%d\n",
++		     __FUNCTION__, sna->kgem.surface - sna->kgem.nbatch,
++		     150, 4*8*2));
++		_kgem_submit(&sna->kgem);
++		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
++	}
++
++	assert(sna->kgem.mode == KGEM_RENDER);
++	assert(sna->kgem.ring == KGEM_RENDER);
++
++	if (sna->render_state.gen9.needs_invariant)
++		gen9_emit_invariant(sna);
++}
++
++static void gen9_emit_composite_state(struct sna *sna,
++				      const struct sna_composite_op *op)
++{
++	uint32_t *binding_table;
++	uint16_t offset, dirty;
++
++	gen9_get_batch(sna, op);
++
++	binding_table = gen9_composite_get_binding_table(sna, &offset);
++
++	dirty = kgem_bo_is_dirty(op->dst.bo);
++
++	binding_table[0] =
++		gen9_bind_bo(sna,
++			    op->dst.bo, op->dst.width, op->dst.height,
++			    gen9_get_dest_format(op->dst.format),
++			    true);
++	binding_table[1] =
++		gen9_bind_bo(sna,
++			     op->src.bo, op->src.width, op->src.height,
++			     op->src.card_format,
++			     false);
++	if (op->mask.bo) {
++		binding_table[2] =
++			gen9_bind_bo(sna,
++				     op->mask.bo,
++				     op->mask.width,
++				     op->mask.height,
++				     op->mask.card_format,
++				     false);
++	}
++
++	if (sna->kgem.surface == offset &&
++	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen9.surface_table) == *(uint64_t*)binding_table &&
++	    (op->mask.bo == NULL ||
++	     sna->kgem.batch[sna->render_state.gen9.surface_table+2] == binding_table[2])) {
++		sna->kgem.surface += SURFACE_DW;
++		offset = sna->render_state.gen9.surface_table;
++	}
++
++	if (sna->kgem.batch[sna->render_state.gen9.surface_table] == binding_table[0])
++		dirty = 0;
++
++	gen9_emit_state(sna, op, offset | dirty);
++}
++
++static void
++gen9_align_vertex(struct sna *sna, const struct sna_composite_op *op)
++{
++	if (op->floats_per_vertex != sna->render_state.gen9.floats_per_vertex) {
++		DBG(("aligning vertex: was %d, now %d floats per vertex\n",
++		     sna->render_state.gen9.floats_per_vertex, op->floats_per_vertex));
++		gen8_vertex_align(sna, op);
++		sna->render_state.gen9.floats_per_vertex = op->floats_per_vertex;
++	}
++}
++
++fastcall static void
++gen9_render_composite_blt(struct sna *sna,
++			  const struct sna_composite_op *op,
++			  const struct sna_composite_rectangles *r)
++{
++	gen9_get_rectangles(sna, op, 1, gen9_emit_composite_state);
++	op->prim_emit(sna, op, r);
++}
++
++fastcall static void
++gen9_render_composite_box(struct sna *sna,
++			  const struct sna_composite_op *op,
++			  const BoxRec *box)
++{
++	struct sna_composite_rectangles r;
++
++	gen9_get_rectangles(sna, op, 1, gen9_emit_composite_state);
++
++	DBG(("  %s: (%d, %d), (%d, %d)\n",
++	     __FUNCTION__,
++	     box->x1, box->y1, box->x2, box->y2));
++
++	r.dst.x = box->x1;
++	r.dst.y = box->y1;
++	r.width  = box->x2 - box->x1;
++	r.height = box->y2 - box->y1;
++	r.src = r.mask = r.dst;
++
++	op->prim_emit(sna, op, &r);
++}
++
++static void
++gen9_render_composite_boxes__blt(struct sna *sna,
++				 const struct sna_composite_op *op,
++				 const BoxRec *box, int nbox)
++{
++	DBG(("composite_boxes(%d)\n", nbox));
++
++	do {
++		int nbox_this_time;
++
++		nbox_this_time = gen9_get_rectangles(sna, op, nbox,
++						     gen9_emit_composite_state);
++		nbox -= nbox_this_time;
++
++		do {
++			struct sna_composite_rectangles r;
++
++			DBG(("  %s: (%d, %d), (%d, %d)\n",
++			     __FUNCTION__,
++			     box->x1, box->y1, box->x2, box->y2));
++
++			r.dst.x = box->x1;
++			r.dst.y = box->y1;
++			r.width  = box->x2 - box->x1;
++			r.height = box->y2 - box->y1;
++			r.src = r.mask = r.dst;
++
++			op->prim_emit(sna, op, &r);
++			box++;
++		} while (--nbox_this_time);
++	} while (nbox);
++}
++
++static void
++gen9_render_composite_boxes(struct sna *sna,
++			    const struct sna_composite_op *op,
++			    const BoxRec *box, int nbox)
++{
++	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
++
++	do {
++		int nbox_this_time;
++		float *v;
++
++		nbox_this_time = gen9_get_rectangles(sna, op, nbox,
++						     gen9_emit_composite_state);
++		assert(nbox_this_time);
++		nbox -= nbox_this_time;
++
++		v = sna->render.vertices + sna->render.vertex_used;
++		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
++
++		op->emit_boxes(op, box, nbox_this_time, v);
++		box += nbox_this_time;
++	} while (nbox);
++}
++
++static void
++gen9_render_composite_boxes__thread(struct sna *sna,
++				    const struct sna_composite_op *op,
++				    const BoxRec *box, int nbox)
++{
++	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
++
++	sna_vertex_lock(&sna->render);
++	do {
++		int nbox_this_time;
++		float *v;
++
++		nbox_this_time = gen9_get_rectangles(sna, op, nbox,
++						     gen9_emit_composite_state);
++		assert(nbox_this_time);
++		nbox -= nbox_this_time;
++
++		v = sna->render.vertices + sna->render.vertex_used;
++		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
++
++		sna_vertex_acquire__locked(&sna->render);
++		sna_vertex_unlock(&sna->render);
++
++		op->emit_boxes(op, box, nbox_this_time, v);
++		box += nbox_this_time;
++
++		sna_vertex_lock(&sna->render);
++		sna_vertex_release__locked(&sna->render);
++	} while (nbox);
++	sna_vertex_unlock(&sna->render);
++}
++
++static uint32_t
++gen9_create_blend_state(struct sna_static_stream *stream)
++{
++	char *base, *ptr;
++	int src, dst;
++
++	COMPILE_TIME_ASSERT(((GEN9_BLENDFACTOR_COUNT * GEN9_BLENDFACTOR_COUNT << 4) & (1 << 15)) == 0);
++
++	base = sna_static_stream_map(stream,
++				     GEN9_BLENDFACTOR_COUNT * GEN9_BLENDFACTOR_COUNT * GEN9_BLEND_STATE_PADDED_SIZE,
++				     64);
++
++	ptr = base;
++	for (src = 0; src < GEN9_BLENDFACTOR_COUNT; src++) {
++		for (dst = 0; dst < GEN9_BLENDFACTOR_COUNT; dst++) {
++			struct gen9_blend_state *blend =
++				(struct gen9_blend_state *)ptr;
++
++			assert(((ptr - base) & 63) == 0);
++			COMPILE_TIME_ASSERT(sizeof(blend->common) == 4);
++			COMPILE_TIME_ASSERT(sizeof(blend->rt) == 8);
++			COMPILE_TIME_ASSERT((char *)&blend->rt - (char *)blend == 4);
++
++			blend->rt.post_blend_clamp = 1;
++			blend->rt.pre_blend_clamp = 1;
++
++			blend->rt.color_blend =
++				!(dst == BLENDFACTOR_ZERO && src == BLENDFACTOR_ONE);
++			blend->rt.dest_blend_factor = dst;
++			blend->rt.source_blend_factor = src;
++			blend->rt.color_blend_function = BLENDFUNCTION_ADD;
++
++			blend->rt.dest_alpha_blend_factor = dst;
++			blend->rt.source_alpha_blend_factor = src;
++			blend->rt.alpha_blend_function = BLENDFUNCTION_ADD;
++
++			ptr += GEN9_BLEND_STATE_PADDED_SIZE;
++		}
++	}
++
++	return sna_static_stream_offsetof(stream, base);
++}
++
++static int
++gen9_composite_picture(struct sna *sna,
++		       PicturePtr picture,
++		       struct sna_composite_channel *channel,
++		       int x, int y,
++		       int w, int h,
++		       int dst_x, int dst_y,
++		       bool precise)
++{
++	PixmapPtr pixmap;
++	uint32_t color;
++	int16_t dx, dy;
++
++	DBG(("%s: (%d, %d)x(%d, %d), dst=(%d, %d)\n",
++	     __FUNCTION__, x, y, w, h, dst_x, dst_y));
++
++	channel->is_solid = false;
++	channel->card_format = -1;
++
++	if (sna_picture_is_solid(picture, &color))
++		return gen4_channel_init_solid(sna, channel, color);
++
++	if (picture->pDrawable == NULL) {
++		int ret;
++
++		if (picture->pSourcePict->type == SourcePictTypeLinear)
++			return gen4_channel_init_linear(sna, picture, channel,
++							x, y,
++							w, h,
++							dst_x, dst_y);
++
++		DBG(("%s -- fixup, gradient\n", __FUNCTION__));
++		ret = -1;
++		if (!precise)
++			ret = sna_render_picture_approximate_gradient(sna, picture, channel,
++								      x, y, w, h, dst_x, dst_y);
++		if (ret == -1)
++			ret = sna_render_picture_fixup(sna, picture, channel,
++						       x, y, w, h, dst_x, dst_y);
++		return ret;
++	}
++
++	if (picture->alphaMap) {
++		DBG(("%s -- fallback, alphamap\n", __FUNCTION__));
++		return sna_render_picture_fixup(sna, picture, channel,
++						x, y, w, h, dst_x, dst_y);
++	}
++
++	if (!gen9_check_repeat(picture))
++		return sna_render_picture_fixup(sna, picture, channel,
++						x, y, w, h, dst_x, dst_y);
++
++	if (!gen9_check_filter(picture))
++		return sna_render_picture_fixup(sna, picture, channel,
++						x, y, w, h, dst_x, dst_y);
++
++	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
++	channel->filter = picture->filter;
++
++	pixmap = get_drawable_pixmap(picture->pDrawable);
++	get_drawable_deltas(picture->pDrawable, pixmap, &dx, &dy);
++
++	x += dx + picture->pDrawable->x;
++	y += dy + picture->pDrawable->y;
++
++	channel->is_affine = sna_transform_is_affine(picture->transform);
++	if (sna_transform_is_imprecise_integer_translation(picture->transform, picture->filter, precise, &dx, &dy)) {
++		DBG(("%s: integer translation (%d, %d), removing\n",
++		     __FUNCTION__, dx, dy));
++		x += dx;
++		y += dy;
++		channel->transform = NULL;
++		channel->filter = PictFilterNearest;
++
++		if (channel->repeat ||
++		    (x >= 0 &&
++		     y >= 0 &&
++		     x + w <= pixmap->drawable.width &&
++		     y + h <= pixmap->drawable.height)) {
++			struct sna_pixmap *priv = sna_pixmap(pixmap);
++			if (priv && priv->clear) {
++				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
++				return gen4_channel_init_solid(sna, channel, solid_color(picture->format, priv->clear_color));
++			}
++		}
++	} else
++		channel->transform = picture->transform;
++
++	channel->pict_format = picture->format;
++	channel->card_format = gen9_get_card_format(picture->format);
++	if (channel->card_format == (unsigned)-1)
++		return sna_render_picture_convert(sna, picture, channel, pixmap,
++						  x, y, w, h, dst_x, dst_y,
++						  false);
++
++	if (too_large(pixmap->drawable.width, pixmap->drawable.height)) {
++		DBG(("%s: extracting from pixmap %dx%d\n", __FUNCTION__,
++		     pixmap->drawable.width, pixmap->drawable.height));
++		return sna_render_picture_extract(sna, picture, channel,
++						  x, y, w, h, dst_x, dst_y);
++	}
++
++	return sna_render_pixmap_bo(sna, channel, pixmap,
++				    x, y, w, h, dst_x, dst_y);
++}
++
++inline static bool gen9_composite_channel_convert(struct sna_composite_channel *channel)
++{
++	if (unaligned(channel->bo, PICT_FORMAT_BPP(channel->pict_format)))
++		return false;
++
++	channel->repeat = gen9_repeat(channel->repeat);
++	channel->filter = gen9_filter(channel->filter);
++	if (channel->card_format == (unsigned)-1)
++		channel->card_format = gen9_get_card_format(channel->pict_format);
++	assert(channel->card_format != (unsigned)-1);
++
++	return true;
++}
++
++static void gen9_render_composite_done(struct sna *sna,
++				       const struct sna_composite_op *op)
++{
++	if (sna->render.vertex_offset) {
++		gen8_vertex_flush(sna);
++		gen9_magic_ca_pass(sna, op);
++	}
++
++	if (op->mask.bo)
++		kgem_bo_destroy(&sna->kgem, op->mask.bo);
++	if (op->src.bo)
++		kgem_bo_destroy(&sna->kgem, op->src.bo);
++
++	sna_render_composite_redirect_done(sna, op);
++}
++
++inline static bool
++gen9_composite_set_target(struct sna *sna,
++			  struct sna_composite_op *op,
++			  PicturePtr dst,
++			  int x, int y, int w, int h,
++			  bool partial)
++{
++	BoxRec box;
++	unsigned int hint;
++
++	DBG(("%s: (%d, %d)x(%d, %d), partial?=%d\n", __FUNCTION__, x, y, w, h, partial));
++
++	op->dst.pixmap = get_drawable_pixmap(dst->pDrawable);
++	op->dst.format = dst->format;
++	op->dst.width  = op->dst.pixmap->drawable.width;
++	op->dst.height = op->dst.pixmap->drawable.height;
++
++	if (w | h) {
++		assert(w && h);
++		box.x1 = x;
++		box.y1 = y;
++		box.x2 = x + w;
++		box.y2 = y + h;
++	} else
++		sna_render_picture_extents(dst, &box);
++
++	hint = PREFER_GPU | RENDER_GPU;
++	if (!need_tiling(sna, op->dst.width, op->dst.height))
++		hint |= FORCE_GPU;
++	if (!partial) {
++		hint |= IGNORE_DAMAGE;
++		if (w == op->dst.width && h == op->dst.height)
++			hint |= REPLACES;
++	}
++
++	op->dst.bo = sna_drawable_use_bo(dst->pDrawable, hint, &box, &op->damage);
++	if (op->dst.bo == NULL)
++		return false;
++
++	assert(!op->damage || !DAMAGE_IS_ALL(*op->damage));
++
++	if (unaligned(op->dst.bo, dst->pDrawable->bitsPerPixel))
++		return false;
++
++	if (hint & REPLACES) {
++		struct sna_pixmap *priv = sna_pixmap(op->dst.pixmap);
++		kgem_bo_pair_undo(&sna->kgem, priv->gpu_bo, priv->cpu_bo);
++	}
++
++	get_drawable_deltas(dst->pDrawable, op->dst.pixmap,
++			    &op->dst.x, &op->dst.y);
++
++	DBG(("%s: pixmap=%ld, format=%08x, size=%dx%d, pitch=%d, delta=(%d,%d),damage=%p\n",
++	     __FUNCTION__,
++	     op->dst.pixmap->drawable.serialNumber, (int)op->dst.format,
++	     op->dst.width, op->dst.height,
++	     op->dst.bo->pitch,
++	     op->dst.x, op->dst.y,
++	     op->damage ? *op->damage : (void *)-1));
++
++	assert(op->dst.bo->proxy == NULL);
++
++	if (too_large(op->dst.width, op->dst.height) &&
++	    !sna_render_composite_redirect(sna, op, x, y, w, h, partial))
++		return false;
++
++	return true;
++}
++
++static bool
++try_blt(struct sna *sna,
++	uint8_t op,
++	PicturePtr src,
++	PicturePtr mask,
++	PicturePtr dst,
++	int16_t src_x, int16_t src_y,
++	int16_t msk_x, int16_t msk_y,
++	int16_t dst_x, int16_t dst_y,
++	int16_t width, int16_t height,
++	unsigned flags,
++	struct sna_composite_op *tmp)
++{
++	struct kgem_bo *bo;
++
++	if (sna->kgem.mode == KGEM_BLT) {
++		DBG(("%s: already performing BLT\n", __FUNCTION__));
++		goto execute;
++	}
++
++	if (too_large(width, height)) {
++		DBG(("%s: operation too large for 3D pipe (%d, %d)\n",
++		     __FUNCTION__, width, height));
++		goto execute;
++	}
++
++	bo = __sna_drawable_peek_bo(dst->pDrawable);
++	if (bo == NULL)
++		goto execute;
++
++	if (untiled_tlb_miss(bo))
++		goto execute;
++
++	if (bo->rq) {
++		if (RQ_IS_BLT(bo->rq))
++			goto execute;
++
++		return false;
++	}
++
++	if (bo->tiling == I915_TILING_Y)
++		goto upload;
++
++	if (sna_picture_is_solid(src, NULL) && can_switch_to_blt(sna, bo, 0))
++		goto execute;
++
++	if (src->pDrawable == dst->pDrawable &&
++	    (sna->render_state.gt < 3 || width*height < 1024) &&
++	    can_switch_to_blt(sna, bo, 0))
++		goto execute;
++
++	if (src->pDrawable) {
++		struct kgem_bo *s = __sna_drawable_peek_bo(src->pDrawable);
++		if (s == NULL)
++			goto upload;
++
++		if (prefer_blt_bo(sna, s, bo))
++			goto execute;
++	}
++
++	if (sna->kgem.ring == KGEM_BLT) {
++		DBG(("%s: already performing BLT\n", __FUNCTION__));
++		goto execute;
++	}
++
++upload:
++	flags |= COMPOSITE_UPLOAD;
++execute:
++	return sna_blt_composite(sna, op,
++				 src, dst,
++				 src_x, src_y,
++				 dst_x, dst_y,
++				 width, height,
++				 flags, tmp);
++}
++
++static bool
++check_gradient(PicturePtr picture, bool precise)
++{
++	if (picture->pDrawable)
++		return false;
++
++	switch (picture->pSourcePict->type) {
++	case SourcePictTypeSolidFill:
++	case SourcePictTypeLinear:
++		return false;
++	default:
++		return precise;
++	}
++}
++
++static bool
++has_alphamap(PicturePtr p)
++{
++	return p->alphaMap != NULL;
++}
++
++static bool
++need_upload(PicturePtr p)
++{
++	return p->pDrawable && unattached(p->pDrawable) && untransformed(p);
++}
++
++static bool
++source_is_busy(PixmapPtr pixmap)
++{
++	struct sna_pixmap *priv = sna_pixmap(pixmap);
++	if (priv == NULL || priv->clear)
++		return false;
++
++	if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))
++		return true;
++
++	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
++		return true;
++
++	return priv->gpu_damage && !priv->cpu_damage;
++}
++
++static bool
++source_fallback(PicturePtr p, PixmapPtr pixmap, bool precise)
++{
++	if (sna_picture_is_solid(p, NULL))
++		return false;
++
++	if (p->pSourcePict)
++		return check_gradient(p, precise);
++
++	if (!gen9_check_repeat(p) || !gen9_check_format(p->format))
++		return true;
++
++	if (pixmap && source_is_busy(pixmap))
++		return false;
++
++	return has_alphamap(p) || !gen9_check_filter(p) || need_upload(p);
++}
++
++static bool
++gen9_composite_fallback(struct sna *sna,
++			PicturePtr src,
++			PicturePtr mask,
++			PicturePtr dst)
++{
++	PixmapPtr src_pixmap;
++	PixmapPtr mask_pixmap;
++	PixmapPtr dst_pixmap;
++	bool src_fallback, mask_fallback;
++
++	if (!gen9_check_dst_format(dst->format)) {
++		DBG(("%s: unknown destination format: %d\n",
++		     __FUNCTION__, dst->format));
++		return true;
++	}
++
++	dst_pixmap = get_drawable_pixmap(dst->pDrawable);
++
++	src_pixmap = src->pDrawable ? get_drawable_pixmap(src->pDrawable) : NULL;
++	src_fallback = source_fallback(src, src_pixmap,
++				       dst->polyMode == PolyModePrecise);
++
++	if (mask) {
++		mask_pixmap = mask->pDrawable ? get_drawable_pixmap(mask->pDrawable) : NULL;
++		mask_fallback = source_fallback(mask, mask_pixmap,
++						dst->polyMode == PolyModePrecise);
++	} else {
++		mask_pixmap = NULL;
++		mask_fallback = false;
++	}
++
++	/* If we are using the destination as a source and need to
++	 * read it back in order to upload the source, do it all
++	 * on the CPU.
++	 */
++	if (src_pixmap == dst_pixmap && src_fallback) {
++		DBG(("%s: src is dst and will fallback\n",__FUNCTION__));
++		return true;
++	}
++	if (mask_pixmap == dst_pixmap && mask_fallback) {
++		DBG(("%s: mask is dst and will fallback\n",__FUNCTION__));
++		return true;
++	}
++
++	/* If anything is on the GPU, push everything out to the GPU */
++	if (dst_use_gpu(dst_pixmap)) {
++		DBG(("%s: dst is already on the GPU, try to use GPU\n",
++		     __FUNCTION__));
++		return false;
++	}
++
++	if (src_pixmap && !src_fallback) {
++		DBG(("%s: src is already on the GPU, try to use GPU\n",
++		     __FUNCTION__));
++		return false;
++	}
++	if (mask_pixmap && !mask_fallback) {
++		DBG(("%s: mask is already on the GPU, try to use GPU\n",
++		     __FUNCTION__));
++		return false;
++	}
++
++	/* However, if the dst is not on the GPU and we need to
++	 * render one of the sources using the CPU, we may
++	 * as well do the entire operation in place on the CPU.
++	 */
++	if (src_fallback) {
++		DBG(("%s: dst is on the CPU and src will fallback\n",
++		     __FUNCTION__));
++		return true;
++	}
++
++	if (mask && mask_fallback) {
++		DBG(("%s: dst is on the CPU and mask will fallback\n",
++		     __FUNCTION__));
++		return true;
++	}
++
++	if (too_large(dst_pixmap->drawable.width,
++		      dst_pixmap->drawable.height) &&
++	    dst_is_cpu(dst_pixmap)) {
++		DBG(("%s: dst is on the CPU and too large\n", __FUNCTION__));
++		return true;
++	}
++
++	DBG(("%s: dst is not on the GPU and the operation should not fallback\n",
++	     __FUNCTION__));
++	return dst_use_cpu(dst_pixmap);
++}
++
++static int
++reuse_source(struct sna *sna,
++	     PicturePtr src, struct sna_composite_channel *sc, int src_x, int src_y,
++	     PicturePtr mask, struct sna_composite_channel *mc, int msk_x, int msk_y)
++{
++	uint32_t color;
++
++	if (src_x != msk_x || src_y != msk_y)
++		return false;
++
++	if (src == mask) {
++		DBG(("%s: mask is source\n", __FUNCTION__));
++		*mc = *sc;
++		mc->bo = kgem_bo_reference(mc->bo);
++		return true;
++	}
++
++	if (sna_picture_is_solid(mask, &color))
++		return gen4_channel_init_solid(sna, mc, color);
++
++	if (sc->is_solid)
++		return false;
++
++	if (src->pDrawable == NULL || mask->pDrawable != src->pDrawable)
++		return false;
++
++	DBG(("%s: mask reuses source drawable\n", __FUNCTION__));
++
++	if (!sna_transform_equal(src->transform, mask->transform))
++		return false;
++
++	if (!sna_picture_alphamap_equal(src, mask))
++		return false;
++
++	if (!gen9_check_repeat(mask))
++		return false;
++
++	if (!gen9_check_filter(mask))
++		return false;
++
++	if (!gen9_check_format(mask->format))
++		return false;
++
++	DBG(("%s: reusing source channel for mask with a twist\n",
++	     __FUNCTION__));
++
++	*mc = *sc;
++	mc->repeat = gen9_repeat(mask->repeat ? mask->repeatType : RepeatNone);
++	mc->filter = gen9_filter(mask->filter);
++	mc->pict_format = mask->format;
++	mc->card_format = gen9_get_card_format(mask->format);
++	mc->bo = kgem_bo_reference(mc->bo);
++	return true;
++}
++
++static bool
++gen9_render_composite(struct sna *sna,
++		      uint8_t op,
++		      PicturePtr src,
++		      PicturePtr mask,
++		      PicturePtr dst,
++		      int16_t src_x, int16_t src_y,
++		      int16_t msk_x, int16_t msk_y,
++		      int16_t dst_x, int16_t dst_y,
++		      int16_t width, int16_t height,
++		      unsigned flags,
++		      struct sna_composite_op *tmp)
++{
++	if (op >= ARRAY_SIZE(gen9_blend_op))
++		return false;
++
++	DBG(("%s: %dx%d, current mode=%d/%d\n", __FUNCTION__,
++	     width, height, sna->kgem.mode, sna->kgem.ring));
++
++	if (mask == NULL &&
++	    try_blt(sna, op,
++		    src, mask, dst,
++		    src_x, src_y,
++		    msk_x, msk_y,
++		    dst_x, dst_y,
++		    width, height,
++		    flags, tmp))
++		return true;
++
++	if (gen9_composite_fallback(sna, src, mask, dst))
++		goto fallback;
++
++	if (need_tiling(sna, width, height))
++		return sna_tiling_composite(op, src, mask, dst,
++					    src_x, src_y,
++					    msk_x, msk_y,
++					    dst_x, dst_y,
++					    width, height,
++					    tmp);
++
++	if (op == PictOpClear && src == sna->clear)
++		op = PictOpSrc;
++	tmp->op = op;
++	if (!gen9_composite_set_target(sna, tmp, dst,
++				       dst_x, dst_y, width, height,
++				       flags & COMPOSITE_PARTIAL || op > PictOpSrc))
++		goto fallback;
++
++	switch (gen9_composite_picture(sna, src, &tmp->src,
++				       src_x, src_y,
++				       width, height,
++				       dst_x, dst_y,
++				       dst->polyMode == PolyModePrecise)) {
++	case -1:
++		goto cleanup_dst;
++	case 0:
++		if (!gen4_channel_init_solid(sna, &tmp->src, 0))
++			goto cleanup_dst;
++		/* fall through to fixup */
++	case 1:
++		/* Did we just switch rings to prepare the source? */
++		if (mask == NULL &&
++		    (prefer_blt_composite(sna, tmp) ||
++		     unaligned(tmp->src.bo, PICT_FORMAT_BPP(tmp->src.pict_format))) &&
++		    sna_blt_composite__convert(sna,
++					       dst_x, dst_y, width, height,
++					       tmp))
++			return true;
++
++		if (!gen9_composite_channel_convert(&tmp->src))
++			goto cleanup_src;
++
++		break;
++	}
++
++	tmp->is_affine = tmp->src.is_affine;
++	tmp->has_component_alpha = false;
++	tmp->need_magic_ca_pass = false;
++
++	tmp->mask.bo = NULL;
++	tmp->mask.filter = SAMPLER_FILTER_NEAREST;
++	tmp->mask.repeat = SAMPLER_EXTEND_NONE;
++
++	if (mask) {
++		if (mask->componentAlpha && PICT_FORMAT_RGB(mask->format)) {
++			tmp->has_component_alpha = true;
++
++			/* Check if it's component alpha that relies on a source alpha and on
++			 * the source value.  We can only get one of those into the single
++			 * source value that we get to blend with.
++			 */
++			if (gen9_blend_op[op].src_alpha &&
++			    (gen9_blend_op[op].src_blend != BLENDFACTOR_ZERO)) {
++				if (op != PictOpOver)
++					goto cleanup_src;
++
++				tmp->need_magic_ca_pass = true;
++				tmp->op = PictOpOutReverse;
++			}
++		}
++
++		if (!reuse_source(sna,
++				  src, &tmp->src, src_x, src_y,
++				  mask, &tmp->mask, msk_x, msk_y)) {
++			switch (gen9_composite_picture(sna, mask, &tmp->mask,
++						       msk_x, msk_y,
++						       width, height,
++						       dst_x, dst_y,
++						       dst->polyMode == PolyModePrecise)) {
++			case -1:
++				goto cleanup_src;
++			case 0:
++				if (!gen4_channel_init_solid(sna, &tmp->mask, 0))
++					goto cleanup_src;
++				/* fall through to fixup */
++			case 1:
++				if (!gen9_composite_channel_convert(&tmp->mask))
++					goto cleanup_mask;
++				break;
++			}
++		}
++
++		tmp->is_affine &= tmp->mask.is_affine;
++	}
++
++	tmp->u.gen9.flags =
++		GEN9_SET_FLAGS(SAMPLER_OFFSET(tmp->src.filter,
++					      tmp->src.repeat,
++					      tmp->mask.filter,
++					      tmp->mask.repeat),
++			       gen9_get_blend(tmp->op,
++					      tmp->has_component_alpha,
++					      tmp->dst.format),
++			       gen9_choose_composite_kernel(tmp->op,
++							    tmp->mask.bo != NULL,
++							    tmp->has_component_alpha,
++							    tmp->is_affine),
++			       gen4_choose_composite_emitter(sna, tmp));
++
++	tmp->blt   = gen9_render_composite_blt;
++	tmp->box   = gen9_render_composite_box;
++	tmp->boxes = gen9_render_composite_boxes__blt;
++	if (tmp->emit_boxes){
++		tmp->boxes = gen9_render_composite_boxes;
++		tmp->thread_boxes = gen9_render_composite_boxes__thread;
++	}
++	tmp->done  = gen9_render_composite_done;
++
++	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->dst.bo);
++	if (!kgem_check_bo(&sna->kgem,
++			   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
++			   NULL)) {
++		kgem_submit(&sna->kgem);
++		if (!kgem_check_bo(&sna->kgem,
++				   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
++				   NULL))
++			goto cleanup_mask;
++		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
++	}
++
++	gen9_align_vertex(sna, tmp);
++	gen9_emit_composite_state(sna, tmp);
++	return true;
++
++cleanup_mask:
++	if (tmp->mask.bo) {
++		kgem_bo_destroy(&sna->kgem, tmp->mask.bo);
++		tmp->mask.bo = NULL;
++	}
++cleanup_src:
++	if (tmp->src.bo) {
++		kgem_bo_destroy(&sna->kgem, tmp->src.bo);
++		tmp->src.bo = NULL;
++	}
++cleanup_dst:
++	if (tmp->redirect.real_bo) {
++		kgem_bo_destroy(&sna->kgem, tmp->dst.bo);
++		tmp->redirect.real_bo = NULL;
++	}
++fallback:
++	return (mask == NULL &&
++		sna_blt_composite(sna, op,
++				  src, dst,
++				  src_x, src_y,
++				  dst_x, dst_y,
++				  width, height,
++				  flags | COMPOSITE_FALLBACK, tmp));
++}
++
++#if !NO_COMPOSITE_SPANS
++fastcall static void
++gen9_render_composite_spans_box(struct sna *sna,
++				const struct sna_composite_spans_op *op,
++				const BoxRec *box, float opacity)
++{
++	DBG(("%s: src=+(%d, %d), opacity=%f, dst=+(%d, %d), box=(%d, %d) x (%d, %d)\n",
++	     __FUNCTION__,
++	     op->base.src.offset[0], op->base.src.offset[1],
++	     opacity,
++	     op->base.dst.x, op->base.dst.y,
++	     box->x1, box->y1,
++	     box->x2 - box->x1,
++	     box->y2 - box->y1));
++
++	gen9_get_rectangles(sna, &op->base, 1, gen9_emit_composite_state);
++	op->prim_emit(sna, op, box, opacity);
++}
++
++static void
++gen9_render_composite_spans_boxes(struct sna *sna,
++				  const struct sna_composite_spans_op *op,
++				  const BoxRec *box, int nbox,
++				  float opacity)
++{
++	DBG(("%s: nbox=%d, src=+(%d, %d), opacity=%f, dst=+(%d, %d)\n",
++	     __FUNCTION__, nbox,
++	     op->base.src.offset[0], op->base.src.offset[1],
++	     opacity,
++	     op->base.dst.x, op->base.dst.y));
++
++	do {
++		int nbox_this_time;
++
++		nbox_this_time = gen9_get_rectangles(sna, &op->base, nbox,
++						     gen9_emit_composite_state);
++		nbox -= nbox_this_time;
++
++		do {
++			DBG(("  %s: (%d, %d) x (%d, %d)\n", __FUNCTION__,
++			     box->x1, box->y1,
++			     box->x2 - box->x1,
++			     box->y2 - box->y1));
++
++			op->prim_emit(sna, op, box++, opacity);
++		} while (--nbox_this_time);
++	} while (nbox);
++}
++
++fastcall static void
++gen9_render_composite_spans_boxes__thread(struct sna *sna,
++					  const struct sna_composite_spans_op *op,
++					  const struct sna_opacity_box *box,
++					  int nbox)
++{
++	DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
++	     __FUNCTION__, nbox,
++	     op->base.src.offset[0], op->base.src.offset[1],
++	     op->base.dst.x, op->base.dst.y));
++
++	sna_vertex_lock(&sna->render);
++	do {
++		int nbox_this_time;
++		float *v;
++
++		nbox_this_time = gen9_get_rectangles(sna, &op->base, nbox,
++						     gen9_emit_composite_state);
++		assert(nbox_this_time);
++		nbox -= nbox_this_time;
++
++		v = sna->render.vertices + sna->render.vertex_used;
++		sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;
++
++		sna_vertex_acquire__locked(&sna->render);
++		sna_vertex_unlock(&sna->render);
++
++		op->emit_boxes(op, box, nbox_this_time, v);
++		box += nbox_this_time;
++
++		sna_vertex_lock(&sna->render);
++		sna_vertex_release__locked(&sna->render);
++	} while (nbox);
++	sna_vertex_unlock(&sna->render);
++}
++
++fastcall static void
++gen9_render_composite_spans_done(struct sna *sna,
++				 const struct sna_composite_spans_op *op)
++{
++	if (sna->render.vertex_offset)
++		gen8_vertex_flush(sna);
++
++	DBG(("%s()\n", __FUNCTION__));
++
++	if (op->base.src.bo)
++		kgem_bo_destroy(&sna->kgem, op->base.src.bo);
++
++	sna_render_composite_redirect_done(sna, &op->base);
++}
++
++static bool
++gen9_check_composite_spans(struct sna *sna,
++			   uint8_t op, PicturePtr src, PicturePtr dst,
++			   int16_t width, int16_t height, unsigned flags)
++{
++	if (op >= ARRAY_SIZE(gen9_blend_op))
++		return false;
++
++	if (gen9_composite_fallback(sna, src, NULL, dst))
++		return false;
++
++	if (need_tiling(sna, width, height) &&
++	    !is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) {
++		DBG(("%s: fallback, tiled operation not on GPU\n",
++		     __FUNCTION__));
++		return false;
++	}
++
++	return true;
++}
++/* Prepare tmp for GPU span compositing of src onto dst; returns false when setup fails. */
++static bool
++gen9_render_composite_spans(struct sna *sna,
++			    uint8_t op,
++			    PicturePtr src,
++			    PicturePtr dst,
++			    int16_t src_x,  int16_t src_y,
++			    int16_t dst_x,  int16_t dst_y,
++			    int16_t width,  int16_t height,
++			    unsigned flags,
++			    struct sna_composite_spans_op *tmp)
++{
++	DBG(("%s: %dx%d with flags=%x, current mode=%d\n", __FUNCTION__,
++	     width, height, flags, sna->kgem.ring));
++
++	assert(gen9_check_composite_spans(sna, op, src, dst, width, height, flags));
++
++	if (need_tiling(sna, width, height)) { /* delegate the whole operation to the tiling helper */
++		DBG(("%s: tiling, operation (%dx%d) too wide for pipeline\n",
++		     __FUNCTION__, width, height));
++		return sna_tiling_composite_spans(op, src, dst,
++						  src_x, src_y, dst_x, dst_y,
++						  width, height, flags, tmp);
++	}
++
++	tmp->base.op = op;
++	if (!gen9_composite_set_target(sna, &tmp->base, dst,
++				       dst_x, dst_y, width, height, true))
++		return false;
++
++	switch (gen9_composite_picture(sna, src, &tmp->base.src,
++				       src_x, src_y,
++				       width, height,
++				       dst_x, dst_y,
++				       dst->polyMode == PolyModePrecise)) {
++	case -1: /* treated as failure; only the target is cleaned up */
++		goto cleanup_dst;
++	case 0: /* substitute a solid source initialised with 0 */
++		if (!gen4_channel_init_solid(sna, &tmp->base.src, 0))
++			goto cleanup_dst;
++		/* fall through to fixup */
++	case 1:
++		if (!gen9_composite_channel_convert(&tmp->base.src))
++			goto cleanup_src;
++		break;
++	}
++	tmp->base.mask.bo = NULL; /* no mask bo is used for spans */
++
++	tmp->base.is_affine = tmp->base.src.is_affine;
++	tmp->base.need_magic_ca_pass = false;
++
++	tmp->base.u.gen9.flags =
++		GEN9_SET_FLAGS(SAMPLER_OFFSET(tmp->base.src.filter,
++					      tmp->base.src.repeat,
++					      SAMPLER_FILTER_NEAREST,
++					      SAMPLER_EXTEND_PAD),
++			       gen9_get_blend(tmp->base.op, false, tmp->base.dst.format),
++			       GEN9_WM_KERNEL_OPACITY | !tmp->base.is_affine,
++			       gen4_choose_spans_emitter(sna, tmp));
++
++	tmp->box   = gen9_render_composite_spans_box;
++	tmp->boxes = gen9_render_composite_spans_boxes;
++	if (tmp->emit_boxes) /* the threaded variant calls emit_boxes, so install it only when set */
++		tmp->thread_boxes = gen9_render_composite_spans_boxes__thread;
++	tmp->done  = gen9_render_composite_spans_done;
++
++	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->base.dst.bo);
++	if (!kgem_check_bo(&sna->kgem,
++			   tmp->base.dst.bo, tmp->base.src.bo,
++			   NULL)) {
++		kgem_submit(&sna->kgem); /* submit the current batch, then re-check once */
++		if (!kgem_check_bo(&sna->kgem,
++				   tmp->base.dst.bo, tmp->base.src.bo,
++				   NULL))
++			goto cleanup_src;
++		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
++	}
++
++	gen9_align_vertex(sna, &tmp->base);
++	gen9_emit_composite_state(sna, &tmp->base);
++	return true;
++
++cleanup_src: /* drop the source bo, then fall into the target cleanup */
++	if (tmp->base.src.bo)
++		kgem_bo_destroy(&sna->kgem, tmp->base.src.bo);
++cleanup_dst: /* dst.bo is destroyed only when a redirect is active */
++	if (tmp->base.redirect.real_bo)
++		kgem_bo_destroy(&sna->kgem, tmp->base.dst.bo);
++	return false;
++}
++#endif
++/* Emit render state for a copy: binding table slot 0 is the destination, slot 1 the source. */
++static void
++gen9_emit_copy_state(struct sna *sna,
++		     const struct sna_composite_op *op)
++{
++	uint32_t *binding_table;
++	uint16_t offset, dirty;
++
++	gen9_get_batch(sna, op);
++
++	binding_table = gen9_composite_get_binding_table(sna, &offset);
++
++	dirty = kgem_bo_is_dirty(op->dst.bo); /* sampled before the dst is bound below */
++
++	binding_table[0] =
++		gen9_bind_bo(sna,
++			     op->dst.bo, op->dst.width, op->dst.height,
++			     gen9_get_dest_format(op->dst.format),
++			     true);
++	binding_table[1] =
++		gen9_bind_bo(sna,
++			     op->src.bo, op->src.width, op->src.height,
++			     op->src.card_format,
++			     false);
++
++	if (sna->kgem.surface == offset && /* both 32-bit entries equal the table at gen9.surface_table */
++	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen9.surface_table) == *(uint64_t*)binding_table) {
++		sna->kgem.surface += SURFACE_DW; /* give the new table's space back */
++		offset = sna->render_state.gen9.surface_table; /* and reuse the previous table */
++	}
++
++	if (sna->kgem.batch[sna->render_state.gen9.surface_table] == binding_table[0])
++		dirty = 0; /* same destination entry as the recorded table */
++
++	assert(!GEN9_READS_DST(op->u.gen9.flags));
++	gen9_emit_state(sna, op, offset | dirty); /* dirty is OR-ed into the offset argument */
++}
++/* Heuristic: return true when the copy should first be attempted with the BLT instead of render. */
++static inline bool
++prefer_blt_copy(struct sna *sna,
++		struct kgem_bo *src_bo,
++		struct kgem_bo *dst_bo,
++		unsigned flags)
++{
++	if (sna->kgem.mode == KGEM_BLT) /* batch is already in BLT mode */
++		return true;
++
++	assert((flags & COPY_SYNC) == 0);
++
++	if (untiled_tlb_miss(src_bo) ||
++	    untiled_tlb_miss(dst_bo))
++		return true;
++
++	if (flags & COPY_DRI && !sna->kgem.has_semaphores) /* DRI copy without semaphores: keep render */
++		return false;
++
++	if (force_blt_ring(sna, dst_bo))
++		return true;
++
++	if ((flags & COPY_SMALL || /* small copies, or self-copies when gt < 3 */
++	     (sna->render_state.gt < 3 && src_bo == dst_bo)) &&
++	    can_switch_to_blt(sna, dst_bo, flags))
++		return true;
++
++	if (kgem_bo_is_render(dst_bo) || /* either bo reports as render: keep render */
++	    kgem_bo_is_render(src_bo))
++		return false;
++
++	if (flags & COPY_LAST &&
++	    sna->render_state.gt < 3 &&
++            can_switch_to_blt(sna, dst_bo, flags))
++		return true;
++
++	if (prefer_render_ring(sna, dst_bo))
++		return false;
++
++	if (!prefer_blt_ring(sna, dst_bo, flags))
++		return false;
++
++	return prefer_blt_bo(sna, src_bo, dst_bo); /* final decision is left to the per-bo helper */
++}
++/* Copy n boxes from src_bo to dst_bo (offset by the src/dst deltas); tries BLT, overlap and tiled fallbacks. */
++static bool
++gen9_render_copy_boxes(struct sna *sna, uint8_t alu,
++		       const DrawableRec *src, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
++		       const DrawableRec *dst, struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
++		       const BoxRec *box, int n, unsigned flags)
++{
++	struct sna_composite_op tmp;
++	BoxRec extents;
++
++	DBG(("%s (%d, %d)->(%d, %d) x %d, alu=%x, flags=%x, self-copy=%d, overlaps? %d\n",
++	     __FUNCTION__, src_dx, src_dy, dst_dx, dst_dy, n, alu, flags,
++	     src_bo == dst_bo,
++	     overlaps(sna,
++		      src_bo, src_dx, src_dy,
++		      dst_bo, dst_dx, dst_dy,
++		      box, n, flags, &extents)));
++
++	if (prefer_blt_copy(sna, src_bo, dst_bo, flags) &&
++	    sna_blt_compare_depth(src, dst) &&
++	    sna_blt_copy_boxes(sna, alu,
++			       src_bo, src_dx, src_dy,
++			       dst_bo, dst_dx, dst_dy,
++			       dst->bitsPerPixel,
++			       box, n))
++		return true;
++	/* The render path below is only used for GXcopy/GXclear on bos that pass unaligned(). */
++	if (!(alu == GXcopy || alu == GXclear) ||
++	    unaligned(src_bo, src->bitsPerPixel) ||
++	    unaligned(dst_bo, dst->bitsPerPixel)) {
++fallback_blt:
++		DBG(("%s: fallback blt\n", __FUNCTION__));
++		if (!sna_blt_compare_depth(src, dst))
++			return false;
++
++		return sna_blt_copy_boxes_fallback(sna, alu,
++						   src, src_bo, src_dx, src_dy,
++						   dst, dst_bo, dst_dx, dst_dy,
++						   box, n);
++	}
++
++	if (overlaps(sna,
++		     src_bo, src_dx, src_dy,
++		     dst_bo, dst_dx, dst_dy,
++		     box, n, flags,
++		     &extents)) {
++		bool big = too_large(extents.x2-extents.x1, extents.y2-extents.y1);
++
++		if ((big || !prefer_render_ring(sna, dst_bo)) &&
++		    sna_blt_copy_boxes(sna, alu,
++				       src_bo, src_dx, src_dy,
++				       dst_bo, dst_dx, dst_dy,
++				       dst->bitsPerPixel,
++				       box, n))
++			return true;
++
++		if (big) /* extents too large for the overlap helper: use the BLT fallback */
++			goto fallback_blt;
++
++		assert(src_bo == dst_bo);
++		assert(src->depth == dst->depth);
++		assert(src->width == dst->width);
++		assert(src->height == dst->height);
++		return sna_render_copy_boxes__overlap(sna, alu, dst, dst_bo,
++						      src_dx, src_dy,
++						      dst_dx, dst_dy,
++						      box, n, &extents);
++	}
++	/* Matching depths share one render format; otherwise each side gets its own format. */
++	if (dst->depth == src->depth) {
++		tmp.dst.format = sna_render_format_for_depth(dst->depth);
++		tmp.src.pict_format = tmp.dst.format;
++	} else {
++		tmp.dst.format = sna_format_for_depth(dst->depth);
++		tmp.src.pict_format = sna_format_for_depth(src->depth);
++	}
++	if (!gen9_check_format(tmp.src.pict_format))
++		goto fallback_blt;
++
++	tmp.dst.pixmap = (PixmapPtr)dst;
++	tmp.dst.width  = dst->width;
++	tmp.dst.height = dst->height;
++	tmp.dst.bo = dst_bo;
++	tmp.dst.x = tmp.dst.y = 0;
++	tmp.damage = NULL;
++
++	sna_render_composite_redirect_init(&tmp);
++	if (too_large(tmp.dst.width, tmp.dst.height)) { /* redirect the target to the boxes' bounding extents */
++		int i;
++
++		extents = box[0];
++		for (i = 1; i < n; i++) {
++			if (box[i].x1 < extents.x1)
++				extents.x1 = box[i].x1;
++			if (box[i].y1 < extents.y1)
++				extents.y1 = box[i].y1;
++
++			if (box[i].x2 > extents.x2)
++				extents.x2 = box[i].x2;
++			if (box[i].y2 > extents.y2)
++				extents.y2 = box[i].y2;
++		}
++
++		if (!sna_render_composite_redirect(sna, &tmp,
++						   extents.x1 + dst_dx,
++						   extents.y1 + dst_dy,
++						   extents.x2 - extents.x1,
++						   extents.y2 - extents.y1,
++						   n > 1))
++			goto fallback_tiled;
++	}
++
++	tmp.src.card_format = gen9_get_card_format(tmp.src.pict_format);
++	if (too_large(src->width, src->height)) { /* sample from a partial view covering the boxes' extents */
++		int i;
++
++		extents = box[0];
++		for (i = 1; i < n; i++) {
++			if (box[i].x1 < extents.x1)
++				extents.x1 = box[i].x1;
++			if (box[i].y1 < extents.y1)
++				extents.y1 = box[i].y1;
++
++			if (box[i].x2 > extents.x2)
++				extents.x2 = box[i].x2;
++			if (box[i].y2 > extents.y2)
++				extents.y2 = box[i].y2;
++		}
++
++		if (!sna_render_pixmap_partial(sna, src, src_bo, &tmp.src,
++					       extents.x1 + src_dx,
++					       extents.y1 + src_dy,
++					       extents.x2 - extents.x1,
++					       extents.y2 - extents.y1))
++			goto fallback_tiled_dst;
++	} else {
++		tmp.src.bo = src_bo;
++		tmp.src.width  = src->width;
++		tmp.src.height = src->height;
++		tmp.src.offset[0] = tmp.src.offset[1] = 0;
++	}
++
++	tmp.mask.bo = NULL;
++
++	tmp.floats_per_vertex = 2;
++	tmp.floats_per_rect = 6;
++	tmp.need_magic_ca_pass = 0;
++
++	tmp.u.gen9.flags = COPY_FLAGS(alu);
++
++	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp.dst.bo);
++	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, tmp.src.bo, NULL)) {
++		kgem_submit(&sna->kgem); /* submit the current batch, then re-check once */
++		if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, tmp.src.bo, NULL)) {
++			if (tmp.src.bo != src_bo) /* src.bo differs only when the partial path replaced it */
++				kgem_bo_destroy(&sna->kgem, tmp.src.bo);
++			if (tmp.redirect.real_bo)
++				kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
++			goto fallback_blt;
++		}
++		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
++	}
++	/* Fold the source offsets and the target origin into the per-box deltas. */
++	src_dx += tmp.src.offset[0];
++	src_dy += tmp.src.offset[1];
++
++	dst_dx += tmp.dst.x;
++	dst_dy += tmp.dst.y;
++
++	tmp.dst.x = tmp.dst.y = 0;
++
++	gen9_align_vertex(sna, &tmp);
++	gen9_emit_copy_state(sna, &tmp);
++
++	do {
++		int16_t *v;
++		int n_this_time;
++
++		n_this_time = gen9_get_rectangles(sna, &tmp, n,
++						  gen9_emit_copy_state);
++		n -= n_this_time;
++
++		v = (int16_t *)(sna->render.vertices + sna->render.vertex_used);
++		sna->render.vertex_used += 6 * n_this_time; /* floats_per_rect (6) per box, written as 12 int16 values */
++		assert(sna->render.vertex_used <= sna->render.vertex_size);
++		do {
++			/* three vertices (x2,y2), (x1,y2), (x1,y1), each as dst x, dst y, src x, src y */
++			DBG(("	(%d, %d) -> (%d, %d) + (%d, %d)\n",
++			     box->x1 + src_dx, box->y1 + src_dy,
++			     box->x1 + dst_dx, box->y1 + dst_dy,
++			     box->x2 - box->x1, box->y2 - box->y1));
++			v[0] = box->x2 + dst_dx;
++			v[2] = box->x2 + src_dx;
++			v[1]  = v[5] = box->y2 + dst_dy;
++			v[3]  = v[7] = box->y2 + src_dy;
++			v[8]  = v[4] = box->x1 + dst_dx;
++			v[10] = v[6] = box->x1 + src_dx;
++			v[9]  = box->y1 + dst_dy;
++			v[11] = box->y1 + src_dy;
++			v += 12; box++;
++		} while (--n_this_time);
++	} while (n);
++
++	gen8_vertex_flush(sna);
++	sna_render_composite_redirect_done(sna, &tmp);
++	if (tmp.src.bo != src_bo)
++		kgem_bo_destroy(&sna->kgem, tmp.src.bo);
++	return true;
++
++fallback_tiled_dst: /* undo the target redirect before falling back */
++	if (tmp.redirect.real_bo)
++		kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
++fallback_tiled: /* try a plain BLT copy first, then the tiling helper */
++	DBG(("%s: fallback tiled\n", __FUNCTION__));
++	if (sna_blt_compare_depth(src, dst) &&
++	    sna_blt_copy_boxes(sna, alu,
++			       src_bo, src_dx, src_dy,
++			       dst_bo, dst_dx, dst_dy,
++			       dst->bitsPerPixel,
++			       box, n))
++		return true;
++
++	return sna_tiling_copy_boxes(sna, alu,
++				     src, src_bo, src_dx, src_dy,
++				     dst, dst_bo, dst_dx, dst_dy,
++				     box, n);
++}
++/* Emit one w x h copy rectangle from (sx, sy) to (dx, dy); installed as op->blt by gen9_render_copy(). */
++static void
++gen9_render_copy_blt(struct sna *sna,
++		     const struct sna_copy_op *op,
++		     int16_t sx, int16_t sy,
++		     int16_t w,  int16_t h,
++		     int16_t dx, int16_t dy)
++{
++	int16_t *v;
++
++	gen9_get_rectangles(sna, &op->base, 1, gen9_emit_copy_state);
++
++	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
++	sna->render.vertex_used += 6; /* one rectangle: 6 units, written as 12 int16 values */
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
++	/* three vertices (x+w,y+h), (x,y+h), (x,y), each as dst x, dst y, src x, src y */
++	v[0]  = dx+w; v[1]  = dy+h;
++	v[2]  = sx+w; v[3]  = sy+h;
++	v[4]  = dx;   v[5]  = dy+h;
++	v[6]  = sx;   v[7]  = sy+h;
++	v[8]  = dx;   v[9]  = dy;
++	v[10] = sx;   v[11] = sy;
++}
++/* Finish a copy op: flush vertices if render.vertex_offset is non-zero; op itself is not used. */
++static void
++gen9_render_copy_done(struct sna *sna, const struct sna_copy_op *op)
++{
++	if (sna->render.vertex_offset)
++		gen8_vertex_flush(sna);
++}
++/* Set up a reusable copy op from src to dst: use the BLT when preferred or required, else install render callbacks. */
++static bool
++gen9_render_copy(struct sna *sna, uint8_t alu,
++		 PixmapPtr src, struct kgem_bo *src_bo,
++		 PixmapPtr dst, struct kgem_bo *dst_bo,
++		 struct sna_copy_op *op)
++{
++	DBG(("%s (alu=%d, src=(%dx%d), dst=(%dx%d))\n",
++	     __FUNCTION__, alu,
++	     src->drawable.width, src->drawable.height,
++	     dst->drawable.width, dst->drawable.height));
++
++	if (prefer_blt_copy(sna, src_bo, dst_bo, 0) &&
++	    sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
++	    sna_blt_copy(sna, alu,
++			 src_bo, dst_bo,
++			 dst->drawable.bitsPerPixel,
++			 op))
++		return true;
++	/* Render is used only for GXcopy/GXclear between distinct bos that pass too_large()/unaligned(). */
++	if (!(alu == GXcopy || alu == GXclear) || src_bo == dst_bo ||
++	    too_large(src->drawable.width, src->drawable.height) ||
++	    too_large(dst->drawable.width, dst->drawable.height) ||
++	    unaligned(src_bo, src->drawable.bitsPerPixel) ||
++	    unaligned(dst_bo, dst->drawable.bitsPerPixel)) {
++fallback:
++		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
++			return false;
++
++		return sna_blt_copy(sna, alu, src_bo, dst_bo,
++				    dst->drawable.bitsPerPixel,
++				    op);
++	}
++	/* Matching depths share one render format; otherwise each side gets its own format. */
++	if (dst->drawable.depth == src->drawable.depth) {
++		op->base.dst.format = sna_render_format_for_depth(dst->drawable.depth);
++		op->base.src.pict_format = op->base.dst.format;
++	} else {
++		op->base.dst.format = sna_format_for_depth(dst->drawable.depth);
++		op->base.src.pict_format = sna_format_for_depth(src->drawable.depth);
++	}
++	if (!gen9_check_format(op->base.src.pict_format))
++		goto fallback;
++
++	op->base.dst.pixmap = dst;
++	op->base.dst.width  = dst->drawable.width;
++	op->base.dst.height = dst->drawable.height;
++	op->base.dst.bo = dst_bo;
++
++	op->base.src.bo = src_bo;
++	op->base.src.card_format =
++		gen9_get_card_format(op->base.src.pict_format);
++	op->base.src.width  = src->drawable.width;
++	op->base.src.height = src->drawable.height;
++
++	op->base.mask.bo = NULL;
++
++	op->base.floats_per_vertex = 2;
++	op->base.floats_per_rect = 6;
++
++	op->base.u.gen9.flags = COPY_FLAGS(alu);
++
++	kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo);
++	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
++		kgem_submit(&sna->kgem); /* submit the current batch, then re-check once */
++		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
++			goto fallback;
++		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
++	}
++
++	gen9_align_vertex(sna, &op->base);
++	gen9_emit_copy_state(sna, &op->base);
++
++	op->blt  = gen9_render_copy_blt; /* per-rectangle emitter */
++	op->done = gen9_render_copy_done; /* flushes pending vertices */
++	return true;
++}
++/* Emit render state for a solid fill: slot 0 is the destination, slot 1 the source bo bound as 1x1. */
++static void
++gen9_emit_fill_state(struct sna *sna, const struct sna_composite_op *op)
++{
++	uint32_t *binding_table;
++	uint16_t offset, dirty;
++
++	/* XXX Render Target Fast Clear
++	 * Set RTFC Enable in PS and render a rectangle.
++	 * Limited to clearing the full MSC surface only with a
++	 * specific kernel.
++	 */
++
++	gen9_get_batch(sna, op);
++
++	binding_table = gen9_composite_get_binding_table(sna, &offset);
++
++	dirty = kgem_bo_is_dirty(op->dst.bo); /* sampled before the dst is bound below */
++
++	binding_table[0] =
++		gen9_bind_bo(sna,
++			     op->dst.bo, op->dst.width, op->dst.height,
++			     gen9_get_dest_format(op->dst.format),
++			     true);
++	binding_table[1] =
++		gen9_bind_bo(sna,
++			     op->src.bo, 1, 1,
++			     SURFACEFORMAT_B8G8R8A8_UNORM,
++			     false);
++
++	if (sna->kgem.surface == offset && /* both 32-bit entries equal the table at gen9.surface_table */
++	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen9.surface_table) == *(uint64_t*)binding_table) {
++		sna->kgem.surface += SURFACE_DW; /* give the new table's space back */
++		offset = sna->render_state.gen9.surface_table; /* and reuse the previous table */
++	}
++
++	if (sna->kgem.batch[sna->render_state.gen9.surface_table] == binding_table[0])
++		dirty = 0; /* same destination entry as the recorded table */
++
++	gen9_emit_state(sna, op, offset | dirty); /* dirty is OR-ed into the offset argument */
++}
++/* Fill n boxes of dst_bo with color under Render operator op; the BLT is tried first when preferred or required. */
++static bool
++gen9_render_fill_boxes(struct sna *sna,
++		       CARD8 op,
++		       PictFormat format,
++		       const xRenderColor *color,
++		       const DrawableRec *dst, struct kgem_bo *dst_bo,
++		       const BoxRec *box, int n)
++{
++	struct sna_composite_op tmp;
++	uint32_t pixel;
++
++	DBG(("%s (op=%d, color=(%04x, %04x, %04x, %04x) [%08x])\n",
++	     __FUNCTION__, op,
++	     color->red, color->green, color->blue, color->alpha, (int)format));
++
++	if (op >= ARRAY_SIZE(gen9_blend_op)) {
++		DBG(("%s: fallback due to unhandled blend op: %d\n",
++		     __FUNCTION__, op));
++		return false;
++	}
++
++	if (prefer_blt_fill(sna, dst_bo, FILL_BOXES) ||
++	    !gen9_check_dst_format(format) ||
++	    unaligned(dst_bo, PICT_FORMAT_BPP(format))) {
++		uint8_t alu = GXinvalid;
++
++		if (op <= PictOpSrc) { /* only PictOpClear and the other op(s) up to PictOpSrc get a BLT alu */
++			pixel = 0;
++			if (op == PictOpClear)
++				alu = GXclear;
++			else if (sna_get_pixel_from_rgba(&pixel,
++							 color->red,
++							 color->green,
++							 color->blue,
++							 color->alpha,
++							 format))
++				alu = GXcopy;
++		}
++
++		if (alu != GXinvalid &&
++		    sna_blt_fill_boxes(sna, alu,
++				       dst_bo, dst->bitsPerPixel,
++				       pixel, box, n))
++			return true;
++
++		if (!gen9_check_dst_format(format)) /* render cannot take this format either */
++			return false;
++	}
++	/* Render path: PictOpClear is rewritten as PictOpSrc with a zero pixel. */
++	if (op == PictOpClear) {
++		pixel = 0;
++		op = PictOpSrc;
++	} else if (!sna_get_pixel_from_rgba(&pixel,
++					    color->red,
++					    color->green,
++					    color->blue,
++					    color->alpha,
++					    PICT_a8r8g8b8))
++		return false;
++
++	DBG(("%s(%08x x %d [(%d, %d), (%d, %d) ...])\n",
++	     __FUNCTION__, pixel, n,
++	     box[0].x1, box[0].y1, box[0].x2, box[0].y2));
++
++	tmp.dst.pixmap = (PixmapPtr)dst;
++	tmp.dst.width  = dst->width;
++	tmp.dst.height = dst->height;
++	tmp.dst.format = format;
++	tmp.dst.bo = dst_bo;
++	tmp.dst.x = tmp.dst.y = 0;
++	tmp.damage = NULL;
++
++	sna_render_composite_redirect_init(&tmp);
++	if (too_large(dst->width, dst->height)) { /* redirect to the boxes' extents, or tile if that fails */
++		BoxRec extents;
++
++		boxes_extents(box, n, &extents);
++		if (!sna_render_composite_redirect(sna, &tmp,
++						   extents.x1, extents.y1,
++						   extents.x2 - extents.x1,
++						   extents.y2 - extents.y1,
++						   n > 1))
++			return sna_tiling_fill_boxes(sna, op, format, color,
++						     dst, dst_bo, box, n);
++	}
++
++	tmp.src.bo = sna_render_get_solid(sna, pixel); /* destroyed again on every exit path below */
++	tmp.mask.bo = NULL;
++
++	tmp.floats_per_vertex = 2;
++	tmp.floats_per_rect = 6;
++	tmp.need_magic_ca_pass = false;
++
++	tmp.u.gen9.flags = FILL_FLAGS(op, format);
++
++	kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo);
++	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
++		kgem_submit(&sna->kgem); /* submit the current batch, then re-check once */
++		if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
++			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
++			tmp.src.bo = NULL;
++
++			if (tmp.redirect.real_bo) {
++				kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
++				tmp.redirect.real_bo = NULL;
++			}
++
++			return false;
++		}
++		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
++	}
++
++	gen9_align_vertex(sna, &tmp);
++	gen9_emit_fill_state(sna, &tmp);
++
++	do {
++		int n_this_time;
++		int16_t *v;
++
++		n_this_time = gen9_get_rectangles(sna, &tmp, n,
++						  gen9_emit_fill_state);
++		n -= n_this_time;
++
++		v = (int16_t *)(sna->render.vertices + sna->render.vertex_used);
++		sna->render.vertex_used += 6 * n_this_time; /* floats_per_rect (6) per box, written as 12 int16 values */
++		assert(sna->render.vertex_used <= sna->render.vertex_size);
++		do {
++			DBG(("	(%d, %d), (%d, %d)\n",
++			     box->x1, box->y1, box->x2, box->y2));
++			/* vertices (x2,y2,1,1), (x1,y2,0,1), (x1,y1,0,0) */
++			v[0] = box->x2;
++			v[5] = v[1] = box->y2;
++			v[8] = v[4] = box->x1;
++			v[9] = box->y1;
++			v[2] = v[3]  = v[7]  = 1;
++			v[6] = v[10] = v[11] = 0;
++			v += 12; box++;
++		} while (--n_this_time);
++	} while (n);
++
++	gen8_vertex_flush(sna);
++	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
++	sna_render_composite_redirect_done(sna, &tmp);
++	return true;
++}
++/* Emit one fill rectangle at (x, y) of size w x h; installed as op->blt by gen9_render_fill(). */
++static void
++gen9_render_fill_op_blt(struct sna *sna,
++			const struct sna_fill_op *op,
++			int16_t x, int16_t y, int16_t w, int16_t h)
++{
++	int16_t *v;
++
++	DBG(("%s: (%d, %d)x(%d, %d)\n", __FUNCTION__, x, y, w, h));
++
++	gen9_get_rectangles(sna, &op->base, 1, gen9_emit_fill_state);
++
++	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
++	sna->render.vertex_used += 6; /* one rectangle: 6 units, written as 12 int16 values */
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
++	/* vertices (x+w,y+h,1,1), (x,y+h,0,1), (x,y,0,0) */
++	v[0] = x+w;
++	v[4] = v[8] = x;
++	v[1] = v[5] = y+h;
++	v[9] = y;
++
++	v[2] = v[3]  = v[7]  = 1;
++	v[6] = v[10] = v[11] = 0;
++}
++/* Emit one fill rectangle covering box; installed as op->box by gen9_render_fill(). */
++fastcall static void
++gen9_render_fill_op_box(struct sna *sna,
++			const struct sna_fill_op *op,
++			const BoxRec *box)
++{
++	int16_t *v;
++
++	DBG(("%s: (%d, %d),(%d, %d)\n", __FUNCTION__,
++	     box->x1, box->y1, box->x2, box->y2));
++
++	gen9_get_rectangles(sna, &op->base, 1, gen9_emit_fill_state);
++
++	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
++	sna->render.vertex_used += 6; /* one rectangle: 6 units, written as 12 int16 values */
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
++	/* vertices (x2,y2,1,1), (x1,y2,0,1), (x1,y1,0,0) */
++	v[0] = box->x2;
++	v[8] = v[4] = box->x1;
++	v[5] = v[1] = box->y2;
++	v[9] = box->y1;
++
++	v[7] = v[2]  = v[3]  = 1;
++	v[6] = v[10] = v[11] = 0;
++}
++/* Emit nbox fill rectangles, in as many batches as gen9_get_rectangles() grants; installed as op->boxes. */
++fastcall static void
++gen9_render_fill_op_boxes(struct sna *sna,
++			  const struct sna_fill_op *op,
++			  const BoxRec *box,
++			  int nbox)
++{
++	DBG(("%s: (%d, %d),(%d, %d)... x %d\n", __FUNCTION__,
++	     box->x1, box->y1, box->x2, box->y2, nbox));
++
++	do {
++		int nbox_this_time;
++		int16_t *v;
++
++		nbox_this_time = gen9_get_rectangles(sna, &op->base, nbox,
++						     gen9_emit_fill_state);
++		nbox -= nbox_this_time;
++
++		v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
++		sna->render.vertex_used += 6 * nbox_this_time; /* 6 units per box, written as 12 int16 values */
++		assert(sna->render.vertex_used <= sna->render.vertex_size);
++		/* per box: vertices (x2,y2,1,1), (x1,y2,0,1), (x1,y1,0,0) */
++		do {
++			v[0] = box->x2;
++			v[8] = v[4] = box->x1;
++			v[5] = v[1] = box->y2;
++			v[9] = box->y1;
++			v[7] = v[2]  = v[3]  = 1;
++			v[6] = v[10] = v[11] = 0;
++			box++; v += 12;
++		} while (--nbox_this_time);
++	} while (nbox);
++}
++/* Finish a fill op: flush pending vertices and destroy the solid source bo taken in gen9_render_fill(). */
++static void
++gen9_render_fill_op_done(struct sna *sna, const struct sna_fill_op *op)
++{
++	if (sna->render.vertex_offset)
++		gen8_vertex_flush(sna);
++	kgem_bo_destroy(&sna->kgem, op->base.src.bo);
++}
++/* Set up a reusable solid-fill op on dst: use the BLT when preferred or required, else install render callbacks. */
++static bool
++gen9_render_fill(struct sna *sna, uint8_t alu,
++		 PixmapPtr dst, struct kgem_bo *dst_bo,
++		 uint32_t color, unsigned flags,
++		 struct sna_fill_op *op)
++{
++	DBG(("%s: (alu=%d, color=%x)\n", __FUNCTION__, alu, color));
++
++	if (prefer_blt_fill(sna, dst_bo, flags) &&
++	    sna_blt_fill(sna, alu,
++			 dst_bo, dst->drawable.bitsPerPixel,
++			 color,
++			 op))
++		return true;
++	/* Render is used only for GXcopy/GXclear on targets that pass too_large()/unaligned(). */
++	if (!(alu == GXcopy || alu == GXclear) ||
++	    too_large(dst->drawable.width, dst->drawable.height) ||
++	    unaligned(dst_bo, dst->drawable.bitsPerPixel))
++		return sna_blt_fill(sna, alu,
++				    dst_bo, dst->drawable.bitsPerPixel,
++				    color,
++				    op);
++
++	if (alu == GXclear) /* a clear is a fill with colour 0 */
++		color = 0;
++
++	op->base.dst.pixmap = dst;
++	op->base.dst.width  = dst->drawable.width;
++	op->base.dst.height = dst->drawable.height;
++	op->base.dst.format = sna_format_for_depth(dst->drawable.depth);
++	op->base.dst.bo = dst_bo;
++	op->base.dst.x = op->base.dst.y = 0;
++
++	op->base.src.bo =
++		sna_render_get_solid(sna,
++				     sna_rgba_for_color(color,
++							dst->drawable.depth));
++	op->base.mask.bo = NULL;
++
++	op->base.need_magic_ca_pass = false;
++	op->base.floats_per_vertex = 2;
++	op->base.floats_per_rect = 6;
++
++	op->base.u.gen9.flags = FILL_FLAGS_NOBLEND;
++
++	kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo);
++	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
++		kgem_submit(&sna->kgem); /* submit the current batch, then re-check once */
++		if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
++			kgem_bo_destroy(&sna->kgem, op->base.src.bo);
++			return false;
++		}
++
++		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
++	}
++
++	gen9_align_vertex(sna, &op->base);
++	gen9_emit_fill_state(sna, &op->base);
++
++	op->blt   = gen9_render_fill_op_blt;
++	op->box   = gen9_render_fill_op_box;
++	op->boxes = gen9_render_fill_op_boxes;
++	op->points = NULL; /* no points callback is provided on this path */
++	op->done  = gen9_render_fill_op_done; /* releases op->base.src.bo */
++	return true;
++}
++/* Fill the single box (x1, y1)-(x2, y2) of bo with color via sna_blt_fill_boxes(); returns its result. */
++static bool
++gen9_render_fill_one_try_blt(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
++			     uint32_t color,
++			     int16_t x1, int16_t y1, int16_t x2, int16_t y2,
++			     uint8_t alu)
++{
++	BoxRec box;
++
++	box.x1 = x1;
++	box.y1 = y1;
++	box.x2 = x2;
++	box.y2 = y2;
++
++	return sna_blt_fill_boxes(sna, alu,
++				  bo, dst->drawable.bitsPerPixel, /* dst is consulted only for its bpp */
++				  color, &box, 1);
++}
++/* Fill the single box (x1, y1)-(x2, y2) of dst/bo with color; returns false if neither BLT nor render could do it. */
++static bool
++gen9_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
++		     uint32_t color,
++		     int16_t x1, int16_t y1,
++		     int16_t x2, int16_t y2,
++		     uint8_t alu)
++{
++	struct sna_composite_op tmp;
++	int16_t *v;
++
++	/* Prefer to use the BLT if already engaged */
++	if (prefer_blt_fill(sna, bo, FILL_BOXES) &&
++	    gen9_render_fill_one_try_blt(sna, dst, bo, color,
++					 x1, y1, x2, y2, alu))
++		return true;
++
++	/* Must use the BLT if we can't RENDER... */
++	if (!(alu == GXcopy || alu == GXclear) ||
++	    too_large(dst->drawable.width, dst->drawable.height) ||
++	    unaligned(bo, dst->drawable.bitsPerPixel))
++		return gen9_render_fill_one_try_blt(sna, dst, bo, color,
++						    x1, y1, x2, y2, alu);
++
++	if (alu == GXclear) /* a clear is a fill with colour 0 */
++		color = 0;
++
++	tmp.dst.pixmap = dst;
++	tmp.dst.width  = dst->drawable.width;
++	tmp.dst.height = dst->drawable.height;
++	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
++	tmp.dst.bo = bo;
++	tmp.dst.x = tmp.dst.y = 0;
++
++	tmp.src.bo = /* solid source; destroyed again on every exit path below */
++		sna_render_get_solid(sna,
++				     sna_rgba_for_color(color,
++							dst->drawable.depth));
++	tmp.mask.bo = NULL;
++
++	tmp.floats_per_vertex = 2;
++	tmp.floats_per_rect = 6;
++	tmp.need_magic_ca_pass = false;
++
++	tmp.u.gen9.flags = FILL_FLAGS_NOBLEND;
++
++	kgem_set_mode(&sna->kgem, KGEM_RENDER, bo);
++	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
++		kgem_submit(&sna->kgem); /* submit the current batch, then re-check once */
++		if (!kgem_check_bo(&sna->kgem, bo, NULL)) { /* give up only if the re-check still fails */
++			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
++			return false;
++		}
++		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
++	}
++
++	gen9_align_vertex(sna, &tmp);
++	gen9_emit_fill_state(sna, &tmp);
++
++	gen9_get_rectangles(sna, &tmp, 1, gen9_emit_fill_state);
++
++	DBG(("	(%d, %d), (%d, %d)\n", x1, y1, x2, y2));
++
++	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
++	sna->render.vertex_used += 6; /* one rectangle: 6 units, written as 12 int16 values */
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
++	/* vertices (x2,y2,1,1), (x1,y2,0,1), (x1,y1,0,0) */
++	v[0] = x2;
++	v[8] = v[4] = x1;
++	v[5] = v[1] = y2;
++	v[9] = y1;
++	v[7] = v[2]  = v[3]  = 1;
++	v[6] = v[10] = v[11] = 0;
++
++	gen8_vertex_flush(sna);
++	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
++
++	return true;
++}
++/* Clear the whole pixmap, (0, 0)-(width, height), using GXclear with pixel 0 via sna_blt_fill_boxes(). */
++static bool
++gen9_render_clear_try_blt(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
++{
++	BoxRec box;
++
++	box.x1 = 0;
++	box.y1 = 0;
++	box.x2 = dst->drawable.width;
++	box.y2 = dst->drawable.height;
++
++	return sna_blt_fill_boxes(sna, GXclear,
++				  bo, dst->drawable.bitsPerPixel,
++				  0, &box, 1);
++}
++/* Clear all of dst: via the BLT when already in BLT mode or render is unusable, else one solid-0 rectangle. */
++static bool
++gen9_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
++{
++	struct sna_composite_op tmp;
++	int16_t *v;
++
++	DBG(("%s: %dx%d\n",
++	     __FUNCTION__,
++	     dst->drawable.width,
++	     dst->drawable.height));
++
++	/* Prefer to use the BLT if already engaged */
++	if (sna->kgem.mode == KGEM_BLT &&
++	    gen9_render_clear_try_blt(sna, dst, bo))
++		return true;
++
++	/* Must use the BLT if we can't RENDER... */
++	if (too_large(dst->drawable.width, dst->drawable.height) ||
++	    unaligned(bo, dst->drawable.bitsPerPixel))
++		return gen9_render_clear_try_blt(sna, dst, bo);
++
++	tmp.dst.pixmap = dst;
++	tmp.dst.width  = dst->drawable.width;
++	tmp.dst.height = dst->drawable.height;
++	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
++	tmp.dst.bo = bo;
++	tmp.dst.x = tmp.dst.y = 0;
++
++	tmp.src.bo = sna_render_get_solid(sna, 0); /* destroyed again on every exit path below */
++	tmp.mask.bo = NULL;
++
++	tmp.floats_per_vertex = 2;
++	tmp.floats_per_rect = 6;
++	tmp.need_magic_ca_pass = false;
++
++	tmp.u.gen9.flags = FILL_FLAGS_NOBLEND;
++
++	kgem_set_mode(&sna->kgem, KGEM_RENDER, bo);
++	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
++		kgem_submit(&sna->kgem); /* submit the current batch, then re-check once */
++		if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
++			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
++			return false;
++		}
++		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
++	}
++
++	gen9_align_vertex(sna, &tmp);
++	gen9_emit_fill_state(sna, &tmp);
++
++	gen9_get_rectangles(sna, &tmp, 1, gen9_emit_fill_state);
++
++	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
++	sna->render.vertex_used += 6; /* one rectangle: 6 units, written as 12 int16 values */
++	assert(sna->render.vertex_used <= sna->render.vertex_size);
++	/* vertices (width,height,1,1), (0,height,0,1), (0,0,0,0) */
++	v[0] = dst->drawable.width;
++	v[5] = v[1] = dst->drawable.height;
++	v[8] = v[4] = 0;
++	v[9] = 0;
++
++	v[7] = v[2]  = v[3]  = 1;
++	v[6] = v[10] = v[11] = 0;
++
++	gen8_vertex_flush(sna);
++	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
++
++	return true;
++}
++
++#if !NO_VIDEO
++static uint32_t gen9_bind_video_source(struct sna *sna,
++				       struct kgem_bo *bo,
++				       uint32_t delta,
++				       int width,
++				       int height,
++				       int pitch,
++				       uint32_t format)
++{
++	uint32_t *ss;
++	int offset;
++	/* Write a 2D sampler surface state for bo+delta into the batch; returns its byte offset in the batch. */
++	offset = sna->kgem.surface -= SURFACE_DW; /* reserve SURFACE_DW dwords by lowering kgem.surface */
++	ss = sna->kgem.batch + offset;
++	ss[0] = (SURFACE_2D << SURFACE_TYPE_SHIFT |
++		 gen9_tiling_bits(bo->tiling) |
++		 format << SURFACE_FORMAT_SHIFT |
++		 SURFACE_VALIGN_4 | SURFACE_HALIGN_4);
++	ss[1] = 0;
++	ss[2] = ((width - 1)  << SURFACE_WIDTH_SHIFT | /* width and height are stored minus one */
++		 (height - 1) << SURFACE_HEIGHT_SHIFT);
++	ss[3] = (pitch - 1) << SURFACE_PITCH_SHIFT; /* pitch is stored minus one */
++	ss[4] = 0;
++	ss[5] = 0;
++	ss[6] = 0;
++	ss[7] = SURFACE_SWIZZLE(RED, GREEN, BLUE, ALPHA);
++	*(uint64_t *)(ss+8) = /* 64-bit value at ss[8..9] comes from the relocation */
++		kgem_add_reloc64(&sna->kgem, offset + 8, bo,
++				 I915_GEM_DOMAIN_SAMPLER << 16,
++				 delta);
++	ss[10] = 0;
++	ss[11] = 0;
++	ss[12] = 0;
++	ss[13] = 0;
++	ss[14] = 0;
++	ss[15] = 0;
++
++	DBG(("[%x] bind bo(handle=%d, addr=%d), format=%d, width=%d, height=%d, pitch=%d, tiling=%d -> sampler\n",
++	     offset, bo->handle, ss[1],
++	     format, width, height, bo->pitch, bo->tiling));
++
++	return offset * sizeof(uint32_t); /* dword index converted to bytes */
++}
++/* Emit render state for video: slot 0 is the destination, then 6 source surfaces (planar) or 1 (otherwise). */
++static void gen9_emit_video_state(struct sna *sna,
++				  const struct sna_composite_op *op)
++{
++	struct sna_video_frame *frame = op->priv;
++	uint32_t src_surf_format;
++	uint32_t src_surf_base[6];
++	int src_width[6];
++	int src_height[6];
++	int src_pitch[6];
++	uint32_t *binding_table;
++	uint16_t offset;
++	int n_src, n;
++
++	/* XXX VeBox, bicubic */
++
++	gen9_get_batch(sna, op);
++	/* Offsets into frame->bo per surface: 0/1 at 0, 2/3 at VBufOffset, 4/5 at UBufOffset. */
++	src_surf_base[0] = 0;
++	src_surf_base[1] = 0;
++	src_surf_base[2] = frame->VBufOffset;
++	src_surf_base[3] = frame->VBufOffset;
++	src_surf_base[4] = frame->UBufOffset;
++	src_surf_base[5] = frame->UBufOffset;
++
++	if (is_planar_fourcc(frame->id)) {
++		src_surf_format = SURFACEFORMAT_R8_UNORM;
++		src_width[1]  = src_width[0]  = frame->width;
++		src_height[1] = src_height[0] = frame->height;
++		src_pitch[1]  = src_pitch[0]  = frame->pitch[1]; /* NOTE(review): surfaces 0/1 use pitch[1], 2..5 use pitch[0] - confirm ordering */
++		src_width[4]  = src_width[5]  = src_width[2]  = src_width[3] =
++			frame->width / 2;
++		src_height[4] = src_height[5] = src_height[2] = src_height[3] =
++			frame->height / 2;
++		src_pitch[4]  = src_pitch[5]  = src_pitch[2]  = src_pitch[3] =
++			frame->pitch[0];
++		n_src = 6;
++	} else {
++		if (frame->id == FOURCC_RGB888)
++			src_surf_format = SURFACEFORMAT_B8G8R8X8_UNORM;
++		else if (frame->id == FOURCC_UYVY)
++			src_surf_format = SURFACEFORMAT_YCRCB_SWAPY;
++		else
++			src_surf_format = SURFACEFORMAT_YCRCB_NORMAL;
++
++		src_width[0]  = frame->width;
++		src_height[0] = frame->height;
++		src_pitch[0]  = frame->pitch[0];
++		n_src = 1; /* only index 0 of the per-surface arrays is used */
++	}
++
++	binding_table = gen9_composite_get_binding_table(sna, &offset);
++
++	binding_table[0] =
++		gen9_bind_bo(sna,
++			     op->dst.bo, op->dst.width, op->dst.height,
++			     gen9_get_dest_format(op->dst.format),
++			     true);
++	for (n = 0; n < n_src; n++) { /* source surfaces occupy slots 1..n_src */
++		binding_table[1+n] =
++			gen9_bind_video_source(sna,
++					       frame->bo,
++					       src_surf_base[n],
++					       src_width[n],
++					       src_height[n],
++					       src_pitch[n],
++					       src_surf_format);
++	}
++
++	gen9_emit_state(sna, op, offset);
++}
++/* Map the frame's fourcc id to a WM kernel id: planar, RGB, or packed for everything else. */
++static unsigned select_video_kernel(const struct sna_video_frame *frame)
++{
++	switch (frame->id) {
++	case FOURCC_YV12:
++	case FOURCC_I420:
++	case FOURCC_XVMC:
++		return GEN9_WM_KERNEL_VIDEO_PLANAR;
++
++	case FOURCC_RGB888:
++	case FOURCC_RGB565:
++		return GEN9_WM_KERNEL_VIDEO_RGB;
++
++	default: /* any other id is handled by the packed kernel */
++		return GEN9_WM_KERNEL_VIDEO_PACKED;
++	}
++}
++
++/* Render the video frame into pixmap, emitting one 3-vertex rectangle per
++ * box of dstRegion.  Returns false if the destination and frame bos still
++ * fail kgem_check_bo() after submitting the batch, true otherwise. */
++static bool
++gen9_render_video(struct sna *sna, struct sna_video *video,
++		  struct sna_video_frame *frame,
++		  RegionPtr dstRegion, PixmapPtr pixmap)
++{
++	struct sna_composite_op tmp;
++	struct sna_pixmap *priv = sna_pixmap(pixmap);
++	int dst_width = dstRegion->extents.x2 - dstRegion->extents.x1;
++	int dst_height = dstRegion->extents.y2 - dstRegion->extents.y1;
++	int src_width = frame->src.x2 - frame->src.x1;
++	int src_height = frame->src.y2 - frame->src.y1;
++	float src_offset_x, src_offset_y, src_scale_x, src_scale_y;
++	unsigned filter;
++	const BoxRec *box;
++	int nbox;
++
++	DBG(("%s: src=(%d, %d), dst=(%d, %d), %dx[(%d, %d), (%d, %d)...]\n",
++	     __FUNCTION__,
++	     src_width, src_height, dst_width, dst_height,
++	     region_num_rects(dstRegion),
++	     REGION_EXTENTS(NULL, dstRegion)->x1,
++	     REGION_EXTENTS(NULL, dstRegion)->y1,
++	     REGION_EXTENTS(NULL, dstRegion)->x2,
++	     REGION_EXTENTS(NULL, dstRegion)->y2));
++
++	assert(priv->gpu_bo);
++	assert(!too_large(pixmap->drawable.width, pixmap->drawable.height));
++	assert(!unaligned(priv->gpu_bo, pixmap->drawable.bitsPerPixel));
++
++	memset(&tmp, 0, sizeof(tmp));
++
++	tmp.dst.pixmap = pixmap;
++	tmp.dst.width  = pixmap->drawable.width;
++	tmp.dst.height = pixmap->drawable.height;
++	tmp.dst.format = sna_render_format_for_depth(pixmap->drawable.depth);
++	tmp.dst.bo = priv->gpu_bo;
++
++	tmp.src.bo = frame->bo;
++	tmp.mask.bo = NULL;
++	/* Three vertices per box, each written as OUT_VERTEX + two OUT_VERTEX_F below. */
++	tmp.floats_per_vertex = 3;
++	tmp.floats_per_rect = 9;
++
++	DBG(("%s: scaling?=%d, planar?=%d [%x]\n",
++	     __FUNCTION__,
++	     src_width != dst_width || src_height != dst_height,
++	     is_planar_fourcc(frame->id), frame->id));
++	/* NEAREST when source and destination sizes match, BILINEAR when scaling. */
++	if (src_width == dst_width && src_height == dst_height)
++		filter = SAMPLER_FILTER_NEAREST;
++	else
++		filter = SAMPLER_FILTER_BILINEAR;
++
++	tmp.u.gen9.flags =
++		GEN9_SET_FLAGS(SAMPLER_OFFSET(filter, SAMPLER_EXTEND_PAD,
++					      SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE),
++			       NO_BLEND,
++			       select_video_kernel(frame),
++			       2);
++	tmp.priv = frame;
++	/* If the bos fail the check for the current batch, submit it and retry once. */
++	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp.dst.bo);
++	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) {
++		kgem_submit(&sna->kgem);
++		if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL))
++			return false;
++
++		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
++	}
++
++	gen9_align_vertex(sna, &tmp);
++	gen9_emit_video_state(sna, &tmp);
++
++	DBG(("%s: src=(%d, %d)x(%d, %d); frame=(%dx%d), dst=(%dx%d)\n",
++	     __FUNCTION__,
++	     frame->src.x1, frame->src.y1,
++	     src_width, src_height,
++	     frame->width, frame->height,
++	     dst_width, dst_height));
++	/* Map destination pixels to source coordinates normalised by the frame size. */
++	src_scale_x = (float)src_width / dst_width / frame->width;
++	src_offset_x = (float)frame->src.x1 / frame->width - dstRegion->extents.x1 * src_scale_x;
++
++	src_scale_y = (float)src_height / dst_height / frame->height;
++	src_offset_y = (float)frame->src.y1 / frame->height - dstRegion->extents.y1 * src_scale_y;
++
++	DBG(("%s: scale=(%f, %f), offset=(%f, %f)\n",
++	     __FUNCTION__,
++	     src_scale_x, src_scale_y,
++	     src_offset_x, src_offset_y));
++
++	box = region_rects(dstRegion);
++	nbox = region_num_rects(dstRegion);
++	while (nbox--) {
++		DBG(("%s: dst=(%d, %d), (%d, %d); src=(%f, %f), (%f, %f)\n",
++		     __FUNCTION__,
++		     box->x1, box->y1,
++		     box->x2, box->y2,
++		     box->x1 * src_scale_x + src_offset_x,
++		     box->y1 * src_scale_y + src_offset_y,
++		     box->x2 * src_scale_x + src_offset_x,
++		     box->y2 * src_scale_y + src_offset_y));
++
++		gen9_get_rectangles(sna, &tmp, 1, gen9_emit_video_state);
++		/* Three corners per box: (x2, y2), (x1, y2), (x1, y1). */
++		OUT_VERTEX(box->x2, box->y2);
++		OUT_VERTEX_F(box->x2 * src_scale_x + src_offset_x);
++		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
++
++		OUT_VERTEX(box->x1, box->y2);
++		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
++		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
++
++		OUT_VERTEX(box->x1, box->y1);
++		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
++		OUT_VERTEX_F(box->y1 * src_scale_y + src_offset_y);
++
++		box++;
++	}
++	gen8_vertex_flush(sna);
++
++	if (!DAMAGE_IS_ALL(priv->gpu_damage))
++		sna_damage_add(&priv->gpu_damage, dstRegion);
++
++	return true;
++}
++#endif
++/* Installed as sna->render.flush by gen9_render_init(). */
++static void gen9_render_flush(struct sna *sna)
++{
++	gen8_vertex_close(sna);
++	/* gen8_vertex_close() is expected to leave both of these at zero. */
++	assert(sna->render.vb_id == 0);
++	assert(sna->render.vertex_offset == 0);
++}
++
++static void gen9_render_reset(struct sna *sna)
++{
++	struct gen9_render_state *state = &sna->render_state.gen9;
++	/* Reinitialise the tracked gen9 state, then the vertex bookkeeping. */
++	state->emit_flush = false;
++	state->needs_invariant = true;
++	state->ve_id = 3 << 2;
++	state->ve_dirty = false;
++	state->last_primitive = -1;
++	state->num_sf_outputs = 0;
++	state->samplers = -1;
++	state->blend = -1;
++	state->kernel = -1;
++	state->drawrect_offset = -1;
++	state->drawrect_limit = -1;
++	state->surface_table = 0;
++
++	if (sna->render.vbo && !kgem_bo_can_map(&sna->kgem, sna->render.vbo)) {
++		DBG(("%s: discarding unmappable vbo\n", __FUNCTION__));
++		discard_vbo(sna);
++	}
++	sna->render.vertex_offset = 0;
++	sna->render.nvertex_reloc = 0;
++	sna->render.vb_id = 0;
++}
++/* Installed as sna->render.fini: destroys the general_bo stored by gen9_render_setup(). */
++static void gen9_render_fini(struct sna *sna)
++{
++	kgem_bo_destroy(&sna->kgem, sna->render_state.gen9.general_bo);
++}
++/* Fill a static stream with the gen9 WM kernels, sampler and blend state, and keep it as general_bo. */
++static bool gen9_render_setup(struct sna *sna)
++{
++	struct gen9_render_state *state = &sna->render_state.gen9;
++	struct sna_static_stream general;
++	struct gen9_sampler_state *ss;
++	int i, j, k, l, m;
++	uint32_t devid;
++	/* gt is derived from bits 7:4 of the device id, but only when its low nibble is non-zero. */
++	devid = intel_get_device_id(sna->dev);
++	if (devid & 0xf)
++		state->gt = GEN9_GT_BIAS + ((devid >> 4) & 0xf) + 1;
++	DBG(("%s: gt=%d\n", __FUNCTION__, state->gt));
++	/* Start from the minimal info; each platform check that matches overrides it in turn. */
++	state->info = &min_gt_info;
++	if (is_skl(sna))
++		state->info = &skl_gt_info;
++	if (is_bxt(sna))
++		state->info = &bxt_gt_info;
++	if (is_kbl(sna))
++		state->info = &kbl_gt_info;
++	if (is_glk(sna))
++		state->info = &glk_gt_info;
++
++	sna_static_stream_init(&general);
++
++	/* Zero pad the start. If you see an offset of 0x0 in the batchbuffer
++	 * dumps, you know it points to zero.
++	 */
++	null_create(&general);
++	/* Kernels with a non-zero size are copied as-is into slot [1]; the others are compiled per enabled dispatch width. */
++	for (m = 0; m < ARRAY_SIZE(wm_kernels); m++) {
++		if (wm_kernels[m].size) {
++			state->wm_kernel[m][1] =
++				sna_static_stream_add(&general,
++						      wm_kernels[m].data,
++						      wm_kernels[m].size,
++						      64);
++		} else {
++			if (USE_8_PIXEL_DISPATCH) {
++				state->wm_kernel[m][0] =
++					sna_static_stream_compile_wm(sna, &general,
++								     wm_kernels[m].data, 8);
++			}
++
++			if (USE_16_PIXEL_DISPATCH) {
++				state->wm_kernel[m][1] =
++					sna_static_stream_compile_wm(sna, &general,
++								     wm_kernels[m].data, 16);
++			}
++
++			if (USE_32_PIXEL_DISPATCH) {
++				state->wm_kernel[m][2] =
++					sna_static_stream_compile_wm(sna, &general,
++								     wm_kernels[m].data, 32);
++			}
++		}
++		assert(state->wm_kernel[m][0]|state->wm_kernel[m][1]|state->wm_kernel[m][2]);
++	}
++	/* Two sampler states per entry: copy, fill, then every (filter, extend) x (filter, extend) pairing. */
++	COMPILE_TIME_ASSERT(SAMPLER_OFFSET(FILTER_COUNT, EXTEND_COUNT, FILTER_COUNT, EXTEND_COUNT) <= 0x7ff);
++	ss = sna_static_stream_map(&general,
++				   2 * sizeof(*ss) *
++				   (2 +
++				    FILTER_COUNT * EXTEND_COUNT *
++				    FILTER_COUNT * EXTEND_COUNT),
++				   32);
++	state->wm_state = sna_static_stream_offsetof(&general, ss);
++	sampler_copy_init(ss); ss += 2;
++	sampler_fill_init(ss); ss += 2;
++	for (i = 0; i < FILTER_COUNT; i++) {
++		for (j = 0; j < EXTEND_COUNT; j++) {
++			for (k = 0; k < FILTER_COUNT; k++) {
++				for (l = 0; l < EXTEND_COUNT; l++) {
++					sampler_state_init(ss++, i, j);
++					sampler_state_init(ss++, k, l);
++				}
++			}
++		}
++	}
++
++	state->cc_blend = gen9_create_blend_state(&general);
++	/* Setup succeeds only if finishing the stream returned a bo. */
++	state->general_bo = sna_static_stream_fini(sna, &general);
++	return state->general_bo != NULL;
++}
++/* Install the gen9 render hooks on sna; returns info->name, or backend unchanged if setup fails. */
++const char *gen9_render_init(struct sna *sna, const char *backend)
++{
++	if (!gen9_render_setup(sna))
++		return backend;
++	/* The kgem hooks use the gen6_* and gen4_* implementations. */
++	sna->kgem.context_switch = gen6_render_context_switch;
++	sna->kgem.retire = gen6_render_retire;
++	sna->kgem.expire = gen4_render_expire;
++	/* Each NO_* switch that is set leaves the corresponding render hook as it was. */
++#if !NO_COMPOSITE
++	sna->render.composite = gen9_render_composite;
++	sna->render.prefer_gpu |= PREFER_GPU_RENDER;
++#endif
++#if !NO_COMPOSITE_SPANS
++	sna->render.check_composite_spans = gen9_check_composite_spans;
++	sna->render.composite_spans = gen9_render_composite_spans;
++	sna->render.prefer_gpu |= PREFER_GPU_SPANS;
++#endif
++#if !NO_VIDEO
++	sna->render.video = gen9_render_video;
++#endif
++
++#if !NO_COPY_BOXES
++	sna->render.copy_boxes = gen9_render_copy_boxes;
++#endif
++#if !NO_COPY
++	sna->render.copy = gen9_render_copy;
++#endif
++
++#if !NO_FILL_BOXES
++	sna->render.fill_boxes = gen9_render_fill_boxes;
++#endif
++#if !NO_FILL
++	sna->render.fill = gen9_render_fill;
++#endif
++#if !NO_FILL_ONE
++	sna->render.fill_one = gen9_render_fill_one;
++#endif
++#if !NO_FILL_CLEAR
++	sna->render.clear = gen9_render_clear;
++#endif
++
++	sna->render.flush = gen9_render_flush;
++	sna->render.reset = gen9_render_reset;
++	sna->render.fini = gen9_render_fini;
++
++	sna->render.max_3d_size = GEN9_MAX_SIZE;
++	sna->render.max_3d_pitch = 1 << 18;
++	return sna->render_state.gen9.info->name;
++}
+diff --git a/src/sna/gen9_render.h b/src/sna/gen9_render.h
+new file mode 100644
+index 00000000..e3cb3f93
+--- /dev/null
++++ b/src/sna/gen9_render.h
+@@ -0,0 +1,1130 @@
++#ifndef GEN9_RENDER_H
++#define GEN9_RENDER_H
++/* Mask with bits high..low (inclusive) set; unsigned so masks reaching bit 31 stay defined (width must be < 32). */
++#define INTEL_MASK(high, low) (((1u << ((high) - (low) + 1)) - 1) << (low))
++/* Compose an opcode value: constant 3 at bit 29, pipeline at bit 27, op at bit 24, sub at bit 16. */
++#define GEN9_3D(pipeline,op,sub) \
++	((3 << 29) | ((pipeline) << 27) | ((op) << 24) | ((sub) << 16))
++
++#define GEN9_STATE_BASE_ADDRESS			GEN9_3D(0, 1, 1)
++# define BASE_ADDRESS_MODIFY			(1 << 0)
++
++#define GEN9_STATE_SIP				GEN9_3D(0, 1, 2)
++
++#define GEN9_3DSTATE_VF_STATISTICS		GEN9_3D(1, 0, 0xb)
++#define GEN9_PIPELINE_SELECT			GEN9_3D(1, 1, 4)
++# define PIPELINE_SELECT_3D		0
++# define PIPELINE_SELECT_MEDIA		1
++#define PIPELINE_SELECTION_MASK            (3 << 8)
++
++#define GEN9_MEDIA_STATE_POINTERS		GEN9_3D(2, 0, 0)
++#define GEN9_MEDIA_OBJECT			GEN9_3D(2, 1, 0)
++
++#define GEN9_3DSTATE_CLEAR_PARAMS               GEN9_3D(3, 0, 0x04)
++#define GEN9_3DSTATE_DEPTH_BUFFER               GEN9_3D(3, 0, 0x05)
++# define DEPTH_BUFFER_TYPE_SHIFT	29
++# define DEPTH_BUFFER_FORMAT_SHIFT	18
++
++#define GEN9_3DSTATE_STENCIL_BUFFER		GEN9_3D(3, 0, 0x06)
++#define GEN9_3DSTATE_HIER_DEPTH_BUFFER		GEN9_3D(3, 0, 0x07)
++#define GEN9_3DSTATE_VERTEX_BUFFERS		GEN9_3D(3, 0, 0x08)
++# define VB_INDEX_SHIFT			26
++# define VB_MODIFY_ENABLE		(1 << 14)
++#define GEN9_3DSTATE_VERTEX_ELEMENTS		GEN9_3D(3, 0, 0x09)
++# define VE_INDEX_SHIFT		26
++# define VE_VALID					(1 << 25)
++# define VE_FORMAT_SHIFT				16
++# define VE_OFFSET_SHIFT				0
++# define VE_COMPONENT_0_SHIFT			28
++# define VE_COMPONENT_1_SHIFT			24
++# define VE_COMPONENT_2_SHIFT			20
++# define VE_COMPONENT_3_SHIFT			16
++#define GEN9_3DSTATE_INDEX_BUFFER		GEN9_3D(3, 0, 0x0a)
++#define GEN9_3DSTATE_VF				GEN9_3D(3, 0, 0x0c)
++
++#define GEN9_3DSTATE_MULTISAMPLE		GEN9_3D(3, 0, 0x0d)
++/* DW1 */
++# define MULTISAMPLE_PIXEL_LOCATION_CENTER		(0 << 4)
++# define MULTISAMPLE_PIXEL_LOCATION_UPPER_LEFT	(1 << 4)
++# define MULTISAMPLE_NUMSAMPLES_1			(0 << 1)
++# define MULTISAMPLE_NUMSAMPLES_4			(2 << 1)
++# define MULTISAMPLE_NUMSAMPLES_8			(3 << 1)
++
++#define GEN9_3DSTATE_CC_STATE_POINTERS		GEN9_3D(3, 0, 0x0e)
++#define GEN9_3DSTATE_SCISSOR_STATE_POINTERS	GEN9_3D(3, 0, 0x0f)
++
++#define GEN9_3DSTATE_VS				GEN9_3D(3, 0, 0x10)
++#define GEN9_3DSTATE_GS				GEN9_3D(3, 0, 0x11)
++#define GEN9_3DSTATE_CLIP			GEN9_3D(3, 0, 0x12)
++#define GEN9_3DSTATE_SF				GEN9_3D(3, 0, 0x13)
++# define SF_TRI_PROVOKE_SHIFT		29
++# define SF_LINE_PROVOKE_SHIFT		27
++# define SF_FAN_PROVOKE_SHIFT		25
++
++#define GEN9_3DSTATE_WM				GEN9_3D(3, 0, 0x14)
++/* DW1 */
++# define WM_STATISTICS_ENABLE                              (1 << 31)
++# define WM_DEPTH_CLEAR                                    (1 << 30)
++# define WM_DEPTH_RESOLVE                                  (1 << 28)
++# define WM_HIERARCHICAL_DEPTH_RESOLVE                     (1 << 27)
++# define WM_KILL_ENABLE                                    (1 << 25)
++# define WM_POSITION_ZW_PIXEL                              (0 << 17)
++# define WM_POSITION_ZW_CENTROID                           (2 << 17)
++# define WM_POSITION_ZW_SAMPLE                             (3 << 17)
++# define WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC              (1 << 16)
++# define WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC            (1 << 15)
++# define WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC               (1 << 14)
++# define WM_PERSPECTIVE_SAMPLE_BARYCENTRIC                 (1 << 13)
++# define WM_PERSPECTIVE_CENTROID_BARYCENTRIC               (1 << 12)
++# define WM_PERSPECTIVE_PIXEL_BARYCENTRIC                  (1 << 11)
++# define WM_LINE_END_CAP_AA_WIDTH_0_5                      (0 << 8)
++# define WM_LINE_END_CAP_AA_WIDTH_1_0                      (1 << 8)
++# define WM_LINE_END_CAP_AA_WIDTH_2_0                      (2 << 8)
++# define WM_LINE_END_CAP_AA_WIDTH_4_0                      (3 << 8)
++# define WM_LINE_AA_WIDTH_0_5                              (0 << 6)
++# define WM_LINE_AA_WIDTH_1_0                              (1 << 6)
++# define WM_LINE_AA_WIDTH_2_0                              (2 << 6)
++# define WM_LINE_AA_WIDTH_4_0                              (3 << 6)
++# define WM_POLYGON_STIPPLE_ENABLE                         (1 << 4)
++# define WM_LINE_STIPPLE_ENABLE                            (1 << 3)
++# define WM_POINT_RASTRULE_UPPER_RIGHT                     (1 << 2)
++# define WM_MSRAST_OFF_PIXEL                               (0 << 0)
++# define WM_MSRAST_OFF_PATTERN                             (1 << 0)
++# define WM_MSRAST_ON_PIXEL                                (2 << 0)
++# define WM_MSRAST_ON_PATTERN                              (3 << 0)
++
++#define GEN9_3DSTATE_CONSTANT_VS		GEN9_3D(3, 0, 0x15)
++#define GEN9_3DSTATE_CONSTANT_GS		GEN9_3D(3, 0, 0x16)
++#define GEN9_3DSTATE_CONSTANT_PS		GEN9_3D(3, 0, 0x17)
++
++#define GEN9_3DSTATE_SAMPLE_MASK		GEN9_3D(3, 0, 0x18)
++
++#define GEN9_3DSTATE_CONSTANT_HS                GEN9_3D(3, 0, 0x19)
++#define GEN9_3DSTATE_CONSTANT_DS                GEN9_3D(3, 0, 0x1a)
++
++#define GEN9_3DSTATE_HS                         GEN9_3D(3, 0, 0x1b)
++#define GEN9_3DSTATE_TE                         GEN9_3D(3, 0, 0x1c)
++#define GEN9_3DSTATE_DS                         GEN9_3D(3, 0, 0x1d)
++#define GEN9_3DSTATE_STREAMOUT                  GEN9_3D(3, 0, 0x1e)
++
++#define GEN9_3DSTATE_SBE                        GEN9_3D(3, 0, 0x1f)
++/* DW1 */
++# define SBE_FORCE_VERTEX_URB_READ_LENGTH  (1<<29)
++# define SBE_FORCE_VERTEX_URB_READ_OFFSET  (1<<28)
++# define SBE_NUM_OUTPUTS_SHIFT             22
++# define SBE_SWIZZLE_ENABLE                (1 << 21)
++# define SBE_POINT_SPRITE_LOWERLEFT        (1 << 20)
++# define SBE_URB_ENTRY_READ_LENGTH_SHIFT   11
++# define SBE_URB_ENTRY_READ_OFFSET_SHIFT   5
++#define SBE_ACTIVE_COMPONENT_NONE          0
++#define SBE_ACTIVE_COMPONENT_XY            1 
++#define SBE_ACTIVE_COMPONENT_XYZ           2
++#define SBE_ACTIVE_COMPONENT_XYZW          3
++
++
++#define GEN9_3DSTATE_PS                                 GEN9_3D(3, 0, 0x20)
++/* DW1:DW2 kernel pointer */
++/* DW3 */
++# define PS_SPF_MODE                               (1 << 31)
++# define PS_VECTOR_MASK_ENABLE                     (1 << 30)
++# define PS_SAMPLER_COUNT_SHIFT                    27
++# define PS_BINDING_TABLE_ENTRY_COUNT_SHIFT        18
++# define PS_FLOATING_POINT_MODE_IEEE_754           (0 << 16)
++# define PS_FLOATING_POINT_MODE_ALT                (1 << 16)
++/* DW4:DW5: scratch space */
++/* DW6 */
++# define PS_MAX_THREADS_SHIFT                      23
++# define PS_MAX_THREADS                            (63 << PS_MAX_THREADS_SHIFT)
++# define PS_PUSH_CONSTANT_ENABLE                   (1 << 11)
++# define PS_RENDER_TARGET_CLEAR			   (1 << 8)
++# define PS_RENDER_TARGET_RESOLVE		   (1 << 6)
++# define PS_POSOFFSET_NONE                         (0 << 3)
++# define PS_POSOFFSET_CENTROID                     (2 << 3)
++# define PS_POSOFFSET_SAMPLE                       (3 << 3)
++# define PS_32_DISPATCH_ENABLE                     (1 << 2)
++# define PS_16_DISPATCH_ENABLE                     (1 << 1)
++# define PS_8_DISPATCH_ENABLE                      (1 << 0)
++/* DW7 */
++# define PS_DISPATCH_START_GRF_SHIFT_0             16
++# define PS_DISPATCH_START_GRF_SHIFT_1             8
++# define PS_DISPATCH_START_GRF_SHIFT_2             0
++/* DW8:DW9: kernel 1 pointer */
++/* DW10:DW11: kernel 2 pointer */
++
++#define GEN9_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP      GEN9_3D(3, 0, 0x21)
++#define GEN9_3DSTATE_VIEWPORT_STATE_POINTERS_CC         GEN9_3D(3, 0, 0x23)
++
++#define GEN9_3DSTATE_BLEND_STATE_POINTERS               GEN9_3D(3, 0, 0x24)
++
++#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_VS          GEN9_3D(3, 0, 0x26)
++#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_HS          GEN9_3D(3, 0, 0x27)
++#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_DS          GEN9_3D(3, 0, 0x28)
++#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_GS          GEN9_3D(3, 0, 0x29)
++#define GEN9_3DSTATE_BINDING_TABLE_POINTERS_PS          GEN9_3D(3, 0, 0x2a)
++
++#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_VS          GEN9_3D(3, 0, 0x2b)
++#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_HS          GEN9_3D(3, 0, 0x2c)
++#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_DS          GEN9_3D(3, 0, 0x2d)
++#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_GS          GEN9_3D(3, 0, 0x2e)
++#define GEN9_3DSTATE_SAMPLER_STATE_POINTERS_PS          GEN9_3D(3, 0, 0x2f)
++
++#define GEN9_3DSTATE_URB_VS                             GEN9_3D(3, 0, 0x30)
++#define GEN9_3DSTATE_URB_HS                             GEN9_3D(3, 0, 0x31)
++#define GEN9_3DSTATE_URB_DS                             GEN9_3D(3, 0, 0x32)
++#define GEN9_3DSTATE_URB_GS                             GEN9_3D(3, 0, 0x33)
++/* DW1 */
++# define URB_ENTRY_NUMBER_SHIFT            0
++# define URB_ENTRY_SIZE_SHIFT              16
++# define URB_STARTING_ADDRESS_SHIFT        25
++
++#define GEN9_3DSTATE_GATHER_CONSTANT_VS             GEN9_3D(3, 0, 0x34)
++#define GEN9_3DSTATE_GATHER_CONSTANT_GS             GEN9_3D(3, 0, 0x35)
++#define GEN9_3DSTATE_GATHER_CONSTANT_HS             GEN9_3D(3, 0, 0x36)
++#define GEN9_3DSTATE_GATHER_CONSTANT_DS             GEN9_3D(3, 0, 0x37)
++#define GEN9_3DSTATE_GATHER_CONSTANT_PS             GEN9_3D(3, 0, 0x38)
++
++#define GEN9_3DSTATE_DX9_CONSTANTF_VS             GEN9_3D(3, 0, 0x39)
++#define GEN9_3DSTATE_DX9_CONSTANTF_PS             GEN9_3D(3, 0, 0x3a)
++#define GEN9_3DSTATE_DX9_CONSTANTI_VS             GEN9_3D(3, 0, 0x3b)
++#define GEN9_3DSTATE_DX9_CONSTANTI_PS             GEN9_3D(3, 0, 0x3c)
++#define GEN9_3DSTATE_DX9_CONSTANTB_VS             GEN9_3D(3, 0, 0x3d)
++#define GEN9_3DSTATE_DX9_CONSTANTB_PS             GEN9_3D(3, 0, 0x3e)
++#define GEN9_3DSTATE_DX9_LOCAL_VALID_VS           GEN9_3D(3, 0, 0x3f)
++#define GEN9_3DSTATE_DX9_LOCAL_VALID_PS           GEN9_3D(3, 0, 0x40)
++#define GEN9_3DSTATE_DX9_GENERATE_ACTIVE_VS       GEN9_3D(3, 0, 0x41)
++#define GEN9_3DSTATE_DX9_GENERATE_ACTIVE_PS       GEN9_3D(3, 0, 0x42)
++
++#define GEN9_3DSTATE_BINDING_TABLE_EDIT_VS       GEN9_3D(3, 0, 0x43)
++#define GEN9_3DSTATE_BINDING_TABLE_EDIT_GS       GEN9_3D(3, 0, 0x44)
++#define GEN9_3DSTATE_BINDING_TABLE_EDIT_HS       GEN9_3D(3, 0, 0x45)
++#define GEN9_3DSTATE_BINDING_TABLE_EDIT_DS       GEN9_3D(3, 0, 0x46)
++#define GEN9_3DSTATE_BINDING_TABLE_EDIT_PS       GEN9_3D(3, 0, 0x47)
++
++#define GEN9_3DSTATE_VF_INSTANCING		GEN9_3D(3, 0, 0x49)
++#define GEN9_3DSTATE_VF_SGVS			GEN9_3D(3, 0, 0x4a)
++# define SGVS_ENABLE_INSTANCE_ID			(1 << 31)
++# define SGVS_INSTANCE_ID_COMPONENT_SHIFT		29
++# define SGVS_INSTANCE_ID_ELEMENT_OFFSET_SHIFT	16
++# define SGVS_ENABLE_VERTEX_ID			(1 << 15)
++# define SGVS_VERTEX_ID_COMPONENT_SHIFT            13
++# define SGVS_VERTEX_ID_ELEMENT_OFFSET_SHIFT	0
++#define GEN9_3DSTATE_VF_TOPOLOGY		GEN9_3D(3, 0, 0x4b)
++# define POINTLIST         0x01
++# define LINELIST          0x02
++# define LINESTRIP         0x03
++# define TRILIST           0x04
++# define TRISTRIP          0x05
++# define TRIFAN            0x06
++# define QUADLIST          0x07
++# define QUADSTRIP         0x08
++# define LINELIST_ADJ      0x09
++# define LINESTRIP_ADJ     0x0A
++# define TRILIST_ADJ       0x0B
++# define TRISTRIP_ADJ      0x0C
++# define TRISTRIP_REVERSE  0x0D
++# define POLYGON           0x0E
++# define RECTLIST          0x0F
++# define LINELOOP          0x10
++# define POINTLIST_BF      0x11
++# define LINESTRIP_CONT    0x12
++# define LINESTRIP_BF      0x13
++# define LINESTRIP_CONT_BF 0x14
++# define TRIFAN_NOSTIPPLE  0x15
++
++#define GEN9_3DSTATE_WM_CHROMAKEY		GEN9_3D(3, 0, 0x4c)
++
++#define GEN9_3DSTATE_PS_BLEND				GEN9_3D(3, 0, 0x4d)
++# define PS_BLEND_ALPHA_TO_COVERAGE_ENABLE		(1 << 31)
++# define PS_BLEND_HAS_WRITEABLE_RT			(1 << 30)
++# define PS_BLEND_COLOR_BLEND_ENABLE			(1 << 29)
++# define PS_BLEND_SRC_ALPHA_SHIFT			24
++# define PS_BLEND_DST_ALPHA_SHIFT			19
++# define PS_BLEND_SRC_SHIFT				14
++# define PS_BLEND_DST_SHIFT				9
++# define PS_BLEND_ALPHA_TEST_ENABLE			(1 << 8)
++# define PS_BLEND_INDEPENDENT_ALPHA_BLEND_ENABLE	(1 << 7)
++
++#define GEN9_3DSTATE_WM_DEPTH_STENCIL		GEN9_3D(3, 0, 0x4e)
++/* DW1 -- NOTE(review): the stencil *_MASK/_SHIFT macros listed here are repeated verbatim under DW2 */
++# define WM_DS_STENCIL_TEST_MASK_MASK		INTEL_MASK(31, 24)
++# define WM_DS_STENCIL_TEST_MASK_SHIFT		24
++# define WM_DS_STENCIL_WRITE_MASK_MASK		INTEL_MASK(23, 16)
++# define WM_DS_STENCIL_WRITE_MASK_SHIFT		16
++# define WM_DS_BF_STENCIL_TEST_MASK_MASK		INTEL_MASK(15, 8)
++# define WM_DS_BF_STENCIL_TEST_MASK_SHIFT		8
++# define WM_DS_BF_STENCIL_WRITE_MASK_MASK		INTEL_MASK(7, 0)
++# define WM_DS_DEPTH_FUNC_SHIFT			5
++# define WM_DS_DOUBLE_SIDED_STENCIL_ENABLE		(1 << 4)
++# define WM_DS_STENCIL_TEST_ENABLE			(1 << 3)
++# define WM_DS_STENCIL_BUFFER_WRITE_ENABLE		(1 << 2)
++# define WM_DS_DEPTH_TEST_ENABLE			(1 << 1)
++# define WM_DS_DEPTH_BUFFER_WRITE_ENABLE		(1 << 0)
++/* DW2 */
++# define WM_DS_STENCIL_TEST_MASK_MASK		INTEL_MASK(31, 24)
++# define WM_DS_STENCIL_TEST_MASK_SHIFT		24
++# define WM_DS_STENCIL_WRITE_MASK_MASK		INTEL_MASK(23, 16)
++# define WM_DS_STENCIL_WRITE_MASK_SHIFT		16
++# define WM_DS_BF_STENCIL_TEST_MASK_MASK		INTEL_MASK(15, 8)
++# define WM_DS_BF_STENCIL_TEST_MASK_SHIFT		8
++# define WM_DS_BF_STENCIL_WRITE_MASK_MASK		INTEL_MASK(7, 0)
++# define WM_DS_BF_STENCIL_WRITE_MASK_SHIFT		0
++
++#define GEN9_3DSTATE_PS_EXTRA		GEN9_3D(3, 0, 0x4f)
++# define PSX_PIXEL_SHADER_VALID                    (1 << 31)
++# define PSX_PIXEL_SHADER_NO_RT_WRITE              (1 << 30)
++# define PSX_OMASK_TO_RENDER_TARGET                (1 << 29)
++# define PSX_KILL_ENABLE                           (1 << 28)
++# define PSX_PSCDEPTH_OFF                          (0 << 26)
++# define PSX_PSCDEPTH_ON                           (1 << 26)
++# define PSX_PSCDEPTH_ON_GE                        (2 << 26)
++# define PSX_PSCDEPTH_ON_LE                        (3 << 26)
++# define PSX_FORCE_COMPUTED_DEPTH                  (1 << 25)
++# define PSX_USES_SOURCE_DEPTH                     (1 << 24)
++# define PSX_USES_SOURCE_W                         (1 << 23)
++# define PSX_ATTRIBUTE_ENABLE                      (1 << 8)
++# define PSX_SHADER_DISABLES_ALPHA_TO_COVERAGE     (1 << 7)
++# define PSX_SHADER_IS_PER_SAMPLE                  (1 << 6)
++# define PSX_SHADER_HAS_UAV                        (1 << 2)
++# define PSX_SHADER_USES_INPUT_COVERAGE_MASK       (1 << 1)
++
++#define GEN9_3DSTATE_RASTER		GEN9_3D(3, 0, 0x50)
++/* DW1 */
++# define RASTER_FRONT_WINDING_CCW                  (1 << 21)
++# define RASTER_CULL_BOTH                          (0 << 16)
++# define RASTER_CULL_NONE                          (1 << 16)
++# define RASTER_CULL_FRONT                         (2 << 16)
++# define RASTER_CULL_BACK                          (3 << 16)
++# define RASTER_SMOOTH_POINT_ENABLE                (1 << 13)
++# define RASTER_LINE_AA_ENABLE                     (1 << 2)
++# define RASTER_VIEWPORT_Z_CLIP_TEST_ENABLE        (1 << 0)
++
++#define GEN9_3DSTATE_SBE_SWIZ		GEN9_3D(3, 0, 0x51)
++#define GEN9_3DSTATE_WM_HZ_OP		GEN9_3D(3, 0, 0x52)
++
++#define GEN9_3DSTATE_COMPONENT_PACKING          GEN9_3D(3, 0, 0x55)
++
++
++
++#define GEN9_3DSTATE_DRAWING_RECTANGLE		GEN9_3D(3, 1, 0x00)
++#define GEN9_3DSTATE_SAMPLER_PALETTE_LOAD	GEN9_3D(3, 1, 0x02)
++#define GEN9_3DSTATE_CHROMA_KEY			GEN9_3D(3, 1, 0x04)
++
++#define GEN9_3DSTATE_POLY_STIPPLE_OFFSET	GEN9_3D(3, 1, 0x06)
++#define GEN9_3DSTATE_POLY_STIPPLE_PATTERN	GEN9_3D(3, 1, 0x07)
++#define GEN9_3DSTATE_LINE_STIPPLE		GEN9_3D(3, 1, 0x08)
++#define GEN9_3DSTATE_AA_LINE_PARAMS		GEN9_3D(3, 1, 0x0a)
++#define GEN9_3DSTATE_SAMPLER_PALETTE_LOAD1	GEN9_3D(3, 1, 0x0c)
++#define GEN9_3DSTATE_MONOFILTER_SIZE		GEN9_3D(3, 1, 0x11)
++#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_VS	GEN9_3D(3, 1, 0x12)
++#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_HS	GEN9_3D(3, 1, 0x13)
++#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_DS	GEN9_3D(3, 1, 0x14)
++#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_GS	GEN9_3D(3, 1, 0x15)
++#define GEN9_3DSTATE_PUSH_CONSTANT_ALLOC_PS	GEN9_3D(3, 1, 0x16)
++/* DW1 */
++# define PUSH_CONSTANT_BUFFER_OFFSET_SHIFT 16
++# define PUSH_CONSTANT_BUFFER_SIZE_SHIFT 0
++
++#define GEN9_3DSTATE_SO_DECL_LIST		GEN9_3D(3, 1, 0x17)
++#define GEN9_3DSTATE_SO_BUFFER			GEN9_3D(3, 1, 0x18)
++#define GEN9_3DSTATE_BINDING_TABLE_POOL_ALLOC	GEN9_3D(3, 1, 0x19)
++#define GEN9_3DSTATE_GATHER_BUFFER_POOL_ALLOC	GEN9_3D(3, 1, 0x1a)
++#define GEN9_3DSTATE_DX9_CONSTANT_BUFFER_POOL_ALLOC	GEN9_3D(3, 1, 0x1b)
++#define GEN9_3DSTATE_SAMPLE_PATTERN		GEN9_3D(3, 1, 0x1c)
++
++
++/* for GEN9_PIPE_CONTROL */
++#define GEN9_PIPE_CONTROL		GEN9_3D(3, 2, 0)
++#define PIPE_CONTROL_CS_STALL      (1 << 20)
++#define PIPE_CONTROL_NOWRITE       (0 << 14)
++#define PIPE_CONTROL_WRITE_QWORD   (1 << 14)
++#define PIPE_CONTROL_WRITE_DEPTH   (2 << 14)
++#define PIPE_CONTROL_WRITE_TIME    (3 << 14)
++#define PIPE_CONTROL_DEPTH_STALL   (1 << 13)
++#define PIPE_CONTROL_WC_FLUSH      (1 << 12)
++#define PIPE_CONTROL_IS_FLUSH      (1 << 11)
++#define PIPE_CONTROL_TC_FLUSH      (1 << 10)
++#define PIPE_CONTROL_NOTIFY_ENABLE (1 << 8)
++#define PIPE_CONTROL_FLUSH         (1 << 7)
++#define PIPE_CONTROL_GLOBAL_GTT    (1 << 2)
++#define PIPE_CONTROL_LOCAL_PGTT    (0 << 2)
++#define PIPE_CONTROL_STALL_AT_SCOREBOARD   (1 << 1)
++#define PIPE_CONTROL_DEPTH_CACHE_FLUSH	(1 << 0)
++
++
++#define GEN9_3DPRIMITIVE			GEN9_3D(3, 3, 0)
++
++/* 3DPRIMITIVE bits */
++#define VERTEX_SEQUENTIAL (0 << 15)
++#define VERTEX_RANDOM	  (1 << 15)
++
++#define ANISORATIO_2     0
++#define ANISORATIO_4     1
++#define ANISORATIO_6     2
++#define ANISORATIO_8     3
++#define ANISORATIO_10    4
++#define ANISORATIO_12    5
++#define ANISORATIO_14    6
++#define ANISORATIO_16    7
++
++#define BLENDFACTOR_ONE                 0x1
++#define BLENDFACTOR_SRC_COLOR           0x2
++#define BLENDFACTOR_SRC_ALPHA           0x3
++#define BLENDFACTOR_DST_ALPHA           0x4
++#define BLENDFACTOR_DST_COLOR           0x5
++#define BLENDFACTOR_SRC_ALPHA_SATURATE  0x6
++#define BLENDFACTOR_CONST_COLOR         0x7
++#define BLENDFACTOR_CONST_ALPHA         0x8
++#define BLENDFACTOR_SRC1_COLOR          0x9
++#define BLENDFACTOR_SRC1_ALPHA          0x0A
++#define BLENDFACTOR_ZERO                0x11
++#define BLENDFACTOR_INV_SRC_COLOR       0x12
++#define BLENDFACTOR_INV_SRC_ALPHA       0x13
++#define BLENDFACTOR_INV_DST_ALPHA       0x14
++#define BLENDFACTOR_INV_DST_COLOR       0x15
++#define BLENDFACTOR_INV_CONST_COLOR     0x17
++#define BLENDFACTOR_INV_CONST_ALPHA     0x18
++#define BLENDFACTOR_INV_SRC1_COLOR      0x19
++#define BLENDFACTOR_INV_SRC1_ALPHA      0x1A
++
++#define BLENDFUNCTION_ADD               0
++#define BLENDFUNCTION_SUBTRACT          1
++#define BLENDFUNCTION_REVERSE_SUBTRACT  2
++#define GEN9_BLENDFUNCTION_MIN               3
++#define BLENDFUNCTION_MAX               4
++
++#define ALPHATEST_FORMAT_UNORM8         0
++#define ALPHATEST_FORMAT_FLOAT32        1
++
++#define CHROMAKEY_KILL_ON_ANY_MATCH  0
++#define CHROMAKEY_REPLACE_BLACK      1
++
++#define CLIP_API_OGL     0
++#define CLIP_API_DX      1
++
++#define CLIPMODE_NORMAL              0
++#define CLIPMODE_CLIP_ALL            1
++#define CLIPMODE_CLIP_NON_REJECTED   2
++#define CLIPMODE_REJECT_ALL          3
++#define CLIPMODE_ACCEPT_ALL          4
++
++#define CLIP_NDCSPACE     0
++#define CLIP_SCREENSPACE  1
++
++#define COMPAREFUNCTION_ALWAYS       0
++#define COMPAREFUNCTION_NEVER        1
++#define COMPAREFUNCTION_LESS         2
++#define COMPAREFUNCTION_EQUAL        3
++#define COMPAREFUNCTION_LEQUAL       4
++#define COMPAREFUNCTION_GREATER      5
++#define COMPAREFUNCTION_NOTEQUAL     6
++#define COMPAREFUNCTION_GEQUAL       7
++
++#define COVERAGE_PIXELS_HALF     0
++#define COVERAGE_PIXELS_1        1
++#define COVERAGE_PIXELS_2        2
++#define COVERAGE_PIXELS_4        3
++
++#define DEPTHFORMAT_D32_FLOAT_S8X24_UINT     0
++#define DEPTHFORMAT_D32_FLOAT                1
++#define DEPTHFORMAT_D24_UNORM_S8_UINT        2
++#define DEPTHFORMAT_D16_UNORM                5
++
++#define FLOATING_POINT_IEEE_754        0
++#define FLOATING_POINT_NON_IEEE_754    1
++
++#define INDEX_BYTE     0
++#define INDEX_WORD     1
++#define INDEX_DWORD    2
++
++#define LOGICOPFUNCTION_CLEAR            0
++#define LOGICOPFUNCTION_NOR              1
++#define LOGICOPFUNCTION_AND_INVERTED     2
++#define LOGICOPFUNCTION_COPY_INVERTED    3
++#define LOGICOPFUNCTION_AND_REVERSE      4
++#define LOGICOPFUNCTION_INVERT           5
++#define LOGICOPFUNCTION_XOR              6
++#define LOGICOPFUNCTION_NAND             7
++#define LOGICOPFUNCTION_AND              8
++#define LOGICOPFUNCTION_EQUIV            9
++#define LOGICOPFUNCTION_NOOP             10
++#define LOGICOPFUNCTION_OR_INVERTED      11
++#define LOGICOPFUNCTION_COPY             12
++#define LOGICOPFUNCTION_OR_REVERSE       13
++#define LOGICOPFUNCTION_OR               14
++#define LOGICOPFUNCTION_SET              15
++
++#define MAPFILTER_NEAREST	0x0
++#define MAPFILTER_LINEAR	0x1
++#define MAPFILTER_ANISOTROPIC	0x2
++#define MAPFILTER_FLEXIBLE 	0x3
++#define MAPFILTER_MONO 		0x6
++
++#define MIPFILTER_NONE        0
++#define MIPFILTER_NEAREST     1
++#define MIPFILTER_LINEAR      3
++
++#define POLYGON_FRONT_FACING     0
++#define POLYGON_BACK_FACING      1
++
++#define PREFILTER_ALWAYS     0x0
++#define PREFILTER_NEVER      0x1
++#define PREFILTER_LESS       0x2
++#define PREFILTER_EQUAL      0x3
++#define PREFILTER_LEQUAL     0x4
++#define PREFILTER_GREATER    0x5
++#define PREFILTER_NOTEQUAL   0x6
++#define PREFILTER_GEQUAL     0x7
++
++#define RASTRULE_UPPER_LEFT  0
++#define RASTRULE_UPPER_RIGHT 1
++
++#define STENCILOP_KEEP               0
++#define STENCILOP_ZERO               1
++#define STENCILOP_REPLACE            2
++#define STENCILOP_INCRSAT            3
++#define STENCILOP_DECRSAT            4
++#define STENCILOP_INCR               5
++#define STENCILOP_DECR               6
++#define STENCILOP_INVERT             7
++
++#define SURFACE_MIPMAPLAYOUT_BELOW   0
++#define SURFACE_MIPMAPLAYOUT_RIGHT   1
++
++#define SURFACEFORMAT_R32G32B32A32_FLOAT             0x000
++#define SURFACEFORMAT_R32G32B32A32_SINT              0x001
++#define SURFACEFORMAT_R32G32B32A32_UINT              0x002
++#define SURFACEFORMAT_R32G32B32A32_UNORM             0x003
++#define SURFACEFORMAT_R32G32B32A32_SNORM             0x004
++#define SURFACEFORMAT_R64G64_FLOAT                   0x005
++#define SURFACEFORMAT_R32G32B32X32_FLOAT             0x006
++#define SURFACEFORMAT_R32G32B32A32_SSCALED           0x007
++#define SURFACEFORMAT_R32G32B32A32_USCALED           0x008
++#define SURFACEFORMAT_R32G32B32_FLOAT                0x040
++#define SURFACEFORMAT_R32G32B32_SINT                 0x041
++#define SURFACEFORMAT_R32G32B32_UINT                 0x042
++#define SURFACEFORMAT_R32G32B32_UNORM                0x043
++#define SURFACEFORMAT_R32G32B32_SNORM                0x044
++#define SURFACEFORMAT_R32G32B32_SSCALED              0x045
++#define SURFACEFORMAT_R32G32B32_USCALED              0x046
++#define SURFACEFORMAT_R16G16B16A16_UNORM             0x080
++#define SURFACEFORMAT_R16G16B16A16_SNORM             0x081
++#define SURFACEFORMAT_R16G16B16A16_SINT              0x082
++#define SURFACEFORMAT_R16G16B16A16_UINT              0x083
++#define SURFACEFORMAT_R16G16B16A16_FLOAT             0x084
++#define SURFACEFORMAT_R32G32_FLOAT                   0x085
++#define SURFACEFORMAT_R32G32_SINT                    0x086
++#define SURFACEFORMAT_R32G32_UINT                    0x087
++#define SURFACEFORMAT_R32_FLOAT_X8X24_TYPELESS       0x088
++#define SURFACEFORMAT_X32_TYPELESS_G8X24_UINT        0x089
++#define SURFACEFORMAT_L32A32_FLOAT                   0x08A
++#define SURFACEFORMAT_R32G32_UNORM                   0x08B
++#define SURFACEFORMAT_R32G32_SNORM                   0x08C
++#define SURFACEFORMAT_R64_FLOAT                      0x08D
++#define SURFACEFORMAT_R16G16B16X16_UNORM             0x08E
++#define SURFACEFORMAT_R16G16B16X16_FLOAT             0x08F
++#define SURFACEFORMAT_A32X32_FLOAT                   0x090
++#define SURFACEFORMAT_L32X32_FLOAT                   0x091
++#define SURFACEFORMAT_I32X32_FLOAT                   0x092
++#define SURFACEFORMAT_R16G16B16A16_SSCALED           0x093
++#define SURFACEFORMAT_R16G16B16A16_USCALED           0x094
++#define SURFACEFORMAT_R32G32_SSCALED                 0x095
++#define SURFACEFORMAT_R32G32_USCALED                 0x096
++#define SURFACEFORMAT_B8G8R8A8_UNORM                 0x0C0
++#define SURFACEFORMAT_B8G8R8A8_UNORM_SRGB            0x0C1
++#define SURFACEFORMAT_R10G10B10A2_UNORM              0x0C2
++#define SURFACEFORMAT_R10G10B10A2_UNORM_SRGB         0x0C3
++#define SURFACEFORMAT_R10G10B10A2_UINT               0x0C4
++#define SURFACEFORMAT_R10G10B10_SNORM_A2_UNORM       0x0C5
++#define SURFACEFORMAT_R8G8B8A8_UNORM                 0x0C7
++#define SURFACEFORMAT_R8G8B8A8_UNORM_SRGB            0x0C8
++#define SURFACEFORMAT_R8G8B8A8_SNORM                 0x0C9
++#define SURFACEFORMAT_R8G8B8A8_SINT                  0x0CA
++#define SURFACEFORMAT_R8G8B8A8_UINT                  0x0CB
++#define SURFACEFORMAT_R16G16_UNORM                   0x0CC
++#define SURFACEFORMAT_R16G16_SNORM                   0x0CD
++#define SURFACEFORMAT_R16G16_SINT                    0x0CE
++#define SURFACEFORMAT_R16G16_UINT                    0x0CF
++#define SURFACEFORMAT_R16G16_FLOAT                   0x0D0
++#define SURFACEFORMAT_B10G10R10A2_UNORM              0x0D1
++#define SURFACEFORMAT_B10G10R10A2_UNORM_SRGB         0x0D2
++#define SURFACEFORMAT_R11G11B10_FLOAT                0x0D3
++#define SURFACEFORMAT_R32_SINT                       0x0D6
++#define SURFACEFORMAT_R32_UINT                       0x0D7
++#define SURFACEFORMAT_R32_FLOAT                      0x0D8
++#define SURFACEFORMAT_R24_UNORM_X8_TYPELESS          0x0D9
++#define SURFACEFORMAT_X24_TYPELESS_G8_UINT           0x0DA
++#define SURFACEFORMAT_L16A16_UNORM                   0x0DF
++#define SURFACEFORMAT_I24X8_UNORM                    0x0E0
++#define SURFACEFORMAT_L24X8_UNORM                    0x0E1
++#define SURFACEFORMAT_A24X8_UNORM                    0x0E2
++#define SURFACEFORMAT_I32_FLOAT                      0x0E3
++#define SURFACEFORMAT_L32_FLOAT                      0x0E4
++#define SURFACEFORMAT_A32_FLOAT                      0x0E5
++#define SURFACEFORMAT_B8G8R8X8_UNORM                 0x0E9
++#define SURFACEFORMAT_B8G8R8X8_UNORM_SRGB            0x0EA
++#define SURFACEFORMAT_R8G8B8X8_UNORM                 0x0EB
++#define SURFACEFORMAT_R8G8B8X8_UNORM_SRGB            0x0EC
++#define SURFACEFORMAT_R9G9B9E5_SHAREDEXP             0x0ED
++#define SURFACEFORMAT_B10G10R10X2_UNORM              0x0EE
++#define SURFACEFORMAT_L16A16_FLOAT                   0x0F0
++#define SURFACEFORMAT_R32_UNORM                      0x0F1
++#define SURFACEFORMAT_R32_SNORM                      0x0F2
++#define SURFACEFORMAT_R10G10B10X2_USCALED            0x0F3
++#define SURFACEFORMAT_R8G8B8A8_SSCALED               0x0F4
++#define SURFACEFORMAT_R8G8B8A8_USCALED               0x0F5
++#define SURFACEFORMAT_R16G16_SSCALED                 0x0F6
++#define SURFACEFORMAT_R16G16_USCALED                 0x0F7
++#define SURFACEFORMAT_R32_SSCALED                    0x0F8
++#define SURFACEFORMAT_R32_USCALED                    0x0F9
++#define SURFACEFORMAT_B5G6R5_UNORM                   0x100
++#define SURFACEFORMAT_B5G6R5_UNORM_SRGB              0x101
++#define SURFACEFORMAT_B5G5R5A1_UNORM                 0x102
++#define SURFACEFORMAT_B5G5R5A1_UNORM_SRGB            0x103
++#define SURFACEFORMAT_B4G4R4A4_UNORM                 0x104
++#define SURFACEFORMAT_B4G4R4A4_UNORM_SRGB            0x105
++#define SURFACEFORMAT_R8G8_UNORM                     0x106
++#define SURFACEFORMAT_R8G8_SNORM                     0x107
++#define SURFACEFORMAT_R8G8_SINT                      0x108
++#define SURFACEFORMAT_R8G8_UINT                      0x109
++#define SURFACEFORMAT_R16_UNORM                      0x10A
++#define SURFACEFORMAT_R16_SNORM                      0x10B
++#define SURFACEFORMAT_R16_SINT                       0x10C
++#define SURFACEFORMAT_R16_UINT                       0x10D
++#define SURFACEFORMAT_R16_FLOAT                      0x10E
++#define SURFACEFORMAT_I16_UNORM                      0x111
++#define SURFACEFORMAT_L16_UNORM                      0x112
++#define SURFACEFORMAT_A16_UNORM                      0x113
++#define SURFACEFORMAT_L8A8_UNORM                     0x114
++#define SURFACEFORMAT_I16_FLOAT                      0x115
++#define SURFACEFORMAT_L16_FLOAT                      0x116
++#define SURFACEFORMAT_A16_FLOAT                      0x117
++#define SURFACEFORMAT_R5G5_SNORM_B6_UNORM            0x119
++#define SURFACEFORMAT_B5G5R5X1_UNORM                 0x11A
++#define SURFACEFORMAT_B5G5R5X1_UNORM_SRGB            0x11B
++#define SURFACEFORMAT_R8G8_SSCALED                   0x11C
++#define SURFACEFORMAT_R8G8_USCALED                   0x11D
++#define SURFACEFORMAT_R16_SSCALED                    0x11E
++#define SURFACEFORMAT_R16_USCALED                    0x11F
++#define SURFACEFORMAT_R8_UNORM                       0x140
++#define SURFACEFORMAT_R8_SNORM                       0x141
++#define SURFACEFORMAT_R8_SINT                        0x142
++#define SURFACEFORMAT_R8_UINT                        0x143
++#define SURFACEFORMAT_A8_UNORM                       0x144
++#define SURFACEFORMAT_I8_UNORM                       0x145
++#define SURFACEFORMAT_L8_UNORM                       0x146
++#define SURFACEFORMAT_P4A4_UNORM                     0x147
++#define SURFACEFORMAT_A4P4_UNORM                     0x148
++#define SURFACEFORMAT_R8_SSCALED                     0x149
++#define SURFACEFORMAT_R8_USCALED                     0x14A
++#define SURFACEFORMAT_R1_UINT                        0x181
++#define SURFACEFORMAT_YCRCB_NORMAL                   0x182
++#define SURFACEFORMAT_YCRCB_SWAPUVY                  0x183
++#define SURFACEFORMAT_BC1_UNORM                      0x186
++#define SURFACEFORMAT_BC2_UNORM                      0x187
++#define SURFACEFORMAT_BC3_UNORM                      0x188
++#define SURFACEFORMAT_BC4_UNORM                      0x189
++#define SURFACEFORMAT_BC5_UNORM                      0x18A
++#define SURFACEFORMAT_BC1_UNORM_SRGB                 0x18B
++#define SURFACEFORMAT_BC2_UNORM_SRGB                 0x18C
++#define SURFACEFORMAT_BC3_UNORM_SRGB                 0x18D
++#define SURFACEFORMAT_MONO8                          0x18E
++#define SURFACEFORMAT_YCRCB_SWAPUV                   0x18F
++#define SURFACEFORMAT_YCRCB_SWAPY                    0x190
++#define SURFACEFORMAT_DXT1_RGB                       0x191
++#define SURFACEFORMAT_FXT1                           0x192
++#define SURFACEFORMAT_R8G8B8_UNORM                   0x193
++#define SURFACEFORMAT_R8G8B8_SNORM                   0x194
++#define SURFACEFORMAT_R8G8B8_SSCALED                 0x195
++#define SURFACEFORMAT_R8G8B8_USCALED                 0x196
++#define SURFACEFORMAT_R64G64B64A64_FLOAT             0x197
++#define SURFACEFORMAT_R64G64B64_FLOAT                0x198
++#define SURFACEFORMAT_BC4_SNORM                      0x199
++#define SURFACEFORMAT_BC5_SNORM                      0x19A
++#define SURFACEFORMAT_R16G16B16_UNORM                0x19C
++#define SURFACEFORMAT_R16G16B16_SNORM                0x19D
++#define SURFACEFORMAT_R16G16B16_SSCALED              0x19E
++#define SURFACEFORMAT_R16G16B16_USCALED              0x19F
++
++#define SURFACE_1D      0
++#define SURFACE_2D      1
++#define SURFACE_3D      2
++#define SURFACE_CUBE    3
++#define SURFACE_BUFFER  4
++#define SURFACE_NULL    7
++
++#define TEXCOORDMODE_WRAP            0
++#define TEXCOORDMODE_MIRROR          1
++#define TEXCOORDMODE_CLAMP           2
++#define TEXCOORDMODE_CUBE            3
++#define TEXCOORDMODE_CLAMP_BORDER    4
++#define TEXCOORDMODE_MIRROR_ONCE     5
++
++#define THREAD_PRIORITY_NORMAL   0
++#define THREAD_PRIORITY_HIGH     1
++
++#define VERTEX_SUBPIXEL_PRECISION_8BITS  0
++#define VERTEX_SUBPIXEL_PRECISION_4BITS  1
++
++#define COMPONENT_NOSTORE      0
++#define COMPONENT_STORE_SRC    1
++#define COMPONENT_STORE_0      2
++#define COMPONENT_STORE_1_FLT  3
++#define COMPONENT_STORE_1_INT  4
++#define COMPONENT_STORE_VID    5
++#define COMPONENT_STORE_IID    6
++#define COMPONENT_STORE_PID    7
++
++/* Execution Unit (EU) defines
++ */
++
++#define GEN9_ALIGN_1   0
++#define GEN9_ALIGN_16  1
++
++#define GEN9_ADDRESS_DIRECT                        0
++#define GEN9_ADDRESS_REGISTER_INDIRECT_REGISTER    1
++
++#define GEN9_CHANNEL_X     0
++#define GEN9_CHANNEL_Y     1
++#define GEN9_CHANNEL_Z     2
++#define GEN9_CHANNEL_W     3
++
++#define GEN9_COMPRESSION_NONE          0
++#define GEN9_COMPRESSION_2NDHALF       1
++#define GEN9_COMPRESSION_COMPRESSED    2
++
++#define GEN9_CONDITIONAL_NONE  0
++#define GEN9_CONDITIONAL_Z     1
++#define GEN9_CONDITIONAL_NZ    2
++#define GEN9_CONDITIONAL_EQ    1	/* Z */
++#define GEN9_CONDITIONAL_NEQ   2	/* NZ */
++#define GEN9_CONDITIONAL_G     3
++#define GEN9_CONDITIONAL_GE    4
++#define GEN9_CONDITIONAL_L     5
++#define GEN9_CONDITIONAL_LE    6
++#define GEN9_CONDITIONAL_C     7
++#define GEN9_CONDITIONAL_O     8
++
++#define GEN9_DEBUG_NONE        0
++#define GEN9_DEBUG_BREAKPOINT  1
++
++#define GEN9_DEPENDENCY_NORMAL         0
++#define GEN9_DEPENDENCY_NOTCLEARED     1
++#define GEN9_DEPENDENCY_NOTCHECKED     2
++#define GEN9_DEPENDENCY_DISABLE        3
++
++#define GEN9_EXECUTE_1     0
++#define GEN9_EXECUTE_2     1
++#define GEN9_EXECUTE_4     2
++#define GEN9_EXECUTE_8     3
++#define GEN9_EXECUTE_16    4
++#define GEN9_EXECUTE_32    5
++
++#define GEN9_HORIZONTAL_STRIDE_0   0
++#define GEN9_HORIZONTAL_STRIDE_1   1
++#define GEN9_HORIZONTAL_STRIDE_2   2
++#define GEN9_HORIZONTAL_STRIDE_4   3
++
++#define GEN9_INSTRUCTION_NORMAL    0
++#define GEN9_INSTRUCTION_SATURATE  1
++
++#define GEN9_OPCODE_MOV        1
++#define GEN9_OPCODE_SEL        2
++#define GEN9_OPCODE_NOT        4
++#define GEN9_OPCODE_AND        5
++#define GEN9_OPCODE_OR         6
++#define GEN9_OPCODE_XOR        7
++#define GEN9_OPCODE_SHR        8
++#define GEN9_OPCODE_SHL        9
++#define GEN9_OPCODE_RSR        10
++#define GEN9_OPCODE_RSL        11
++#define GEN9_OPCODE_ASR        12
++#define GEN9_OPCODE_CMP        16
++#define GEN9_OPCODE_JMPI       32
++#define GEN9_OPCODE_IF         34
++#define GEN9_OPCODE_IFF        35
++#define GEN9_OPCODE_ELSE       36
++#define GEN9_OPCODE_ENDIF      37
++#define GEN9_OPCODE_DO         38
++#define GEN9_OPCODE_WHILE      39
++#define GEN9_OPCODE_BREAK      40
++#define GEN9_OPCODE_CONTINUE   41
++#define GEN9_OPCODE_HALT       42
++#define GEN9_OPCODE_MSAVE      44
++#define GEN9_OPCODE_MRESTORE   45
++#define GEN9_OPCODE_PUSH       46
++#define GEN9_OPCODE_POP        47
++#define GEN9_OPCODE_WAIT       48
++#define GEN9_OPCODE_SEND       49
++#define GEN9_OPCODE_ADD        64
++#define GEN9_OPCODE_MUL        65
++#define GEN9_OPCODE_AVG        66
++#define GEN9_OPCODE_FRC        67
++#define GEN9_OPCODE_RNDU       68
++#define GEN9_OPCODE_RNDD       69
++#define GEN9_OPCODE_RNDE       70
++#define GEN9_OPCODE_RNDZ       71
++#define GEN9_OPCODE_MAC        72
++#define GEN9_OPCODE_MACH       73
++#define GEN9_OPCODE_LZD        74
++#define GEN9_OPCODE_SAD2       80
++#define GEN9_OPCODE_SADA2      81
++#define GEN9_OPCODE_DP4        84
++#define GEN9_OPCODE_DPH        85
++#define GEN9_OPCODE_DP3        86
++#define GEN9_OPCODE_DP2        87
++#define GEN9_OPCODE_DPA2       88
++#define GEN9_OPCODE_LINE       89
++#define GEN9_OPCODE_NOP        126
++
++#define GEN9_PREDICATE_NONE             0
++#define GEN9_PREDICATE_NORMAL           1
++#define GEN9_PREDICATE_ALIGN1_ANYV             2
++#define GEN9_PREDICATE_ALIGN1_ALLV             3
++#define GEN9_PREDICATE_ALIGN1_ANY2H            4
++#define GEN9_PREDICATE_ALIGN1_ALL2H            5
++#define GEN9_PREDICATE_ALIGN1_ANY4H            6
++#define GEN9_PREDICATE_ALIGN1_ALL4H            7
++#define GEN9_PREDICATE_ALIGN1_ANY8H            8
++#define GEN9_PREDICATE_ALIGN1_ALL8H            9
++#define GEN9_PREDICATE_ALIGN1_ANY16H           10
++#define GEN9_PREDICATE_ALIGN1_ALL16H           11
++#define GEN9_PREDICATE_ALIGN16_REPLICATE_X     2
++#define GEN9_PREDICATE_ALIGN16_REPLICATE_Y     3
++#define GEN9_PREDICATE_ALIGN16_REPLICATE_Z     4
++#define GEN9_PREDICATE_ALIGN16_REPLICATE_W     5
++#define GEN9_PREDICATE_ALIGN16_ANY4H           6
++#define GEN9_PREDICATE_ALIGN16_ALL4H           7
++
++#define GEN9_ARCHITECTURE_REGISTER_FILE    0
++#define GEN9_GENERAL_REGISTER_FILE         1
++#define GEN9_MESSAGE_REGISTER_FILE         2
++#define GEN9_IMMEDIATE_VALUE               3
++
++#define GEN9_REGISTER_TYPE_UD  0
++#define GEN9_REGISTER_TYPE_D   1
++#define GEN9_REGISTER_TYPE_UW  2
++#define GEN9_REGISTER_TYPE_W   3
++#define GEN9_REGISTER_TYPE_UB  4
++#define GEN9_REGISTER_TYPE_B   5
++#define GEN9_REGISTER_TYPE_VF  5	/* packed float vector, immediates only? */
++#define GEN9_REGISTER_TYPE_HF  6
++#define GEN9_REGISTER_TYPE_V   6	/* packed int vector, immediates only, uword dest only */
++#define GEN9_REGISTER_TYPE_F   7
++
++#define GEN9_ARF_NULL                  0x00
++#define GEN9_ARF_ADDRESS               0x10
++#define GEN9_ARF_ACCUMULATOR           0x20
++#define GEN9_ARF_FLAG                  0x30
++#define GEN9_ARF_MASK                  0x40
++#define GEN9_ARF_MASK_STACK            0x50
++#define GEN9_ARF_MASK_STACK_DEPTH      0x60
++#define GEN9_ARF_STATE                 0x70
++#define GEN9_ARF_CONTROL               0x80
++#define GEN9_ARF_NOTIFICATION_COUNT    0x90
++#define GEN9_ARF_IP                    0xA0
++
++#define GEN9_AMASK   0
++#define GEN9_IMASK   1
++#define GEN9_LMASK   2
++#define GEN9_CMASK   3
++
++#define GEN9_THREAD_NORMAL     0
++#define GEN9_THREAD_ATOMIC     1
++#define GEN9_THREAD_SWITCH     2
++
++#define GEN9_VERTICAL_STRIDE_0                 0
++#define GEN9_VERTICAL_STRIDE_1                 1
++#define GEN9_VERTICAL_STRIDE_2                 2
++#define GEN9_VERTICAL_STRIDE_4                 3
++#define GEN9_VERTICAL_STRIDE_8                 4
++#define GEN9_VERTICAL_STRIDE_16                5
++#define GEN9_VERTICAL_STRIDE_32                6
++#define GEN9_VERTICAL_STRIDE_64                7
++#define GEN9_VERTICAL_STRIDE_128               8
++#define GEN9_VERTICAL_STRIDE_256               9
++#define GEN9_VERTICAL_STRIDE_ONE_DIMENSIONAL   0xF
++
++#define GEN9_WIDTH_1       0
++#define GEN9_WIDTH_2       1
++#define GEN9_WIDTH_4       2
++#define GEN9_WIDTH_8       3
++#define GEN9_WIDTH_16      4
++
++#define GEN9_STATELESS_BUFFER_BOUNDARY_1K      0
++#define GEN9_STATELESS_BUFFER_BOUNDARY_2K      1
++#define GEN9_STATELESS_BUFFER_BOUNDARY_4K      2
++#define GEN9_STATELESS_BUFFER_BOUNDARY_8K      3
++#define GEN9_STATELESS_BUFFER_BOUNDARY_16K     4
++#define GEN9_STATELESS_BUFFER_BOUNDARY_32K     5
++#define GEN9_STATELESS_BUFFER_BOUNDARY_64K     6
++#define GEN9_STATELESS_BUFFER_BOUNDARY_128K    7
++#define GEN9_STATELESS_BUFFER_BOUNDARY_256K    8
++#define GEN9_STATELESS_BUFFER_BOUNDARY_512K    9
++#define GEN9_STATELESS_BUFFER_BOUNDARY_1M      10
++#define GEN9_STATELESS_BUFFER_BOUNDARY_2M      11
++
++#define GEN9_POLYGON_FACING_FRONT      0
++#define GEN9_POLYGON_FACING_BACK       1
++
++#define GEN9_MESSAGE_TARGET_NULL               0
++#define GEN9_MESSAGE_TARGET_MATH               1
++#define GEN9_MESSAGE_TARGET_SAMPLER            2
++#define GEN9_MESSAGE_TARGET_GATEWAY            3
++#define GEN9_MESSAGE_TARGET_DATAPORT_READ      4
++#define GEN9_MESSAGE_TARGET_DATAPORT_WRITE     5
++#define GEN9_MESSAGE_TARGET_URB                6
++#define GEN9_MESSAGE_TARGET_THREAD_SPAWNER     7
++
++#define GEN9_SAMPLER_RETURN_FORMAT_FLOAT32     0
++#define GEN9_SAMPLER_RETURN_FORMAT_UINT32      2
++#define GEN9_SAMPLER_RETURN_FORMAT_SINT32      3
++
++#define GEN9_SAMPLER_MESSAGE_SIMD8_SAMPLE              0
++#define GEN9_SAMPLER_MESSAGE_SIMD16_SAMPLE             0
++#define GEN9_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS        0
++#define GEN9_SAMPLER_MESSAGE_SIMD8_KILLPIX             1
++#define GEN9_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD        1
++#define GEN9_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD         1
++#define GEN9_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS  2
++#define GEN9_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS    2
++#define GEN9_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE    0
++#define GEN9_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE     2
++#define GEN9_SAMPLER_MESSAGE_SIMD4X2_RESINFO           2
++#define GEN9_SAMPLER_MESSAGE_SIMD8_RESINFO             2
++#define GEN9_SAMPLER_MESSAGE_SIMD16_RESINFO            2
++#define GEN9_SAMPLER_MESSAGE_SIMD4X2_LD                3
++#define GEN9_SAMPLER_MESSAGE_SIMD8_LD                  3
++#define GEN9_SAMPLER_MESSAGE_SIMD16_LD                 3
++
++#define GEN9_DATAPORT_OWORD_BLOCK_1_OWORDLOW   0
++#define GEN9_DATAPORT_OWORD_BLOCK_1_OWORDHIGH  1
++#define GEN9_DATAPORT_OWORD_BLOCK_2_OWORDS     2
++#define GEN9_DATAPORT_OWORD_BLOCK_4_OWORDS     3
++#define GEN9_DATAPORT_OWORD_BLOCK_8_OWORDS     4
++
++#define GEN9_DATAPORT_OWORD_DUAL_BLOCK_1OWORD     0
++#define GEN9_DATAPORT_OWORD_DUAL_BLOCK_4OWORDS    2
++
++#define GEN9_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS   2
++#define GEN9_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS  3
++
++#define GEN9_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ          0
++#define GEN9_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ     1
++#define GEN9_DATAPORT_READ_MESSAGE_DWORD_BLOCK_READ          2
++#define GEN9_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ      3
++
++#define GEN9_DATAPORT_READ_TARGET_DATA_CACHE      0
++#define GEN9_DATAPORT_READ_TARGET_RENDER_CACHE    1
++#define GEN9_DATAPORT_READ_TARGET_SAMPLER_CACHE   2
++
++#define GEN9_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE                0
++#define GEN9_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED     1
++#define GEN9_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01         2
++#define GEN9_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23         3
++#define GEN9_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01       4
++
++#define GEN9_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE                0
++#define GEN9_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE           1
++#define GEN9_DATAPORT_WRITE_MESSAGE_DWORD_BLOCK_WRITE                2
++#define GEN9_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE            3
++#define GEN9_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE              4
++#define GEN9_DATAPORT_WRITE_MESSAGE_STREAMED_VERTEX_BUFFER_WRITE     5
++#define GEN9_DATAPORT_WRITE_MESSAGE_FLUSH_RENDER_CACHE               7
++
++#define GEN9_MATH_FUNCTION_INV                              1
++#define GEN9_MATH_FUNCTION_LOG                              2
++#define GEN9_MATH_FUNCTION_EXP                              3
++#define GEN9_MATH_FUNCTION_SQRT                             4
++#define GEN9_MATH_FUNCTION_RSQ                              5
++#define GEN9_MATH_FUNCTION_SIN                              6 /* was 7 */
++#define GEN9_MATH_FUNCTION_COS                              7 /* was 8 */
++#define GEN9_MATH_FUNCTION_SINCOS                           8 /* was 6 */
++#define GEN9_MATH_FUNCTION_TAN                              9
++#define GEN9_MATH_FUNCTION_POW                              10
++#define GEN9_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER   11
++#define GEN9_MATH_FUNCTION_INT_DIV_QUOTIENT                 12
++#define GEN9_MATH_FUNCTION_INT_DIV_REMAINDER                13
++
++#define GEN9_MATH_INTEGER_UNSIGNED     0
++#define GEN9_MATH_INTEGER_SIGNED       1
++
++#define GEN9_MATH_PRECISION_FULL        0
++#define GEN9_MATH_PRECISION_PARTIAL     1
++
++#define GEN9_MATH_SATURATE_NONE         0
++#define GEN9_MATH_SATURATE_SATURATE     1
++
++#define GEN9_MATH_DATA_VECTOR  0
++#define GEN9_MATH_DATA_SCALAR  1
++
++#define GEN9_URB_OPCODE_WRITE  0
++
++#define GEN9_URB_SWIZZLE_NONE          0
++#define GEN9_URB_SWIZZLE_INTERLEAVE    1
++#define GEN9_URB_SWIZZLE_TRANSPOSE     2
++
++#define GEN9_SCRATCH_SPACE_SIZE_1K     0
++#define GEN9_SCRATCH_SPACE_SIZE_2K     1
++#define GEN9_SCRATCH_SPACE_SIZE_4K     2
++#define GEN9_SCRATCH_SPACE_SIZE_8K     3
++#define GEN9_SCRATCH_SPACE_SIZE_16K    4
++#define GEN9_SCRATCH_SPACE_SIZE_32K    5
++#define GEN9_SCRATCH_SPACE_SIZE_64K    6
++#define GEN9_SCRATCH_SPACE_SIZE_128K   7
++#define GEN9_SCRATCH_SPACE_SIZE_256K   8
++#define GEN9_SCRATCH_SPACE_SIZE_512K   9
++#define GEN9_SCRATCH_SPACE_SIZE_1M     10
++#define GEN9_SCRATCH_SPACE_SIZE_2M     11
++
++struct gen9_blend_state { /* each NN annotation is the running bit offset of the field inside its sub-struct */
++	struct {
++		/* 00 */ uint32_t pad:19;
++		/* 19 */ uint32_t y_dither_offset:2;
++		/* 21 */ uint32_t x_dither_offset:2;
++		/* 23 */ uint32_t color_dither_enable:1;
++		/* 24 */ uint32_t alpha_test_function:3;
++		/* 27 */ uint32_t alpha_test:1;
++		/* 28 */ uint32_t alpha_to_coverage_dither:1;
++		/* 29 */ uint32_t alpha_to_one:1;
++		/* 30 */ uint32_t ia_blend:1;
++		/* 31 */ uint32_t alpha_to_coverage:1;
++	} common; /* field widths sum to 32 bits */
++
++	struct {
++		/* 00 */ uint32_t write_disable_blue:1;
++		/* 01 */ uint32_t write_disable_green:1;
++		/* 02 */ uint32_t write_disable_red:1;
++		/* 03 */ uint32_t write_disable_alpha:1;
++		/* 04 */ uint32_t pad0:1;
++		/* 05 */ uint32_t alpha_blend_function:3;
++		/* 08 */ uint32_t dest_alpha_blend_factor:5;
++		/* 13 */ uint32_t source_alpha_blend_factor:5;
++		/* 18 */ uint32_t color_blend_function:3;
++		/* 21 */ uint32_t dest_blend_factor:5;
++		/* 26 */ uint32_t source_blend_factor:5;
++		/* 31 */ uint32_t color_blend:1;
++		/* 32 */ uint32_t post_blend_clamp:1;
++		/* 33 */ uint32_t pre_blend_clamp:1;
++		/* 34 */ uint32_t color_clamp_range:2;
++		/* 36 */ uint32_t pre_blend_source_only_clamp:1;
++		/* 37 */ uint32_t pad1:22;
++		/* 59 */ uint32_t logic_op_function:4;
++		/* 63 */ uint32_t logic_op:1;
++	} rt; /* field widths sum to 64 bits; "rt" presumably means render target — confirm */
++};
++
++struct gen9_color_calc_state { /* cc0 bitfields, the cc1 alpha reference, then four float constants */
++	struct {
++		/* 00 */ uint32_t alpha_test_format:1;
++		/* 01 */ uint32_t pad0:14;
++		/* 15 */ uint32_t round_disable:1;
++		/* 16 */ uint32_t bf_stencil_ref:8;
++		/* 24 */ uint32_t stencil_ref:8;
++	} cc0; /* field widths sum to 32 bits */
++
++	union { /* the same storage viewed as a float or as an 8-bit field plus 24 pad bits */
++		float alpha_ref_f;
++		struct {
++			uint32_t ui:8;
++			uint32_t pad0:24;
++		} alpha_ref_fi;
++	} cc1; /* NOTE(review): which view applies is presumably chosen by cc0.alpha_test_format — confirm */
++
++	float constant_r;
++	float constant_g;
++	float constant_b;
++	float constant_a;
++};
++
++struct gen9_sampler_state { /* four sub-structs ss0..ss3; NN annotations are running bit offsets within each */
++	struct {
++		/* 00 */ unsigned int aniso_algorithm:1;
++		/* 01 */ unsigned int lod_bias:13;
++		/* 14 */ unsigned int min_filter:3;
++		/* 17 */ unsigned int mag_filter:3;
++		/* 20 */ unsigned int mip_filter:2;
++		/* 22 */ unsigned int base_level:5;
++		/* 27 */ unsigned int lod_preclamp:2;
++		/* 29 */ unsigned int default_color_mode:1;
++		/* 30 */ unsigned int flexible_filter_clamp:1;
++		/* 31 */ unsigned int disable:1;
++	} ss0; /* field widths sum to 32 bits */
++
++	struct {
++		/* 00 */ unsigned int cube_control_mode:1;
++		/* 01 */ unsigned int shadow_function:3;
++		/* 04 */ unsigned int chroma_key_mode:1;
++		/* 05 */ unsigned int chroma_key_index:2;
++		/* 07 */ unsigned int chroma_key_enable:1;
++		/* 08 */ unsigned int max_lod:12;
++		/* 20 */ unsigned int min_lod:12;
++	} ss1; /* field widths sum to 32 bits */
++
++	struct {
++		unsigned int pad:6;
++		unsigned int default_color_pointer:26; /* 26 bits after 6 pad bits; presumably an address without its low 6 bits — confirm */
++	} ss2; /* field widths sum to 32 bits */
++
++	struct {
++		/* 00 */ unsigned int r_wrap_mode:3;
++		/* 03 */ unsigned int t_wrap_mode:3;
++		/* 06 */ unsigned int s_wrap_mode:3;
++		/* 09 */ unsigned int pad:1;
++		/* 10 */ unsigned int non_normalized_coord:1;
++		/* 11 */ unsigned int trilinear_quality:2;
++		/* 13 */ unsigned int address_round:6;
++		/* 19 */ unsigned int max_aniso:3;
++		/* 22 */ unsigned int pad0:2;
++		/* 24 */ unsigned int non_separable_filter:8;
++	} ss3; /* field widths sum to 32 bits */
++};
++
++/* Surface state DW0 */
++#define SURFACE_RC_READ_WRITE	(1 << 8)
++#define SURFACE_TILED		(1 << 13)
++#define SURFACE_TILED_Y		(1 << 12)
++#define SURFACE_FORMAT_SHIFT	18
++#define SURFACE_VALIGN_1	(0 << 16) /* reserved! */
++#define SURFACE_VALIGN_4	(1 << 16)
++#define SURFACE_VALIGN_8	(2 << 16)
++#define SURFACE_VALIGN_16	(3 << 16)
++#define SURFACE_HALIGN_1	(0 << 14) /* reserved! */
++#define SURFACE_HALIGN_4	(1 << 14)
++#define SURFACE_HALIGN_8	(2 << 14)
++#define SURFACE_HALIGN_16	(3 << 14)
++#define SURFACE_TYPE_SHIFT		29
++
++/* Surface state DW2 */
++#define SURFACE_HEIGHT_SHIFT        16
++#define SURFACE_WIDTH_SHIFT         0
++
++/* Surface state DW3 */
++#define SURFACE_DEPTH_SHIFT         21
++#define SURFACE_PITCH_SHIFT         0
++
++#define SWIZZLE_ZERO		0 /* selector values accepted by SURFACE_SWIZZLE(); all fit in 3 bits */
++#define SWIZZLE_ONE		1
++#define SWIZZLE_RED		4
++#define SWIZZLE_GREEN		5
++#define SWIZZLE_BLUE		6
++#define SWIZZLE_ALPHA		7
++#define __SURFACE_SWIZZLE(r,g,b,a) \
++	((a) << 16 | (b) << 19 | (g) << 22 | (r) << 25) /* a at bit 16, b at 19, g at 22, r at 25 */
++#define SURFACE_SWIZZLE(r,g,b,a) \
++	__SURFACE_SWIZZLE(SWIZZLE_##r, SWIZZLE_##g, SWIZZLE_##b, SWIZZLE_##a) /* args are suffixes: ZERO, ONE, RED, GREEN, BLUE, ALPHA */
++
++typedef enum { /* sampler filter selectors, numbered consecutively from 0 */
++	SAMPLER_FILTER_NEAREST = 0,
++	SAMPLER_FILTER_BILINEAR,
++	FILTER_COUNT /* not a filter: equals the number of filters above (2) */
++} sampler_filter_t;
++
++typedef enum { /* sampler extend selectors, numbered consecutively from 0 */
++	SAMPLER_EXTEND_NONE = 0,
++	SAMPLER_EXTEND_REPEAT,
++	SAMPLER_EXTEND_PAD,
++	SAMPLER_EXTEND_REFLECT,
++	EXTEND_COUNT /* not an extend mode: equals the number of modes above (4) */
++} sampler_extend_t;
++
++#endif
+diff --git a/src/sna/kgem.c b/src/sna/kgem.c
+index 78ed5407..f0d171ac 100644
+--- a/src/sna/kgem.c
++++ b/src/sna/kgem.c
+@@ -84,6 +84,10 @@ search_snoop_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags);
+ #define DBG_NO_HANDLE_LUT 0
+ #define DBG_NO_WT 0
+ #define DBG_NO_WC_MMAP 0
++#define DBG_NO_BLT_Y 0
++#define DBG_NO_SCANOUT_Y 0
++#define DBG_NO_DIRTYFB 0
++#define DBG_NO_DETILING 0
+ #define DBG_DUMP 0
+ #define DBG_NO_MALLOC_CACHE 0
+ 
+@@ -96,11 +100,6 @@ search_snoop_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags);
+ #define SHOW_BATCH_BEFORE 0
+ #define SHOW_BATCH_AFTER 0
+ 
+-#if !USE_WC_MMAP
+-#undef DBG_NO_WC_MMAP
+-#define DBG_NO_WC_MMAP 1
+-#endif
+-
+ #if 0
+ #define ASSERT_IDLE(kgem__, handle__) assert(!__kgem_busy(kgem__, handle__))
+ #define ASSERT_MAYBE_IDLE(kgem__, handle__, expect__) assert(!(expect__) || !__kgem_busy(kgem__, handle__))
+@@ -187,6 +186,15 @@ struct local_i915_gem_caching {
+ #define LOCAL_IOCTL_I915_GEM_SET_CACHING DRM_IOW(DRM_COMMAND_BASE + LOCAL_I915_GEM_SET_CACHING, struct local_i915_gem_caching)
+ #define LOCAL_IOCTL_I915_GEM_GET_CACHING DRM_IOW(DRM_COMMAND_BASE + LOCAL_I915_GEM_GET_CACHING, struct local_i915_gem_caching)
+ 
++struct local_i915_gem_mmap { /* argument block of LOCAL_IOCTL_I915_GEM_MMAP; presumably mirrors the kernel's drm_i915_gem_mmap — confirm */
++	uint32_t handle;
++	uint32_t pad;
++	uint64_t offset;
++	uint64_t size;
++	uint64_t addr_ptr;
++};
++#define LOCAL_IOCTL_I915_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MMAP, struct local_i915_gem_mmap)
++
+ struct local_i915_gem_mmap2 {
+ 	uint32_t handle;
+ 	uint32_t pad;
+@@ -216,6 +224,12 @@ static struct kgem_bo *__kgem_freed_bo;
+ static struct kgem_request *__kgem_freed_request;
+ static struct drm_i915_gem_exec_object2 _kgem_dummy_exec;
+ 
++static inline struct sna *__to_sna(struct kgem *kgem)
++{
++	/* Minor layering violation: treat kgem as the 'kgem' member of a struct sna and recover that enclosing object. */
++	return container_of(kgem, struct sna, kgem);
++}
++
+ static inline int bytes(struct kgem_bo *bo)
+ {
+ 	return __kgem_bo_size(bo);
+@@ -224,25 +238,31 @@ static inline int bytes(struct kgem_bo *bo)
+ #define bucket(B) (B)->size.pages.bucket
+ #define num_pages(B) (B)->size.pages.count
+ 
+-static int do_ioctl(int fd, unsigned long req, void *arg)
++static int __do_ioctl(int fd, unsigned long req, void *arg) /* retry path: do_ioctl() calls this after its own ioctl() failed */
+ {
+-	int err;
+-
+-restart:
+-	if (ioctl(fd, req, arg) == 0)
+-		return 0;
++	do {
++		int err;
+ 
+-	err = errno;
++		switch ((err = errno)) { /* errno as left by the most recent failed ioctl() */
++		case EAGAIN:
++			sched_yield(); /* fallthrough: yield first, then retry exactly like EINTR */
++		case EINTR:
++			break;
++		default:
++			return -err; /* any other error is returned negated */
++		}
+ 
+-	if (err == EINTR)
+-		goto restart;
++		if (likely(ioctl(fd, req, arg) == 0))
++			return 0;
++	} while (1);
++}
+ 
+-	if (err == EAGAIN) {
+-		sched_yield();
+-		goto restart;
+-	}
++inline static int do_ioctl(int fd, unsigned long req, void *arg) /* returns 0 on success, else a negated errno from __do_ioctl() */
++{
++	if (likely(ioctl(fd, req, arg) == 0))
++		return 0;
+ 
+-	return -err;
++	return __do_ioctl(fd, req, arg);
+ }
+ 
+ #ifdef DEBUG_MEMORY
+@@ -266,6 +286,9 @@ static void assert_tiling(struct kgem *kgem, struct kgem_bo *bo)
+ 
+ 	assert(bo);
+ 
++	if (!kgem->can_fence && kgem->gen >= 040 && bo->tiling)
++		return; /* lies */
++
+ 	VG_CLEAR(tiling);
+ 	tiling.handle = bo->handle;
+ 	tiling.tiling_mode = bo->tiling;
+@@ -273,7 +296,7 @@ static void assert_tiling(struct kgem *kgem, struct kgem_bo *bo)
+ 	assert(tiling.tiling_mode == bo->tiling);
+ }
+ 
+-static void assert_cacheing(struct kgem *kgem, struct kgem_bo *bo)
++static void assert_caching(struct kgem *kgem, struct kgem_bo *bo)
+ {
+ 	struct local_i915_gem_caching arg;
+ 	int expect = kgem->has_llc ? SNOOPED : UNCACHED;
+@@ -294,24 +317,117 @@ static void assert_bo_retired(struct kgem_bo *bo)
+ 	assert(bo->refcnt);
+ 	assert(bo->rq == NULL);
+ 	assert(bo->exec == NULL);
++	assert(!bo->needs_flush);
+ 	assert(list_is_empty(&bo->request));
+ }
+ #else
+ #define assert_tiling(kgem, bo)
+-#define assert_cacheing(kgem, bo)
++#define assert_caching(kgem, bo)
+ #define assert_bo_retired(bo)
+ #endif
+ 
++static int __find_debugfs(struct kgem *kgem)
++{
++	int i;
++	/* Return the first index below DRM_MAX_MINOR that has a readable i915_wedged node, or -1; kgem is unused. */
++	for (i = 0; i < DRM_MAX_MINOR; i++) {
++		char path[80];
++		/* Bounded formatting, as in find_hang_state(); try the /sys/kernel/debug location first. */
++		snprintf(path, sizeof(path), "/sys/kernel/debug/dri/%d/i915_wedged", i);
++		if (access(path, R_OK) == 0)
++			return i;
++		/* Then the alternative /debug location. */
++		snprintf(path, sizeof(path), "/debug/dri/%d/i915_wedged", i);
++		if (access(path, R_OK) == 0)
++			return i;
++	}
++
++	return -1;
++}
++
++static int kgem_get_minor(struct kgem *kgem)
++{
++	struct stat st;
++	/* Returns a device index for kgem->fd; if the fd cannot be stat'ed, fall back to probing debugfs. */
++	if (fstat(kgem->fd, &st))
++		return __find_debugfs(kgem);
++	/* Same fallback when the fd is not a character device. */
++	if (!S_ISCHR(st.st_mode))
++		return __find_debugfs(kgem);
++	/* Keep the low six bits (0..63); the previous 0x63 mask lost bits 2-4, so e.g. minor 4 became 0. */
++	return st.st_rdev & 0x3f;
++}
++
++static bool find_hang_state(struct kgem *kgem, char *path, int maxlen)
++{
++	/* Candidate locations of the hang state, probed in this order; %d is
++	 * filled with the value returned by kgem_get_minor(). In the unlikely
++	 * event of multiple devices, that minor may still need cross-checking.
++	 */
++	static const char * const candidates[] = {
++		"/sys/class/drm/card%d/error",
++		"/sys/kernel/debug/dri/%d/i915_error_state",
++		"/debug/dri/%d/i915_error_state",
++	};
++	const int dev = kgem_get_minor(kgem);
++	unsigned int n;
++
++	for (n = 0; n < sizeof(candidates) / sizeof(candidates[0]); n++) {
++		snprintf(path, maxlen, candidates[n], dev);
++		if (access(path, R_OK) == 0)
++			return true; /* path now names the readable location */
++	}
++
++	/* Nothing readable: hand back an empty string. */
++	path[0] = '\0';
++	return false;
++}
++
++static bool has_error_state(struct kgem *kgem, char *path)
++{
++	/* True when path can be opened and its first byte is readable and not 'N'; kgem is unused. */
++	char first;
++	bool found;
++	int fd = open(path, O_RDONLY);
++
++	if (fd < 0)
++		return false;
++
++	found = read(fd, &first, 1) == 1 && first != 'N';
++	close(fd);
++	return found;
++}
++
++static int kgem_get_screen_index(struct kgem *kgem)
++{
++	return __to_sna(kgem)->scrn->scrnIndex; /* scrnIndex of the enclosing sna's scrn; passed to xf86DrvMsg() by __kgem_set_wedged() */
++}
++
+ static void
+ __kgem_set_wedged(struct kgem *kgem)
+ {
++	static int once; /* set once the hint below has been logged, so it is printed at most once */
++	char path[256];
++
++	if (kgem->wedged)
++		return; /* already marked wedged: nothing more to do */
++	/* First time a readable error-state file is found, tell the user where it is. */
++	if (!once &&
++	    find_hang_state(kgem, path, sizeof(path)) &&
++            has_error_state(kgem, path)) {
++		xf86DrvMsg(kgem_get_screen_index(kgem), X_ERROR,
++			   "When reporting this, please include %s and the full dmesg.\n",
++			   path);
++		once = 1;
++	}
++
+ 	kgem->wedged = true;
+-	sna_render_mark_wedged(container_of(kgem, struct sna, kgem));
++	sna_render_mark_wedged(__to_sna(kgem));
+ }
+ 
+ static void kgem_sna_reset(struct kgem *kgem)
+ {
+-	struct sna *sna = container_of(kgem, struct sna, kgem);
++	struct sna *sna = __to_sna(kgem);
+ 
+ 	sna->render.reset(sna);
+ 	sna->blt_state.fill_bo = 0;
+@@ -319,7 +435,7 @@ static void kgem_sna_reset(struct kgem *kgem)
+ 
+ static void kgem_sna_flush(struct kgem *kgem)
+ {
+-	struct sna *sna = container_of(kgem, struct sna, kgem);
++	struct sna *sna = __to_sna(kgem);
+ 
+ 	sna->render.flush(sna);
+ 
+@@ -327,22 +443,53 @@ static void kgem_sna_flush(struct kgem *kgem)
+ 		sna_render_flush_solid(sna);
+ }
+ 
+-static bool gem_set_tiling(int fd, uint32_t handle, int tiling, int stride)
++static bool kgem_bo_rmfb(struct kgem *kgem, struct kgem_bo *bo)
++{
++	if (bo->scanout && bo->delta) {
++		DBG(("%s: releasing fb=%d for handle=%d\n",
++		     __FUNCTION__, bo->delta, bo->handle));
++		/* XXX will leak if we are not DRM_MASTER. *shrug* */
++		do_ioctl(kgem->fd, DRM_IOCTL_MODE_RMFB, &bo->delta);
++		bo->delta = 0;
++		return true;
++	} else
++		return false;
++}
++
++static bool kgem_set_tiling(struct kgem *kgem, struct kgem_bo *bo,
++			    int tiling, int stride)
+ {
+ 	struct drm_i915_gem_set_tiling set_tiling;
+ 	int err;
+ 
++	if (tiling == bo->tiling) {
++		if (tiling == I915_TILING_NONE) {
++			bo->pitch = stride;
++			return true;
++		}
++		if (stride == bo->pitch)
++			return true;
++	}
++
+ 	if (DBG_NO_TILING)
+ 		return false;
+ 
+ 	VG_CLEAR(set_tiling);
+ restart:
+-	set_tiling.handle = handle;
++	set_tiling.handle = bo->handle;
+ 	set_tiling.tiling_mode = tiling;
+-	set_tiling.stride = stride;
++	set_tiling.stride = tiling ? stride : 0;
+ 
+-	if (ioctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling) == 0)
+-		return true;
++	if (ioctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling) == 0) {
++		bo->tiling = set_tiling.tiling_mode;
++		bo->pitch = set_tiling.tiling_mode ? set_tiling.stride : stride;
++		DBG(("%s: handle=%d, tiling=%d [%d], pitch=%d [%d]: %d\n",
++		     __FUNCTION__, bo->handle,
++		     bo->tiling, tiling,
++		     bo->pitch, stride,
++		     set_tiling.tiling_mode == tiling));
++		return set_tiling.tiling_mode == tiling;
++	}
+ 
+ 	err = errno;
+ 	if (err == EINTR)
+@@ -353,6 +500,11 @@ restart:
+ 		goto restart;
+ 	}
+ 
++	if (err == EBUSY && kgem_bo_rmfb(kgem, bo))
++		goto restart;
++
++	ERR(("%s: failed to set-tiling(tiling=%d, pitch=%d) for handle=%d: %d\n",
++	     __FUNCTION__, tiling, stride, bo->handle, err));
+ 	return false;
+ }
+ 
+@@ -437,10 +589,15 @@ static void *__kgem_bo_map__gtt(struct kgem *kgem, struct kgem_bo *bo)
+ 	DBG(("%s(handle=%d, size=%d)\n", __FUNCTION__,
+ 	     bo->handle, bytes(bo)));
+ 
++	if (bo->tiling && !kgem->can_fence)
++		return NULL;
++
+ 	VG_CLEAR(gtt);
+ retry_gtt:
+ 	gtt.handle = bo->handle;
+ 	if ((err = do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &gtt))) {
++		DBG(("%s: failed %d, throttling/cleaning caches\n",
++		     __FUNCTION__, err));
+ 		assert(err != EINVAL);
+ 
+ 		(void)__kgem_throttle_retire(kgem, 0);
+@@ -460,6 +617,8 @@ retry_mmap:
+ 		   kgem->fd, gtt.offset);
+ 	if (ptr == MAP_FAILED) {
+ 		err = errno;
++		DBG(("%s: failed %d, throttling/cleaning caches\n",
++		     __FUNCTION__, err));
+ 		assert(err != EINVAL);
+ 
+ 		if (__kgem_throttle_retire(kgem, 0))
+@@ -498,6 +657,8 @@ retry_wc:
+ 	wc.size = bytes(bo);
+ 	wc.flags = I915_MMAP_WC;
+ 	if ((err = do_ioctl(kgem->fd, LOCAL_IOCTL_I915_GEM_MMAP_v2, &wc))) {
++		DBG(("%s: failed %d, throttling/cleaning caches\n",
++		     __FUNCTION__, err));
+ 		assert(err != EINVAL);
+ 
+ 		if (__kgem_throttle_retire(kgem, 0))
+@@ -519,16 +680,19 @@ retry_wc:
+ 
+ static void *__kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo)
+ {
+-	struct drm_i915_gem_mmap mmap_arg;
++	struct local_i915_gem_mmap arg;
+ 	int err;
+ 
++	VG_CLEAR(arg);
++	arg.offset = 0;
++
+ retry:
+-	VG_CLEAR(mmap_arg);
+-	mmap_arg.handle = bo->handle;
+-	mmap_arg.offset = 0;
+-	mmap_arg.size = bytes(bo);
+-	if ((err = do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg))) {
+-		assert(err != EINVAL);
++	arg.handle = bo->handle;
++	arg.size = bytes(bo);
++	if ((err = do_ioctl(kgem->fd, LOCAL_IOCTL_I915_GEM_MMAP, &arg))) {
++		DBG(("%s: failed %d, throttling/cleaning caches\n",
++		     __FUNCTION__, err));
++		assert(err != -EINVAL || bo->prime);
+ 
+ 		if (__kgem_throttle_retire(kgem, 0))
+ 			goto retry;
+@@ -536,15 +700,16 @@ retry:
+ 		if (kgem_cleanup_cache(kgem))
+ 			goto retry;
+ 
+-		ERR(("%s: failed to mmap handle=%d, %d bytes, into CPU domain: %d\n",
+-		     __FUNCTION__, bo->handle, bytes(bo), -err));
++		ERR(("%s: failed to mmap handle=%d (prime? %d), %d bytes, into CPU domain: %d\n",
++		     __FUNCTION__, bo->handle, bo->prime, bytes(bo), -err));
++		bo->purged = 1;
+ 		return NULL;
+ 	}
+ 
+-	VG(VALGRIND_MAKE_MEM_DEFINED(mmap_arg.addr_ptr, bytes(bo)));
++	VG(VALGRIND_MAKE_MEM_DEFINED(arg.addr_ptr, bytes(bo)));
+ 
+ 	DBG(("%s: caching CPU vma for %d\n", __FUNCTION__, bo->handle));
+-	return bo->map__cpu = (void *)(uintptr_t)mmap_arg.addr_ptr;
++	return bo->map__cpu = (void *)(uintptr_t)arg.addr_ptr;
+ }
+ 
+ static int gem_write(int fd, uint32_t handle,
+@@ -634,16 +799,10 @@ static void kgem_bo_retire(struct kgem *kgem, struct kgem_bo *bo)
+ 	assert(bo->exec == NULL);
+ 	assert(list_is_empty(&bo->vma));
+ 
+-	if (bo->rq) {
+-		__kgem_bo_clear_busy(bo);
+-		kgem_retire(kgem);
+-		assert_bo_retired(bo);
+-	} else {
+-		assert(bo->exec == NULL);
+-		assert(list_is_empty(&bo->request));
+-		assert(!bo->needs_flush);
+-		ASSERT_IDLE(kgem, bo->handle);
+-	}
++	if (bo->rq)
++		__kgem_retire_requests_upto(kgem, bo);
++	ASSERT_IDLE(kgem, bo->handle);
++	assert_bo_retired(bo);
+ }
+ 
+ static void kgem_bo_maybe_retire(struct kgem *kgem, struct kgem_bo *bo)
+@@ -655,10 +814,8 @@ static void kgem_bo_maybe_retire(struct kgem *kgem, struct kgem_bo *bo)
+ 	assert(list_is_empty(&bo->vma));
+ 
+ 	if (bo->rq) {
+-		if (!__kgem_busy(kgem, bo->handle)) {
+-			__kgem_bo_clear_busy(bo);
+-			kgem_retire(kgem);
+-		}
++		if (!__kgem_busy(kgem, bo->handle))
++			__kgem_retire_requests_upto(kgem, bo);
+ 	} else {
+ 		assert(!bo->needs_flush);
+ 		ASSERT_IDLE(kgem, bo->handle);
+@@ -694,6 +851,8 @@ retry:
+ 	}
+ 
+ 	if ((err = gem_write(kgem->fd, bo->handle, 0, length, data))) {
++		DBG(("%s: failed %d, throttling/cleaning caches\n",
++		     __FUNCTION__, err));
+ 		assert(err != EINVAL);
+ 
+ 		(void)__kgem_throttle_retire(kgem, 0);
+@@ -728,27 +887,21 @@ static uint32_t gem_create(int fd, int num_pages)
+ 	return create.handle;
+ }
+ 
+-static bool
++static void
+ kgem_bo_set_purgeable(struct kgem *kgem, struct kgem_bo *bo)
+ {
+-#if DBG_NO_MADV
+-	return true;
+-#else
++#if !DBG_NO_MADV
+ 	struct drm_i915_gem_madvise madv;
+ 
+ 	assert(bo->exec == NULL);
+-	assert(!bo->purged);
+ 
+ 	VG_CLEAR(madv);
+ 	madv.handle = bo->handle;
+ 	madv.madv = I915_MADV_DONTNEED;
+ 	if (do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_MADVISE, &madv) == 0) {
+-		bo->purged = 1;
+-		kgem->need_purge |= !madv.retained && bo->domain == DOMAIN_GPU;
+-		return madv.retained;
++		bo->purged = true;
++		kgem->need_purge |= !madv.retained && bo->domain != DOMAIN_CPU;
+ 	}
+-
+-	return true;
+ #endif
+ }
+ 
+@@ -788,7 +941,7 @@ kgem_bo_clear_purgeable(struct kgem *kgem, struct kgem_bo *bo)
+ 	madv.madv = I915_MADV_WILLNEED;
+ 	if (do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_MADVISE, &madv) == 0) {
+ 		bo->purged = !madv.retained;
+-		kgem->need_purge |= !madv.retained && bo->domain == DOMAIN_GPU;
++		kgem->need_purge |= !madv.retained && bo->domain != DOMAIN_CPU;
+ 		return madv.retained;
+ 	}
+ 
+@@ -869,13 +1022,17 @@ static struct kgem_request *__kgem_request_alloc(struct kgem *kgem)
+ {
+ 	struct kgem_request *rq;
+ 
+-	rq = __kgem_freed_request;
+-	if (rq) {
+-		__kgem_freed_request = *(struct kgem_request **)rq;
++	if (unlikely(kgem->wedged)) {
++		rq = &kgem->static_request;
+ 	} else {
+-		rq = malloc(sizeof(*rq));
+-		if (rq == NULL)
+-			rq = &kgem->static_request;
++		rq = __kgem_freed_request;
++		if (rq) {
++			__kgem_freed_request = *(struct kgem_request **)rq;
++		} else {
++			rq = malloc(sizeof(*rq));
++			if (rq == NULL)
++				rq = &kgem->static_request;
++		}
+ 	}
+ 
+ 	list_init(&rq->buffers);
+@@ -925,11 +1082,11 @@ total_ram_size(void)
+ #ifdef HAVE_STRUCT_SYSINFO_TOTALRAM
+ 	struct sysinfo info;
+ 	if (sysinfo(&info) == 0)
+-		return info.totalram * info.mem_unit;
++		return (size_t)info.totalram * info.mem_unit;
+ #endif
+ 
+ #ifdef _SC_PHYS_PAGES
+-	 return sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGE_SIZE);
++	 return (size_t)sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGE_SIZE);
+ #endif
+ 
+ 	return 0;
+@@ -1150,6 +1307,10 @@ static bool test_has_wc_mmap(struct kgem *kgem)
+ 	if (DBG_NO_WC_MMAP)
+ 		return false;
+ 
++	/* XXX See https://bugs.freedesktop.org/show_bug.cgi?id=90841 */
++	if (kgem->gen < 033)
++		return false;
++
+ 	if (gem_param(kgem, LOCAL_I915_PARAM_MMAP_VERSION) < 1)
+ 		return false;
+ 
+@@ -1187,7 +1348,7 @@ static bool test_has_caching(struct kgem *kgem)
+ 
+ static bool test_has_userptr(struct kgem *kgem)
+ {
+-	uint32_t handle;
++	struct local_i915_gem_userptr arg;
+ 	void *ptr;
+ 
+ 	if (DBG_NO_USERPTR)
+@@ -1200,11 +1361,23 @@ static bool test_has_userptr(struct kgem *kgem)
+ 	if (posix_memalign(&ptr, PAGE_SIZE, PAGE_SIZE))
+ 		return false;
+ 
+-	handle = gem_userptr(kgem->fd, ptr, PAGE_SIZE, false);
+-	gem_close(kgem->fd, handle);
+-	free(ptr);
++	VG_CLEAR(arg);
++	arg.user_ptr = (uintptr_t)ptr;
++	arg.user_size = PAGE_SIZE;
++	arg.flags = I915_USERPTR_UNSYNCHRONIZED;
+ 
+-	return handle != 0;
++	if (DBG_NO_UNSYNCHRONIZED_USERPTR ||
++	    do_ioctl(kgem->fd, LOCAL_IOCTL_I915_GEM_USERPTR, &arg)) {
++		arg.flags &= ~I915_USERPTR_UNSYNCHRONIZED;
++		if (do_ioctl(kgem->fd, LOCAL_IOCTL_I915_GEM_USERPTR, &arg))
++			arg.handle = 0;
++		/* Leak the userptr bo to keep the mmu_notifier alive */
++	} else {
++		gem_close(kgem->fd, arg.handle);
++		free(ptr);
++	}
++
++	return arg.handle != 0;
+ }
+ 
+ static bool test_has_create2(struct kgem *kgem)
+@@ -1227,67 +1400,187 @@ static bool test_has_create2(struct kgem *kgem)
+ #endif
+ }
+ 
+-static bool test_has_secure_batches(struct kgem *kgem)
++static bool test_can_blt_y(struct kgem *kgem)
+ {
+-	if (DBG_NO_SECURE_BATCHES)
++	struct drm_i915_gem_exec_object2 object;
++	uint32_t batch[] = {
++#define MI_LOAD_REGISTER_IMM (0x22<<23 | (3-2))
++#define BCS_SWCTRL 0x22200
++#define BCS_SRC_Y (1 << 0)
++#define BCS_DST_Y (1 << 1)
++		MI_LOAD_REGISTER_IMM,
++		BCS_SWCTRL,
++		(BCS_SRC_Y | BCS_DST_Y) << 16 | (BCS_SRC_Y | BCS_DST_Y),
++
++		MI_LOAD_REGISTER_IMM,
++		BCS_SWCTRL,
++		(BCS_SRC_Y | BCS_DST_Y) << 16,
++
++		MI_BATCH_BUFFER_END,
++		0,
++	};
++	int ret;
++
++	if (DBG_NO_BLT_Y)
+ 		return false;
+ 
+-	return gem_param(kgem, LOCAL_I915_PARAM_HAS_SECURE_BATCHES) > 0;
++	if (kgem->gen < 060)
++		return false;
++
++	memset(&object, 0, sizeof(object));
++	object.handle = gem_create(kgem->fd, 1);
++
++	ret = gem_write(kgem->fd, object.handle, 0, sizeof(batch), batch);
++	if (ret == 0) {
++		struct drm_i915_gem_execbuffer2 execbuf;
++
++		memset(&execbuf, 0, sizeof(execbuf));
++		execbuf.buffers_ptr = (uintptr_t)&object;
++		execbuf.buffer_count = 1;
++		execbuf.flags = KGEM_BLT;
++
++		ret = do_ioctl(kgem->fd,
++			       DRM_IOCTL_I915_GEM_EXECBUFFER2,
++			       &execbuf);
++	}
++	gem_close(kgem->fd, object.handle);
++
++	return ret == 0;
+ }
+ 
+-static bool test_has_pinned_batches(struct kgem *kgem)
++static bool gem_set_tiling(int fd, uint32_t handle, int tiling, int stride)
+ {
+-	if (DBG_NO_PINNED_BATCHES)
++	struct drm_i915_gem_set_tiling set_tiling;
++
++	if (DBG_NO_TILING)
+ 		return false;
+ 
+-	return gem_param(kgem, LOCAL_I915_PARAM_HAS_PINNED_BATCHES) > 0;
++	VG_CLEAR(set_tiling);
++	set_tiling.handle = handle;
++	set_tiling.tiling_mode = tiling;
++	set_tiling.stride = stride;
++
++	if (ioctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling) == 0)
++		return set_tiling.tiling_mode == tiling;
++
++	return false;
+ }
+ 
+-static int kgem_get_screen_index(struct kgem *kgem)
++/* Probe whether the kernel accepts a Y-tiled bo as a framebuffer. */
++static bool test_can_scanout_y(struct kgem *kgem)
+ {
+-	struct sna *sna = container_of(kgem, struct sna, kgem);
+-	return sna->scrn->scrnIndex;
++	struct drm_mode_fb_cmd arg;
++	bool ret = false;
++
++	if (DBG_NO_SCANOUT_Y)
++		return false;
++
++	memset(&arg, 0, sizeof(arg)); /* fb_id stays 0 for RMFB if ADDFB fails */
++	arg.width = 32;
++	arg.height = 32;
++	arg.pitch = 4*32;
++	arg.bpp = 32;
++	arg.depth = 24;
++	arg.handle = gem_create(kgem->fd, 1);
++	if (gem_set_tiling(kgem->fd, arg.handle, I915_TILING_Y, arg.pitch))
++		ret = do_ioctl(kgem->fd, DRM_IOCTL_MODE_ADDFB, &arg) == 0;
++	if (!ret) {
++		struct local_mode_fb_cmd2 {
++			uint32_t fb_id;
++			uint32_t width, height;
++			uint32_t pixel_format;
++			uint32_t flags;
++
++			uint32_t handles[4];
++			uint32_t pitches[4];
++			uint32_t offsets[4];
++			uint64_t modifiers[4];
++		} f;
++#define LOCAL_IOCTL_MODE_ADDFB2 DRM_IOWR(0xb8, struct local_mode_fb_cmd2)
++		memset(&f, 0, sizeof(f));
++		f.width = arg.width;
++		f.height = arg.height;
++		f.handles[0] = arg.handle;
++		f.pitches[0] = arg.pitch;
++		f.modifiers[0] = (uint64_t)1 << 56 | 2; /* MOD_Y_TILED */
++		f.pixel_format = 'X' | 'R' << 8 | '2' << 16 | '4' << 24; /* XRGB8888 */
++		f.flags = 1 << 1; /* + modifier */
++		if (drmIoctl(kgem->fd, LOCAL_IOCTL_MODE_ADDFB2, &f) == 0) {
++			ret = true;
++			arg.fb_id = f.fb_id;
++		}
++	}
++	do_ioctl(kgem->fd, DRM_IOCTL_MODE_RMFB, &arg.fb_id);
++	gem_close(kgem->fd, arg.handle);
++
++	return ret;
+ }
+ 
+-static int __find_debugfs(struct kgem *kgem)
++static bool test_has_dirtyfb(struct kgem *kgem)
+ {
+-	int i;
++	struct drm_mode_fb_cmd create;
++	bool ret = false;
+ 
+-	for (i = 0; i < DRM_MAX_MINOR; i++) {
+-		char path[80];
++	if (DBG_NO_DIRTYFB)
++		return false;
+ 
+-		sprintf(path, "/sys/kernel/debug/dri/%d/i915_wedged", i);
+-		if (access(path, R_OK) == 0)
+-			return i;
++	VG_CLEAR(create);
++	create.width = 32;
++	create.height = 32;
++	create.pitch = 4*32;
++	create.bpp = 32;
++	create.depth = 32;
++	create.handle = gem_create(kgem->fd, 1);
++	if (create.handle == 0)
++		return false;
+ 
+-		sprintf(path, "/debug/dri/%d/i915_wedged", i);
+-		if (access(path, R_OK) == 0)
+-			return i;
++	if (drmIoctl(kgem->fd, DRM_IOCTL_MODE_ADDFB, &create) == 0) {
++		struct drm_mode_fb_dirty_cmd dirty;
++
++		memset(&dirty, 0, sizeof(dirty));
++		dirty.fb_id = create.fb_id;
++		ret = drmIoctl(kgem->fd,
++			       DRM_IOCTL_MODE_DIRTYFB,
++			       &dirty) == 0;
++
++		/* XXX There may be multiple levels of DIRTYFB, depending on
++		 * whether the kernel thinks tracking dirty regions is
++		 * beneficial vs flagging the whole fb as dirty.
++		 */
++
++		drmIoctl(kgem->fd,
++			 DRM_IOCTL_MODE_RMFB,
++			 &create.fb_id);
+ 	}
++	gem_close(kgem->fd, create.handle);
+ 
+-	return -1;
++	return ret;
+ }
+ 
+-static int kgem_get_minor(struct kgem *kgem)
++static bool test_has_secure_batches(struct kgem *kgem)
+ {
+-	struct stat st;
++	if (DBG_NO_SECURE_BATCHES)
++		return false;
+ 
+-	if (fstat(kgem->fd, &st))
+-		return __find_debugfs(kgem);
++	return gem_param(kgem, LOCAL_I915_PARAM_HAS_SECURE_BATCHES) > 0;
++}
+ 
+-	if (!S_ISCHR(st.st_mode))
+-		return __find_debugfs(kgem);
++static bool test_has_pinned_batches(struct kgem *kgem)
++{
++	if (DBG_NO_PINNED_BATCHES)
++		return false;
+ 
+-	return st.st_rdev & 0x63;
++	return gem_param(kgem, LOCAL_I915_PARAM_HAS_PINNED_BATCHES) > 0;
+ }
+ 
+ static bool kgem_init_pinned_batches(struct kgem *kgem)
+ {
+ 	int count[2] = { 16, 4 };
+ 	int size[2] = { 1, 4 };
++	int ret = 0;
+ 	int n, i;
+ 
+-	if (kgem->wedged)
++	if (unlikely(kgem->wedged))
+ 		return true;
+ 
+ 	for (n = 0; n < ARRAY_SIZE(count); n++) {
+@@ -1311,7 +1604,8 @@ static bool kgem_init_pinned_batches(struct kgem *kgem)
+ 			}
+ 
+ 			pin.alignment = 0;
+-			if (do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_PIN, &pin)) {
++			ret = do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_PIN, &pin);
++			if (ret) {
+ 				gem_close(kgem->fd, pin.handle);
+ 				free(bo);
+ 				goto err;
+@@ -1333,6 +1627,16 @@ err:
+ 		}
+ 	}
+ 
++	/* If we fail to pin some memory for 830gm/845g, we need to disable
++	 * acceleration as otherwise the machine will eventually fail. However,
++	 * the kernel started arbitrarily rejecting PIN, so hope for the best
++	 * if the ioctl no longer works.
++	 */
++	if (ret != -ENODEV && kgem->gen == 020)
++		return false;
++
++	kgem->has_pinned_batches = false;
++
+ 	/* For simplicity populate the lists with a single unpinned bo */
+ 	for (n = 0; n < ARRAY_SIZE(count); n++) {
+ 		struct kgem_bo *bo;
+@@ -1340,18 +1644,18 @@ err:
+ 
+ 		handle = gem_create(kgem->fd, size[n]);
+ 		if (handle == 0)
+-			break;
++			return false;
+ 
+ 		bo = __kgem_bo_alloc(handle, size[n]);
+ 		if (bo == NULL) {
+ 			gem_close(kgem->fd, handle);
+-			break;
++			return false;
+ 		}
+ 
+ 		debug_alloc__bo(kgem, bo);
+ 		list_add(&bo->list, &kgem->pinned_batches[n]);
+ 	}
+-	return false;
++	return true;
+ }
+ 
+ static void kgem_init_swizzling(struct kgem *kgem)
+@@ -1364,7 +1668,7 @@ static void kgem_init_swizzling(struct kgem *kgem)
+ 	} tiling;
+ #define LOCAL_IOCTL_I915_GEM_GET_TILING DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_GET_TILING, struct local_i915_gem_get_tiling_v2)
+ 
+-	VG_CLEAR(tiling);
++	memset(&tiling, 0, sizeof(tiling));
+ 	tiling.handle = gem_create(kgem->fd, 1);
+ 	if (!tiling.handle)
+ 		return;
+@@ -1375,12 +1679,23 @@ static void kgem_init_swizzling(struct kgem *kgem)
+ 	if (do_ioctl(kgem->fd, LOCAL_IOCTL_I915_GEM_GET_TILING, &tiling))
+ 		goto out;
+ 
+-	if (kgem->gen < 50 && tiling.phys_swizzle_mode != tiling.swizzle_mode)
++	DBG(("%s: swizzle_mode=%d, phys_swizzle_mode=%d\n",
++	     __FUNCTION__, tiling.swizzle_mode, tiling.phys_swizzle_mode));
++
++	kgem->can_fence =
++		!DBG_NO_TILING &&
++		tiling.swizzle_mode != I915_BIT_6_SWIZZLE_UNKNOWN;
++
++	if (kgem->gen < 050 && tiling.phys_swizzle_mode != tiling.swizzle_mode)
+ 		goto out;
+ 
+-	choose_memcpy_tiled_x(kgem, tiling.swizzle_mode);
++	if (!DBG_NO_DETILING)
++		choose_memcpy_tiled_x(kgem,
++				      tiling.swizzle_mode,
++				      __to_sna(kgem)->cpu_features);
+ out:
+ 	gem_close(kgem->fd, tiling.handle);
++	DBG(("%s: can fence?=%d\n", __FUNCTION__, kgem->can_fence));
+ }
+ 
+ static void kgem_fixup_relocs(struct kgem *kgem, struct kgem_bo *bo, int shrink)
+@@ -1399,6 +1714,7 @@ static void kgem_fixup_relocs(struct kgem *kgem, struct kgem_bo *bo, int shrink)
+ 	     bo->handle, (long long)bo->presumed_offset));
+ 	for (n = 0; n < kgem->nreloc__self; n++) {
+ 		int i = kgem->reloc__self[n];
++		uint64_t addr;
+ 
+ 		assert(kgem->reloc[i].target_handle == ~0U);
+ 		kgem->reloc[i].target_handle = bo->target_handle;
+@@ -1412,13 +1728,17 @@ static void kgem_fixup_relocs(struct kgem *kgem, struct kgem_bo *bo, int shrink)
+ 
+ 			kgem->reloc[i].delta -= shrink;
+ 		}
+-		kgem->batch[kgem->reloc[i].offset/sizeof(uint32_t)] =
+-			kgem->reloc[i].delta + bo->presumed_offset;
++		addr = (int)kgem->reloc[i].delta + bo->presumed_offset;
++		kgem->batch[kgem->reloc[i].offset/sizeof(uint32_t)] = addr;
++		if (kgem->gen >= 0100)
++			kgem->batch[kgem->reloc[i].offset/sizeof(uint32_t) + 1] = addr >> 32;
+ 	}
+ 
+ 	if (n == 256) {
+ 		for (n = kgem->reloc__self[255]; n < kgem->nreloc; n++) {
+ 			if (kgem->reloc[n].target_handle == ~0U) {
++				uint64_t addr;
++
+ 				kgem->reloc[n].target_handle = bo->target_handle;
+ 				kgem->reloc[n].presumed_offset = bo->presumed_offset;
+ 
+@@ -1429,8 +1749,11 @@ static void kgem_fixup_relocs(struct kgem *kgem, struct kgem_bo *bo, int shrink)
+ 					     kgem->reloc[n].delta - shrink));
+ 					kgem->reloc[n].delta -= shrink;
+ 				}
+-				kgem->batch[kgem->reloc[n].offset/sizeof(uint32_t)] =
+-					kgem->reloc[n].delta + bo->presumed_offset;
++
++				addr = (int)kgem->reloc[n].delta + bo->presumed_offset;
++				kgem->batch[kgem->reloc[n].offset/sizeof(uint32_t)] = addr;
++				if (kgem->gen >= 0100)
++					kgem->batch[kgem->reloc[n].offset/sizeof(uint32_t) + 1] = addr >> 32;
+ 			}
+ 		}
+ 	}
+@@ -1444,6 +1767,44 @@ static void kgem_fixup_relocs(struct kgem *kgem, struct kgem_bo *bo, int shrink)
+ 	}
+ }
+ 
++static int kgem_bo_wait(struct kgem *kgem, struct kgem_bo *bo)
++{
++	struct local_i915_gem_wait {
++		uint32_t handle;
++		uint32_t flags;
++		int64_t timeout;
++	} wait;
++#define LOCAL_I915_GEM_WAIT       0x2c
++#define LOCAL_IOCTL_I915_GEM_WAIT         DRM_IOWR(DRM_COMMAND_BASE + LOCAL_I915_GEM_WAIT, struct local_i915_gem_wait)
++	int ret;
++
++	DBG(("%s: waiting for handle=%d\n", __FUNCTION__, bo->handle));
++	if (bo->rq == NULL)
++		return 0;
++
++	VG_CLEAR(wait);
++	wait.handle = bo->handle;
++	wait.flags = 0;
++	wait.timeout = -1;
++	ret = do_ioctl(kgem->fd, LOCAL_IOCTL_I915_GEM_WAIT, &wait);
++	if (ret) {
++		struct drm_i915_gem_set_domain set_domain;
++
++		VG_CLEAR(set_domain);
++		set_domain.handle = bo->handle;
++		set_domain.read_domains = I915_GEM_DOMAIN_GTT;
++		set_domain.write_domain = I915_GEM_DOMAIN_GTT;
++		ret = do_ioctl(kgem->fd,
++			       DRM_IOCTL_I915_GEM_SET_DOMAIN,
++			       &set_domain);
++	}
++
++	if (ret == 0)
++		__kgem_retire_requests_upto(kgem, bo);
++
++	return ret;
++}
++
+ static struct kgem_bo *kgem_new_batch(struct kgem *kgem)
+ {
+ 	struct kgem_bo *last;
+@@ -1464,20 +1825,41 @@ static struct kgem_bo *kgem_new_batch(struct kgem *kgem)
+ 	if (!kgem->has_llc)
+ 		flags |= CREATE_UNCACHED;
+ 
++restart:
+ 	kgem->batch_bo = kgem_create_linear(kgem,
+ 					    sizeof(uint32_t)*kgem->batch_size,
+ 					    flags);
+ 	if (kgem->batch_bo)
+ 		kgem->batch = kgem_bo_map__cpu(kgem, kgem->batch_bo);
+ 	if (kgem->batch == NULL) {
+-		DBG(("%s: unable to map batch bo, mallocing(size=%d)\n",
+-		     __FUNCTION__,
+-		     sizeof(uint32_t)*kgem->batch_size));
++		int ring = kgem->ring == KGEM_BLT;
++		assert(ring < ARRAY_SIZE(kgem->requests));
++
+ 		if (kgem->batch_bo) {
+ 			kgem_bo_destroy(kgem, kgem->batch_bo);
+ 			kgem->batch_bo = NULL;
+ 		}
+ 
++		if (!list_is_empty(&kgem->requests[ring])) {
++			struct kgem_request *rq;
++
++			rq = list_first_entry(&kgem->requests[ring],
++					      struct kgem_request, list);
++			assert(rq->ring == ring);
++			assert(rq->bo);
++			assert(RQ(rq->bo->rq) == rq);
++			if (kgem_bo_wait(kgem, rq->bo) == 0)
++				goto restart;
++		}
++
++		if (flags & CREATE_NO_THROTTLE) {
++			flags &= ~CREATE_NO_THROTTLE;
++			if (kgem_cleanup_cache(kgem))
++				goto restart;
++		}
++
++		DBG(("%s: unable to map batch bo, mallocing(size=%d)\n",
++		     __FUNCTION__, sizeof(uint32_t)*kgem->batch_size));
+ 		if (posix_memalign((void **)&kgem->batch, PAGE_SIZE,
+ 				   ALIGN(sizeof(uint32_t) * kgem->batch_size, PAGE_SIZE))) {
+ 			ERR(("%s: batch allocation failed, disabling acceleration\n", __FUNCTION__));
+@@ -1495,18 +1877,79 @@ static struct kgem_bo *kgem_new_batch(struct kgem *kgem)
+ 	return last;
+ }
+ 
+-void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen)
++static void
++no_retire(struct kgem *kgem)
++{
++	(void)kgem;
++}
++
++static void
++no_expire(struct kgem *kgem)
++{
++	(void)kgem;
++}
++
++static void
++no_context_switch(struct kgem *kgem, int new_mode)
++{
++	(void)kgem;
++	(void)new_mode;
++}
++
++static uint64_t get_gtt_size(int fd)
+ {
+ 	struct drm_i915_gem_get_aperture aperture;
++	struct local_i915_gem_context_param {
++		uint32_t context;
++		uint32_t size;
++		uint64_t param;
++#define LOCAL_CONTEXT_PARAM_BAN_PERIOD	0x1
++#define LOCAL_CONTEXT_PARAM_NO_ZEROMAP	0x2
++#define LOCAL_CONTEXT_PARAM_GTT_SIZE	0x3
++		uint64_t value;
++	} p;
++#define LOCAL_I915_GEM_CONTEXT_GETPARAM       0x34
++#define LOCAL_IOCTL_I915_GEM_CONTEXT_GETPARAM DRM_IOWR (DRM_COMMAND_BASE + LOCAL_I915_GEM_CONTEXT_GETPARAM, struct local_i915_gem_context_param)
++
++	memset(&aperture, 0, sizeof(aperture));
++
++	memset(&p, 0, sizeof(p));
++	p.param = LOCAL_CONTEXT_PARAM_GTT_SIZE;
++	if (drmIoctl(fd, LOCAL_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p) == 0)
++		aperture.aper_size = p.value;
++	if (aperture.aper_size == 0)
++		(void)drmIoctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture);
++	if (aperture.aper_size == 0)
++		aperture.aper_size = 64*1024*1024;
++
++	DBG(("%s: aperture size %lld, available now %lld\n",
++	     __FUNCTION__,
++	     (long long)aperture.aper_size,
++	     (long long)aperture.aper_available_size));
++
++	/* clamp aperture to uint32_t for simplicity */
++	if (aperture.aper_size > 0xc0000000)
++		aperture.aper_size = 0xc0000000;
++
++	return aperture.aper_size;
++}
++
++void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen)
++{
+ 	size_t totalram;
+ 	unsigned half_gpu_max;
+ 	unsigned int i, j;
++	uint64_t gtt_size;
+ 
+ 	DBG(("%s: fd=%d, gen=%d\n", __FUNCTION__, fd, gen));
+ 
+ 	kgem->fd = fd;
+ 	kgem->gen = gen;
+ 
++	kgem->retire = no_retire;
++	kgem->expire = no_expire;
++	kgem->context_switch = no_context_switch;
++
+ 	list_init(&kgem->requests[0]);
+ 	list_init(&kgem->requests[1]);
+ 	list_init(&kgem->batch_buffers);
+@@ -1586,10 +2029,21 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen)
+ 	DBG(("%s: can blt to cpu? %d\n", __FUNCTION__,
+ 	     kgem->can_blt_cpu));
+ 
++	kgem->can_blt_y = test_can_blt_y(kgem);
++	DBG(("%s: can blit to Y-tiled surfaces? %d\n", __FUNCTION__,
++	     kgem->can_blt_y));
++
+ 	kgem->can_render_y = gen != 021 && (gen >> 3) != 4;
+ 	DBG(("%s: can render to Y-tiled surfaces? %d\n", __FUNCTION__,
+ 	     kgem->can_render_y));
+ 
++	kgem->can_scanout_y = test_can_scanout_y(kgem);
++	DBG(("%s: can scanout Y-tiled surfaces? %d\n", __FUNCTION__,
++	     kgem->can_scanout_y));
++
++	kgem->has_dirtyfb = test_has_dirtyfb(kgem);
++	DBG(("%s: has dirty fb? %d\n", __FUNCTION__, kgem->has_dirtyfb));
++
+ 	kgem->has_secure_batches = test_has_secure_batches(kgem);
+ 	DBG(("%s: can use privileged batchbuffers? %d\n", __FUNCTION__,
+ 	     kgem->has_secure_batches));
+@@ -1620,7 +2074,7 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen)
+ 	if (!kgem->has_relaxed_delta && kgem->batch_size > 4*1024)
+ 		kgem->batch_size = 4*1024;
+ 
+-	if (!kgem_init_pinned_batches(kgem) && gen == 020) {
++	if (!kgem_init_pinned_batches(kgem)) {
+ 		xf86DrvMsg(kgem_get_screen_index(kgem), X_WARNING,
+ 			   "Unable to reserve memory for GPU, disabling acceleration.\n");
+ 		__kgem_set_wedged(kgem);
+@@ -1640,35 +2094,24 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen)
+ 	     !DBG_NO_CPU && (kgem->has_llc | kgem->has_userptr | kgem->has_caching),
+ 	     kgem->has_llc, kgem->has_caching, kgem->has_userptr));
+ 
+-	VG_CLEAR(aperture);
+-	aperture.aper_size = 0;
+-	(void)do_ioctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture);
+-	if (aperture.aper_size == 0)
+-		aperture.aper_size = 64*1024*1024;
+-
+-	DBG(("%s: aperture size %lld, available now %lld\n",
+-	     __FUNCTION__,
+-	     (long long)aperture.aper_size,
+-	     (long long)aperture.aper_available_size));
+-
+-	kgem->aperture_total = aperture.aper_size;
+-	kgem->aperture_high = aperture.aper_size * 3/4;
+-	kgem->aperture_low = aperture.aper_size * 1/3;
++	gtt_size = get_gtt_size(fd);
++	kgem->aperture_total = gtt_size;
++	kgem->aperture_high = gtt_size * 3/4;
++	kgem->aperture_low = gtt_size * 1/3;
+ 	if (gen < 033) {
+ 		/* Severe alignment penalties */
+ 		kgem->aperture_high /= 2;
+ 		kgem->aperture_low /= 2;
+ 	}
+-	DBG(("%s: aperture low=%d [%d], high=%d [%d]\n", __FUNCTION__,
++	DBG(("%s: aperture low=%u [%u], high=%u [%u]\n", __FUNCTION__,
+ 	     kgem->aperture_low, kgem->aperture_low / (1024*1024),
+ 	     kgem->aperture_high, kgem->aperture_high / (1024*1024)));
+ 
+ 	kgem->aperture_mappable = 256 * 1024 * 1024;
+ 	if (dev != NULL)
+ 		kgem->aperture_mappable = agp_aperture_size(dev, gen);
+-	if (kgem->aperture_mappable == 0 ||
+-	    kgem->aperture_mappable > aperture.aper_size)
+-		kgem->aperture_mappable = aperture.aper_size;
++	if (kgem->aperture_mappable == 0 || kgem->aperture_mappable > gtt_size)
++		kgem->aperture_mappable = gtt_size;
+ 	DBG(("%s: aperture mappable=%d [%d MiB]\n", __FUNCTION__,
+ 	     kgem->aperture_mappable, kgem->aperture_mappable / (1024*1024)));
+ 
+@@ -1697,7 +2140,7 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen)
+ 		     __FUNCTION__));
+ 		totalram = kgem->aperture_total;
+ 	}
+-	DBG(("%s: total ram=%ld\n", __FUNCTION__, (long)totalram));
++	DBG(("%s: total ram=%lld\n", __FUNCTION__, (long long)totalram));
+ 	if (kgem->max_object_size > totalram / 2)
+ 		kgem->max_object_size = totalram / 2;
+ 	if (kgem->max_gpu_size > totalram / 4)
+@@ -1749,11 +2192,11 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen)
+ 	if (DBG_NO_CPU)
+ 		kgem->max_cpu_size = 0;
+ 
+-	DBG(("%s: maximum object size=%d\n",
++	DBG(("%s: maximum object size=%u\n",
+ 	     __FUNCTION__, kgem->max_object_size));
+-	DBG(("%s: large object thresold=%d\n",
++	DBG(("%s: large object thresold=%u\n",
+ 	     __FUNCTION__, kgem->large_object_size));
+-	DBG(("%s: max object sizes (gpu=%d, cpu=%d, tile upload=%d, copy=%d)\n",
++	DBG(("%s: max object sizes (gpu=%u, cpu=%u, tile upload=%u, copy=%u)\n",
+ 	     __FUNCTION__,
+ 	     kgem->max_gpu_size, kgem->max_cpu_size,
+ 	     kgem->max_upload_tile_size, kgem->max_copy_tile_size));
+@@ -2043,8 +2486,34 @@ static void kgem_add_bo(struct kgem *kgem, struct kgem_bo *bo)
+ 	kgem->flush |= bo->flush;
+ }
+ 
++static void kgem_clear_swctrl(struct kgem *kgem)
++{
++	uint32_t *b;
++
++	if (kgem->bcs_state == 0)
++		return;
++
++	DBG(("%s: clearin SWCTRL LRI from %x\n",
++	     __FUNCTION__, kgem->bcs_state));
++
++	b = kgem->batch + kgem->nbatch;
++	kgem->nbatch += 7;
++
++	*b++ = MI_FLUSH_DW;
++	*b++ = 0;
++	*b++ = 0;
++	*b++ = 0;
++
++	*b++ = MI_LOAD_REGISTER_IMM;
++	*b++ = BCS_SWCTRL;
++	*b++ = (BCS_SRC_Y | BCS_DST_Y) << 16;
++
++	kgem->bcs_state = 0;
++}
++
+ static uint32_t kgem_end_batch(struct kgem *kgem)
+ {
++	kgem_clear_swctrl(kgem);
+ 	kgem->batch[kgem->nbatch++] = MI_BATCH_BUFFER_END;
+ 	if (kgem->nbatch & 1)
+ 		kgem->batch[kgem->nbatch++] = MI_NOOP;
+@@ -2064,17 +2533,6 @@ static void kgem_bo_binding_free(struct kgem *kgem, struct kgem_bo *bo)
+ 	}
+ }
+ 
+-static void kgem_bo_rmfb(struct kgem *kgem, struct kgem_bo *bo)
+-{
+-	if (bo->scanout && bo->delta) {
+-		DBG(("%s: releasing fb=%d for handle=%d\n",
+-		     __FUNCTION__, bo->delta, bo->handle));
+-		/* XXX will leak if we are not DRM_MASTER. *shrug* */
+-		do_ioctl(kgem->fd, DRM_IOCTL_MODE_RMFB, &bo->delta);
+-		bo->delta = 0;
+-	}
+-}
+-
+ static void kgem_bo_free(struct kgem *kgem, struct kgem_bo *bo)
+ {
+ 	DBG(("%s: handle=%d, size=%d\n", __FUNCTION__, bo->handle, bytes(bo)));
+@@ -2150,13 +2608,16 @@ inline static void kgem_bo_move_to_inactive(struct kgem *kgem,
+ 	assert(!bo->snoop);
+ 	assert(!bo->flush);
+ 	assert(!bo->needs_flush);
++	assert(!bo->delta);
+ 	assert(list_is_empty(&bo->vma));
+ 	assert_tiling(kgem, bo);
+-	assert_cacheing(kgem, bo);
++	assert_caching(kgem, bo);
+ 	ASSERT_IDLE(kgem, bo->handle);
+ 
+ 	if (bucket(bo) >= NUM_CACHE_BUCKETS) {
+ 		if (bo->map__gtt) {
++			DBG(("%s: relinquishing large GTT mapping for handle=%d\n",
++			     __FUNCTION__, bo->handle));
+ 			munmap(bo->map__gtt, bytes(bo));
+ 			bo->map__gtt = NULL;
+ 		}
+@@ -2167,6 +2628,8 @@ inline static void kgem_bo_move_to_inactive(struct kgem *kgem,
+ 		assert(list_is_empty(&bo->vma));
+ 		list_move(&bo->list, &kgem->inactive[bucket(bo)]);
+ 		if (bo->map__gtt && !kgem_bo_can_map(kgem, bo)) {
++			DBG(("%s: relinquishing old GTT mapping for handle=%d\n",
++			     __FUNCTION__, bo->handle));
+ 			munmap(bo->map__gtt, bytes(bo));
+ 			bo->map__gtt = NULL;
+ 		}
+@@ -2191,6 +2654,10 @@ static struct kgem_bo *kgem_bo_replace_io(struct kgem_bo *bo)
+ 		return bo;
+ 
+ 	assert(!bo->snoop);
++	assert(!bo->purged);
++	assert(!bo->scanout);
++	assert(!bo->delta);
++
+ 	if (__kgem_freed_bo) {
+ 		base = __kgem_freed_bo;
+ 		__kgem_freed_bo = *(struct kgem_bo **)base;
+@@ -2221,6 +2688,7 @@ inline static void kgem_bo_remove_from_inactive(struct kgem *kgem,
+ 	list_del(&bo->list);
+ 	assert(bo->rq == NULL);
+ 	assert(bo->exec == NULL);
++	assert(!bo->purged);
+ 	if (!list_is_empty(&bo->vma)) {
+ 		assert(bo->map__gtt || bo->map__wc || bo->map__cpu);
+ 		list_del(&bo->vma);
+@@ -2305,7 +2773,6 @@ static void kgem_bo_move_to_scanout(struct kgem *kgem, struct kgem_bo *bo)
+ 		list_move(&bo->list, &kgem->scanout);
+ 
+ 	kgem->need_expire = true;
+-
+ }
+ 
+ static void kgem_bo_move_to_snoop(struct kgem *kgem, struct kgem_bo *bo)
+@@ -2316,6 +2783,8 @@ static void kgem_bo_move_to_snoop(struct kgem *kgem, struct kgem_bo *bo)
+ 	assert(!bo->needs_flush);
+ 	assert(bo->refcnt == 0);
+ 	assert(bo->exec == NULL);
++	assert(!bo->purged);
++	assert(!bo->delta);
+ 
+ 	if (DBG_NO_SNOOP_CACHE) {
+ 		kgem_bo_free(kgem, bo);
+@@ -2351,8 +2820,7 @@ static bool kgem_bo_move_to_cache(struct kgem *kgem, struct kgem_bo *bo)
+ 		kgem_bo_move_to_snoop(kgem, bo);
+ 	} else if (bo->scanout) {
+ 		kgem_bo_move_to_scanout(kgem, bo);
+-	} else if ((bo = kgem_bo_replace_io(bo))->reusable &&
+-		   kgem_bo_set_purgeable(kgem, bo)) {
++	} else if ((bo = kgem_bo_replace_io(bo))->reusable) {
+ 		kgem_bo_move_to_inactive(kgem, bo);
+ 		retired = true;
+ 	} else
+@@ -2429,7 +2897,7 @@ void kgem_bo_undo(struct kgem *kgem, struct kgem_bo *bo)
+ 	DBG(("%s: only handle in batch, discarding last operations for handle=%d\n",
+ 	     __FUNCTION__, bo->handle));
+ 
+-	assert(bo->exec == &kgem->exec[0]);
++	assert(bo->exec == &_kgem_dummy_exec || bo->exec == &kgem->exec[0]);
+ 	assert(kgem->exec[0].handle == bo->handle);
+ 	assert(RQ(bo->rq) == kgem->next_request);
+ 
+@@ -2457,16 +2925,23 @@ void kgem_bo_pair_undo(struct kgem *kgem, struct kgem_bo *a, struct kgem_bo *b)
+ 
+ 	if (a == NULL || b == NULL)
+ 		return;
++	assert(a != b);
+ 	if (a->exec == NULL || b->exec == NULL)
+ 		return;
+ 
+-	DBG(("%s: only handles in batch, discarding last operations for handle=%d and handle=%d\n",
+-	     __FUNCTION__, a->handle, b->handle));
++	DBG(("%s: only handles in batch, discarding last operations for handle=%d (index=%d) and handle=%d (index=%d)\n",
++	     __FUNCTION__,
++	     a->handle, a->proxy ? -1 : a->exec - kgem->exec,
++	     b->handle, b->proxy ? -1 : b->exec - kgem->exec));
+ 
+-	assert(a->exec == &kgem->exec[0] || a->exec == &kgem->exec[1]);
++	assert(a->exec == &_kgem_dummy_exec ||
++	       a->exec == &kgem->exec[0] ||
++	       a->exec == &kgem->exec[1]);
+ 	assert(a->handle == kgem->exec[0].handle || a->handle == kgem->exec[1].handle);
+ 	assert(RQ(a->rq) == kgem->next_request);
+-	assert(b->exec == &kgem->exec[0] || b->exec == &kgem->exec[1]);
++	assert(b->exec == &_kgem_dummy_exec ||
++	       b->exec == &kgem->exec[0] ||
++	       b->exec == &kgem->exec[1]);
+ 	assert(b->handle == kgem->exec[0].handle || b->handle == kgem->exec[1].handle);
+ 	assert(RQ(b->rq) == kgem->next_request);
+ 
+@@ -2487,6 +2962,7 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
+ 	DBG(("%s: handle=%d, size=%d\n", __FUNCTION__, bo->handle, bytes(bo)));
+ 
+ 	assert(list_is_empty(&bo->list));
++	assert(list_is_empty(&bo->vma));
+ 	assert(bo->refcnt == 0);
+ 	assert(bo->proxy == NULL);
+ 	assert(bo->active_scanout == 0);
+@@ -2532,7 +3008,7 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
+ 	assert(bo->snoop == false);
+ 	assert(bo->io == false);
+ 	assert(bo->scanout == false);
+-	assert_cacheing(kgem, bo);
++	assert_caching(kgem, bo);
+ 
+ 	kgem_bo_undo(kgem, bo);
+ 	assert(bo->refcnt == 0);
+@@ -2556,9 +3032,6 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
+ 	assert(list_is_empty(&bo->request));
+ 
+ 	if (bo->map__cpu == NULL || bucket(bo) >= NUM_CACHE_BUCKETS) {
+-		if (!kgem_bo_set_purgeable(kgem, bo))
+-			goto destroy;
+-
+ 		if (!kgem->has_llc && bo->domain == DOMAIN_CPU)
+ 			goto destroy;
+ 
+@@ -2647,7 +3120,7 @@ static bool kgem_retire__flushing(struct kgem *kgem)
+ 		int count = 0;
+ 		list_for_each_entry(bo, &kgem->flushing, request)
+ 			count++;
+-		DBG(("%s: %d bo on flushing list\n", __FUNCTION__, count));
++		DBG(("%s: %d bo on flushing list, retired? %d\n", __FUNCTION__, count, retired));
+ 	}
+ #endif
+ 
+@@ -2656,6 +3129,34 @@ static bool kgem_retire__flushing(struct kgem *kgem)
+ 	return retired;
+ }
+ 
++static bool __kgem_bo_flush(struct kgem *kgem, struct kgem_bo *bo)
++{
++	struct drm_i915_gem_busy busy;
++
++	if (!bo->needs_flush)
++		return false;
++
++	bo->needs_flush = false;
++
++	VG_CLEAR(busy);
++	busy.handle = bo->handle;
++	busy.busy = !kgem->wedged;
++	(void)do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_BUSY, &busy);
++	DBG(("%s: handle=%d, busy=%d, wedged=%d\n",
++	     __FUNCTION__, bo->handle, busy.busy, kgem->wedged));
++
++	if (busy.busy == 0)
++		return false;
++
++	DBG(("%s: moving %d to flushing\n",
++	     __FUNCTION__, bo->handle));
++	list_add(&bo->request, &kgem->flushing);
++	bo->rq = MAKE_REQUEST(kgem, !!(busy.busy & ~0x1ffff));
++	bo->needs_flush = busy.busy & 0xffff;
++	kgem->need_retire = true;
++	return true;
++}
++
+ static bool __kgem_retire_rq(struct kgem *kgem, struct kgem_request *rq)
+ {
+ 	bool retired = false;
+@@ -2663,6 +3164,8 @@ static bool __kgem_retire_rq(struct kgem *kgem, struct kgem_request *rq)
+ 	DBG(("%s: request %d complete\n",
+ 	     __FUNCTION__, rq->bo->handle));
+ 	assert(RQ(rq->bo->rq) == rq);
++	assert(rq != (struct kgem_request *)kgem);
++	assert(rq != &kgem->static_request);
+ 
+ 	if (rq == kgem->fence[rq->ring])
+ 		kgem->fence[rq->ring] = NULL;
+@@ -2680,19 +3183,14 @@ static bool __kgem_retire_rq(struct kgem *kgem, struct kgem_request *rq)
+ 
+ 		list_del(&bo->request);
+ 
+-		if (bo->needs_flush)
+-			bo->needs_flush = __kgem_busy(kgem, bo->handle);
+-		if (bo->needs_flush) {
+-			DBG(("%s: moving %d to flushing\n",
++		if (unlikely(__kgem_bo_flush(kgem, bo))) {
++			assert(bo != rq->bo);
++			DBG(("%s: moved %d to flushing\n",
+ 			     __FUNCTION__, bo->handle));
+-			list_add(&bo->request, &kgem->flushing);
+-			bo->rq = MAKE_REQUEST(kgem, RQ_RING(bo->rq));
+-			kgem->need_retire = true;
+ 			continue;
+ 		}
+ 
+ 		bo->domain = DOMAIN_NONE;
+-		bo->gtt_dirty = false;
+ 		bo->rq = NULL;
+ 		if (bo->refcnt)
+ 			continue;
+@@ -2706,14 +3204,8 @@ static bool __kgem_retire_rq(struct kgem *kgem, struct kgem_request *rq)
+ 	assert(rq->bo->refcnt > 0);
+ 
+ 	if (--rq->bo->refcnt == 0) {
+-		if (kgem_bo_set_purgeable(kgem, rq->bo)) {
+-			kgem_bo_move_to_inactive(kgem, rq->bo);
+-			retired = true;
+-		} else {
+-			DBG(("%s: closing %d\n",
+-			     __FUNCTION__, rq->bo->handle));
+-			kgem_bo_free(kgem, rq->bo);
+-		}
++		kgem_bo_move_to_inactive(kgem, rq->bo);
++		retired = true;
+ 	}
+ 
+ 	__kgem_request_free(rq);
+@@ -2724,13 +3216,18 @@ static bool kgem_retire__requests_ring(struct kgem *kgem, int ring)
+ {
+ 	bool retired = false;
+ 
++	assert(ring < ARRAY_SIZE(kgem->requests));
+ 	while (!list_is_empty(&kgem->requests[ring])) {
+ 		struct kgem_request *rq;
+ 
++		DBG(("%s: retiring ring %d\n", __FUNCTION__, ring));
++
+ 		rq = list_first_entry(&kgem->requests[ring],
+ 				      struct kgem_request,
+ 				      list);
+ 		assert(rq->ring == ring);
++		assert(rq->bo);
++		assert(RQ(rq->bo->rq) == rq);
+ 		if (__kgem_busy(kgem, rq->bo->handle))
+ 			break;
+ 
+@@ -2751,8 +3248,8 @@ static bool kgem_retire__requests_ring(struct kgem *kgem, int ring)
+ 					      struct kgem_request,
+ 					      list)->bo;
+ 
+-		DBG(("%s: ring=%d, %d outstanding requests, oldest=%d\n",
+-		     __FUNCTION__, ring, count, bo ? bo->handle : 0));
++		DBG(("%s: ring=%d, %d outstanding requests, oldest=%d, retired? %d\n",
++		     __FUNCTION__, ring, count, bo ? bo->handle : 0, retired));
+ 	}
+ #endif
+ 
+@@ -2824,6 +3321,8 @@ bool __kgem_ring_is_idle(struct kgem *kgem, int ring)
+ 	rq = list_last_entry(&kgem->requests[ring],
+ 			     struct kgem_request, list);
+ 	assert(rq->ring == ring);
++	assert(rq->bo);
++	assert(RQ(rq->bo->rq) == rq);
+ 	if (__kgem_busy(kgem, rq->bo->handle)) {
+ 		DBG(("%s: last requests handle=%d still busy\n",
+ 		     __FUNCTION__, rq->bo->handle));
+@@ -2845,23 +3344,30 @@ bool __kgem_ring_is_idle(struct kgem *kgem, int ring)
+ 	return true;
+ }
+ 
+-void __kgem_retire_requests_upto(struct kgem *kgem, struct kgem_bo *bo)
++bool __kgem_retire_requests_upto(struct kgem *kgem, struct kgem_bo *bo)
+ {
+-	struct kgem_request *rq = bo->rq, *tmp;
+-	struct list *requests = &kgem->requests[RQ_RING(rq) == I915_EXEC_BLT];
++	struct kgem_request * const rq = RQ(bo->rq), *tmp;
++	struct list *requests = &kgem->requests[rq->ring];
++
++	DBG(("%s(handle=%d, ring=%d)\n", __FUNCTION__, bo->handle, rq->ring));
+ 
+-	rq = RQ(rq);
+ 	assert(rq != &kgem->static_request);
+ 	if (rq == (struct kgem_request *)kgem) {
+ 		__kgem_bo_clear_busy(bo);
+-		return;
++		return false;
+ 	}
+ 
++	assert(rq->ring < ARRAY_SIZE(kgem->requests));
+ 	do {
+ 		tmp = list_first_entry(requests, struct kgem_request, list);
+ 		assert(tmp->ring == rq->ring);
+ 		__kgem_retire_rq(kgem, tmp);
+ 	} while (tmp != rq);
++
++	assert(bo->needs_flush || bo->rq == NULL);
++	assert(bo->needs_flush || list_is_empty(&bo->request));
++	assert(bo->needs_flush || bo->domain == DOMAIN_NONE);
++	return bo->rq;
+ }
+ 
+ #if 0
+@@ -2932,6 +3438,7 @@ static void kgem_commit(struct kgem *kgem)
+ 		bo->binding.offset = 0;
+ 		bo->domain = DOMAIN_GPU;
+ 		bo->gpu_dirty = false;
++		bo->gtt_dirty = false;
+ 
+ 		if (bo->proxy) {
+ 			/* proxies are not used for domain tracking */
+@@ -2955,6 +3462,23 @@ static void kgem_commit(struct kgem *kgem)
+ 			kgem_throttle(kgem);
+ 		}
+ 
++		while (!list_is_empty(&rq->buffers)) {
++			bo = list_first_entry(&rq->buffers,
++					      struct kgem_bo,
++					      request);
++
++			assert(RQ(bo->rq) == rq);
++			assert(bo->exec == NULL);
++			assert(bo->domain == DOMAIN_GPU);
++
++			list_del(&bo->request);
++			bo->domain = DOMAIN_NONE;
++			bo->rq = NULL;
++
++			if (bo->refcnt == 0)
++				_kgem_bo_destroy(kgem, bo);
++		}
++
+ 		kgem_retire(kgem);
+ 		assert(list_is_empty(&rq->buffers));
+ 
+@@ -2964,7 +3488,9 @@ static void kgem_commit(struct kgem *kgem)
+ 		gem_close(kgem->fd, rq->bo->handle);
+ 		kgem_cleanup_cache(kgem);
+ 	} else {
++		assert(rq != (struct kgem_request *)kgem);
+ 		assert(rq->ring < ARRAY_SIZE(kgem->requests));
++		assert(rq->bo);
+ 		list_add_tail(&rq->list, &kgem->requests[rq->ring]);
+ 		kgem->need_throttle = kgem->need_retire = 1;
+ 
+@@ -2988,8 +3514,10 @@ static void kgem_close_inactive(struct kgem *kgem)
+ {
+ 	unsigned int i;
+ 
+-	for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++)
++	for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++) {
+ 		kgem_close_list(kgem, &kgem->inactive[i]);
++		assert(list_is_empty(&kgem->inactive[i]));
++	}
+ }
+ 
+ static void kgem_finish_buffers(struct kgem *kgem)
+@@ -3079,10 +3607,13 @@ static void kgem_finish_buffers(struct kgem *kgem)
+ 						kgem->has_handle_lut ? bo->base.target_handle : shrink->handle;
+ 					for (n = 0; n < kgem->nreloc; n++) {
+ 						if (kgem->reloc[n].target_handle == bo->base.target_handle) {
++							uint64_t addr = (int)kgem->reloc[n].delta + shrink->presumed_offset;
++							kgem->batch[kgem->reloc[n].offset/sizeof(kgem->batch[0])] = addr;
++							if (kgem->gen >= 0100)
++								kgem->batch[kgem->reloc[n].offset/sizeof(kgem->batch[0]) + 1] = addr >> 32;
++
+ 							kgem->reloc[n].target_handle = shrink->target_handle;
+ 							kgem->reloc[n].presumed_offset = shrink->presumed_offset;
+-							kgem->batch[kgem->reloc[n].offset/sizeof(kgem->batch[0])] =
+-								kgem->reloc[n].delta + shrink->presumed_offset;
+ 						}
+ 					}
+ 
+@@ -3124,10 +3655,13 @@ static void kgem_finish_buffers(struct kgem *kgem)
+ 						kgem->has_handle_lut ? bo->base.target_handle : shrink->handle;
+ 					for (n = 0; n < kgem->nreloc; n++) {
+ 						if (kgem->reloc[n].target_handle == bo->base.target_handle) {
++							uint64_t addr = (int)kgem->reloc[n].delta + shrink->presumed_offset;
++							kgem->batch[kgem->reloc[n].offset/sizeof(kgem->batch[0])] = addr;
++							if (kgem->gen >= 0100)
++								kgem->batch[kgem->reloc[n].offset/sizeof(kgem->batch[0]) + 1] = addr >> 32;
++
+ 							kgem->reloc[n].target_handle = shrink->target_handle;
+ 							kgem->reloc[n].presumed_offset = shrink->presumed_offset;
+-							kgem->batch[kgem->reloc[n].offset/sizeof(kgem->batch[0])] =
+-								kgem->reloc[n].delta + shrink->presumed_offset;
+ 						}
+ 					}
+ 
+@@ -3195,6 +3729,9 @@ static void kgem_cleanup(struct kgem *kgem)
+ 					kgem_bo_free(kgem, bo);
+ 			}
+ 
++			if (--rq->bo->refcnt == 0)
++				kgem_bo_free(kgem, rq->bo);
++
+ 			__kgem_request_free(rq);
+ 		}
+ 	}
+@@ -3210,7 +3747,9 @@ kgem_batch_write(struct kgem *kgem,
+ 	char *ptr;
+ 	int ret;
+ 
+-	ASSERT_IDLE(kgem, bo->handle);
++	assert(bo->exec == NULL);
++	assert(bo->rq == NULL);
++	assert(!__kgem_busy(kgem, bo->handle));
+ 
+ #if DBG_NO_EXEC
+ 	{
+@@ -3371,55 +3910,54 @@ static int compact_batch_surface(struct kgem *kgem, int *shrink)
+ 	return size * sizeof(uint32_t);
+ }
+ 
++static struct kgem_bo *first_available(struct kgem *kgem, struct list *list)
++{
++	struct kgem_bo *bo;
++
++	list_for_each_entry(bo, list, list) {
++		assert(bo->refcnt > 0);
++
++		if (bo->rq) {
++			assert(RQ(bo->rq)->bo == bo);
++			if (__kgem_busy(kgem, bo->handle))
++				break;
++
++			__kgem_retire_rq(kgem, RQ(bo->rq));
++			assert(bo->rq == NULL);
++		}
++
++		if (bo->refcnt > 1)
++			continue;
++
++		list_move_tail(&bo->list, list);
++		return kgem_bo_reference(bo);
++	}
++
++	return NULL;
++}
++
+ static struct kgem_bo *
+ kgem_create_batch(struct kgem *kgem)
+ {
+-#if !DBG_NO_SHRINK_BATCHES
+-	struct drm_i915_gem_set_domain set_domain;
+ 	struct kgem_bo *bo;
+-	int shrink = 0;
+-	int size;
++	int size, shrink = 0;
+ 
++#if !DBG_NO_SHRINK_BATCHES
+ 	if (kgem->surface != kgem->batch_size)
+ 		size = compact_batch_surface(kgem, &shrink);
+ 	else
+ 		size = kgem->nbatch * sizeof(uint32_t);
+ 
+ 	if (size <= 4096) {
+-		bo = list_first_entry(&kgem->pinned_batches[0],
+-				      struct kgem_bo,
+-				      list);
+-		if (!bo->rq) {
+-out_4096:
+-			assert(bo->refcnt > 0);
+-			list_move_tail(&bo->list, &kgem->pinned_batches[0]);
+-			bo = kgem_bo_reference(bo);
++		bo = first_available(kgem, &kgem->pinned_batches[0]);
++		if (bo)
+ 			goto write;
+-		}
+-
+-		if (!__kgem_busy(kgem, bo->handle)) {
+-			assert(RQ(bo->rq)->bo == bo);
+-			__kgem_retire_rq(kgem, RQ(bo->rq));
+-			goto out_4096;
+-		}
+ 	}
+ 
+-	if (size <= 16384) {
+-		bo = list_first_entry(&kgem->pinned_batches[1],
+-				      struct kgem_bo,
+-				      list);
+-		if (!bo->rq) {
+-out_16384:
+-			assert(bo->refcnt > 0);
+-			list_move_tail(&bo->list, &kgem->pinned_batches[1]);
+-			bo = kgem_bo_reference(bo);
+-			goto write;
+-		}
+-
+-		if (!__kgem_busy(kgem, bo->handle)) {
+-			__kgem_retire_rq(kgem, RQ(bo->rq));
+-			goto out_16384;
+-		}
++	if (size <= 16384) {
++		bo = first_available(kgem, &kgem->pinned_batches[1]);
++		if (bo)
++			goto write;
+ 	}
+ 
+ 	if (kgem->gen == 020) {
+@@ -3443,16 +3981,8 @@ out_16384:
+ 			list_move_tail(&bo->list, &kgem->pinned_batches[size > 4096]);
+ 
+ 			DBG(("%s: syncing due to busy batches\n", __FUNCTION__));
+-
+-			VG_CLEAR(set_domain);
+-			set_domain.handle = bo->handle;
+-			set_domain.read_domains = I915_GEM_DOMAIN_GTT;
+-			set_domain.write_domain = I915_GEM_DOMAIN_GTT;
+-			if (do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain)) {
+-				DBG(("%s: sync: GPU hang detected\n", __FUNCTION__));
+-				kgem_throttle(kgem);
++			if (kgem_bo_wait(kgem, bo))
+ 				return NULL;
+-			}
+ 
+ 			kgem_retire(kgem);
+ 			assert(bo->rq == NULL);
+@@ -3460,9 +3990,14 @@ out_16384:
+ 			goto write;
+ 		}
+ 	}
++#else
++	if (kgem->surface != kgem->batch_size)
++		size = kgem->batch_size * sizeof(uint32_t);
++	else
++		size = kgem->nbatch * sizeof(uint32_t);
++#endif
+ 
+-	bo = NULL;
+-	if (!kgem->has_llc) {
++	if (!kgem->batch_bo || !kgem->has_llc) {
+ 		bo = kgem_create_linear(kgem, size, CREATE_NO_THROTTLE);
+ 		if (bo) {
+ write:
+@@ -3471,14 +4006,11 @@ write:
+ 				kgem_bo_destroy(kgem, bo);
+ 				return NULL;
+ 			}
++			return bo;
+ 		}
+ 	}
+-	if (bo == NULL)
+-		bo = kgem_new_batch(kgem);
+-	return bo;
+-#else
++
+ 	return kgem_new_batch(kgem);
+-#endif
+ }
+ 
+ #if !NDEBUG
+@@ -3530,7 +4062,7 @@ static void dump_fence_regs(struct kgem *kgem)
+ 
+ static int do_execbuf(struct kgem *kgem, struct drm_i915_gem_execbuffer2 *execbuf)
+ {
+-	int ret, err;
++	int ret;
+ 
+ retry:
+ 	ret = do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, execbuf);
+@@ -3547,26 +4079,25 @@ retry:
+ 
+ 	/* last gasp */
+ 	ret = do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, execbuf);
+-	if (ret == 0)
+-		return 0;
++	if (ret != -ENOSPC)
++		return ret;
++
++	/* One final trick up our sleeve for when we run out of space.
++	 * We turn everything off to free up our pinned framebuffers,
++	 * sprites and cursors, and try just one more time.
++	 */
+ 
+ 	xf86DrvMsg(kgem_get_screen_index(kgem), X_WARNING,
+ 		   "Failed to submit rendering commands, trying again with outputs disabled.\n");
+ 
+-	/* One last trick up our sleeve for when we run out of space.
+-	 * We turn everything off to free up our pinned framebuffers,
+-	 * sprites and cursors, and try one last time.
+-	 */
+-	err = errno;
+-	if (sna_mode_disable(container_of(kgem, struct sna, kgem))) {
++	if (sna_mode_disable(__to_sna(kgem))) {
+ 		kgem_cleanup_cache(kgem);
+ 		ret = do_ioctl(kgem->fd,
+ 			       DRM_IOCTL_I915_GEM_EXECBUFFER2,
+ 			       execbuf);
+ 		DBG(("%s: last_gasp ret=%d\n", __FUNCTION__, ret));
+-		sna_mode_enable(container_of(kgem, struct sna, kgem));
++		sna_mode_enable(__to_sna(kgem));
+ 	}
+-	errno = err;
+ 
+ 	return ret;
+ }
+@@ -3575,6 +4106,7 @@ void _kgem_submit(struct kgem *kgem)
+ {
+ 	struct kgem_request *rq;
+ 	uint32_t batch_end;
++	int i, ret;
+ 
+ 	assert(!DBG_NO_HW);
+ 	assert(!kgem->wedged);
+@@ -3609,7 +4141,6 @@ void _kgem_submit(struct kgem *kgem)
+ 	rq->bo = kgem_create_batch(kgem);
+ 	if (rq->bo) {
+ 		struct drm_i915_gem_execbuffer2 execbuf;
+-		int i, ret;
+ 
+ 		assert(!rq->bo->needs_flush);
+ 
+@@ -3619,7 +4150,8 @@ void _kgem_submit(struct kgem *kgem)
+ 		kgem->exec[i].relocs_ptr = (uintptr_t)kgem->reloc;
+ 		kgem->exec[i].alignment = 0;
+ 		kgem->exec[i].offset = rq->bo->presumed_offset;
+-		kgem->exec[i].flags = 0;
++		/* Make sure the kernel releases any fence, ignored if gen4+ */
++		kgem->exec[i].flags = EXEC_OBJECT_NEEDS_FENCE;
+ 		kgem->exec[i].rsvd1 = 0;
+ 		kgem->exec[i].rsvd2 = 0;
+ 
+@@ -3631,7 +4163,8 @@ void _kgem_submit(struct kgem *kgem)
+ 		memset(&execbuf, 0, sizeof(execbuf));
+ 		execbuf.buffers_ptr = (uintptr_t)kgem->exec;
+ 		execbuf.buffer_count = kgem->nexec;
+-		execbuf.batch_len = batch_end*sizeof(uint32_t);
++		if (kgem->gen < 030)
++			execbuf.batch_len = batch_end*sizeof(uint32_t);
+ 		execbuf.flags = kgem->ring | kgem->batch_flags;
+ 
+ 		if (DBG_DUMP) {
+@@ -3645,91 +4178,98 @@ void _kgem_submit(struct kgem *kgem)
+ 		}
+ 
+ 		ret = do_execbuf(kgem, &execbuf);
+-		if (DEBUG_SYNC && ret == 0) {
+-			struct drm_i915_gem_set_domain set_domain;
+-
+-			VG_CLEAR(set_domain);
+-			set_domain.handle = rq->bo->handle;
+-			set_domain.read_domains = I915_GEM_DOMAIN_GTT;
+-			set_domain.write_domain = I915_GEM_DOMAIN_GTT;
++	} else
++		ret = -ENOMEM;
+ 
+-			ret = do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain);
++	if (ret < 0) {
++		kgem_throttle(kgem);
++		if (!kgem->wedged) {
++			xf86DrvMsg(kgem_get_screen_index(kgem), X_ERROR,
++				   "Failed to submit rendering commands (%s), disabling acceleration.\n",
++				   strerror(-ret));
++			__kgem_set_wedged(kgem);
+ 		}
+-		if (ret < 0) {
+-			kgem_throttle(kgem);
+-			if (!kgem->wedged) {
+-				xf86DrvMsg(kgem_get_screen_index(kgem), X_ERROR,
+-					   "Failed to submit rendering commands, disabling acceleration.\n");
+-				__kgem_set_wedged(kgem);
+-			}
+ 
+ #if !NDEBUG
+-			ErrorF("batch[%d/%d]: %d %d %d, nreloc=%d, nexec=%d, nfence=%d, aperture=%d, fenced=%d, high=%d,%d: errno=%d\n",
+-			       kgem->mode, kgem->ring, batch_end, kgem->nbatch, kgem->surface,
+-			       kgem->nreloc, kgem->nexec, kgem->nfence, kgem->aperture, kgem->aperture_fenced, kgem->aperture_high, kgem->aperture_total, -ret);
++		ErrorF("batch[%d/%d]: %d %d %d, nreloc=%d, nexec=%d, nfence=%d, aperture=%d, fenced=%d, high=%d,%d: errno=%d\n",
++		       kgem->mode, kgem->ring, batch_end, kgem->nbatch, kgem->surface,
++		       kgem->nreloc, kgem->nexec, kgem->nfence, kgem->aperture, kgem->aperture_fenced, kgem->aperture_high, kgem->aperture_total, -ret);
+ 
+-			for (i = 0; i < kgem->nexec; i++) {
+-				struct kgem_bo *bo, *found = NULL;
++		for (i = 0; i < kgem->nexec; i++) {
++			struct kgem_bo *bo, *found = NULL;
+ 
+-				list_for_each_entry(bo, &kgem->next_request->buffers, request) {
+-					if (bo->handle == kgem->exec[i].handle) {
+-						found = bo;
+-						break;
+-					}
++			list_for_each_entry(bo, &kgem->next_request->buffers, request) {
++				if (bo->handle == kgem->exec[i].handle) {
++					found = bo;
++					break;
+ 				}
+-				ErrorF("exec[%d] = handle:%d, presumed offset: %x, size: %d, tiling %d, fenced %d, snooped %d, deleted %d\n",
+-				       i,
+-				       kgem->exec[i].handle,
+-				       (int)kgem->exec[i].offset,
+-				       found ? kgem_bo_size(found) : -1,
+-				       found ? found->tiling : -1,
+-				       (int)(kgem->exec[i].flags & EXEC_OBJECT_NEEDS_FENCE),
+-				       found ? found->snoop : -1,
+-				       found ? found->purged : -1);
+ 			}
+-			for (i = 0; i < kgem->nreloc; i++) {
+-				ErrorF("reloc[%d] = pos:%d, target:%d, delta:%d, read:%x, write:%x, offset:%x\n",
+-				       i,
+-				       (int)kgem->reloc[i].offset,
+-				       kgem->reloc[i].target_handle,
+-				       kgem->reloc[i].delta,
+-				       kgem->reloc[i].read_domains,
+-				       kgem->reloc[i].write_domain,
+-				       (int)kgem->reloc[i].presumed_offset);
++			ErrorF("exec[%d] = handle:%d, presumed offset: %x, size: %d, tiling %d, fenced %d, snooped %d, deleted %d\n",
++			       i,
++			       kgem->exec[i].handle,
++			       (int)kgem->exec[i].offset,
++			       found ? kgem_bo_size(found) : -1,
++			       found ? found->tiling : -1,
++			       (int)(kgem->exec[i].flags & EXEC_OBJECT_NEEDS_FENCE),
++			       found ? found->snoop : -1,
++			       found ? found->purged : -1);
++		}
++		for (i = 0; i < kgem->nreloc; i++) {
++			ErrorF("reloc[%d] = pos:%d, target:%d, delta:%d, read:%x, write:%x, offset:%x\n",
++			       i,
++			       (int)kgem->reloc[i].offset,
++			       kgem->reloc[i].target_handle,
++			       kgem->reloc[i].delta,
++			       kgem->reloc[i].read_domains,
++			       kgem->reloc[i].write_domain,
++			       (int)kgem->reloc[i].presumed_offset);
++		}
++
++		{
++			struct drm_i915_gem_get_aperture aperture;
++			if (do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture) == 0)
++				ErrorF("Aperture size %lld, available %lld\n",
++				       (long long)aperture.aper_size,
++				       (long long)aperture.aper_available_size);
++		}
++
++		if (ret == -ENOSPC)
++			dump_gtt_info(kgem);
++		if (ret == -EDEADLK)
++			dump_fence_regs(kgem);
++
++		if (DEBUG_SYNC) {
++			int fd = open("/tmp/batchbuffer", O_WRONLY | O_CREAT | O_APPEND, 0666);
++			if (fd != -1) {
++				int ignored = write(fd, kgem->batch, batch_end*sizeof(uint32_t));
++				assert(ignored == batch_end*sizeof(uint32_t));
++				close(fd);
+ 			}
+ 
+-			{
+-				struct drm_i915_gem_get_aperture aperture;
+-				if (do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture) == 0)
+-					ErrorF("Aperture size %lld, available %lld\n",
+-					       (long long)aperture.aper_size,
+-					       (long long)aperture.aper_available_size);
+-			}
++			FatalError("SNA: failed to submit batchbuffer, errno=%d\n", -ret);
++		}
++#endif
++	} else {
++		if (DEBUG_SYNC) {
++			struct drm_i915_gem_set_domain set_domain;
+ 
+-			if (ret == -ENOSPC)
+-				dump_gtt_info(kgem);
+-			if (ret == -EDEADLK)
+-				dump_fence_regs(kgem);
+-
+-			if (DEBUG_SYNC) {
+-				int fd = open("/tmp/batchbuffer", O_WRONLY | O_CREAT | O_APPEND, 0666);
+-				if (fd != -1) {
+-					int ignored = write(fd, kgem->batch, batch_end*sizeof(uint32_t));
+-					assert(ignored == batch_end*sizeof(uint32_t));
+-					close(fd);
+-				}
++			VG_CLEAR(set_domain);
++			set_domain.handle = rq->bo->handle;
++			set_domain.read_domains = I915_GEM_DOMAIN_GTT;
++			set_domain.write_domain = I915_GEM_DOMAIN_GTT;
+ 
+-				FatalError("SNA: failed to submit batchbuffer, errno=%d\n", -ret);
+-			}
+-#endif
++			ret = do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain);
+ 		}
+-	}
++
+ #if SHOW_BATCH_AFTER
+-	if (gem_read(kgem->fd, rq->bo->handle, kgem->batch, 0, batch_end*sizeof(uint32_t)) == 0)
+-		__kgem_batch_debug(kgem, batch_end);
++		if (gem_read(kgem->fd, rq->bo->handle, kgem->batch, 0, batch_end*sizeof(uint32_t)) == 0)
++			__kgem_batch_debug(kgem, batch_end);
+ #endif
+-	kgem_commit(kgem);
+-	if (kgem->wedged)
++
++		kgem_commit(kgem);
++	}
++
++	if (unlikely(kgem->wedged))
+ 		kgem_cleanup(kgem);
+ 
+ 	kgem_reset(kgem);
+@@ -3737,49 +4277,14 @@ void _kgem_submit(struct kgem *kgem)
+ 	assert(kgem->next_request != NULL);
+ }
+ 
+-static bool find_hang_state(struct kgem *kgem, char *path, int maxlen)
+-{
+-	int minor = kgem_get_minor(kgem);
+-
+-	/* Search for our hang state in a few canonical locations.
+-	 * In the unlikely event of having multiple devices, we
+-	 * will need to check which minor actually corresponds to ours.
+-	 */
+-
+-	snprintf(path, maxlen, "/sys/class/drm/card%d/error", minor);
+-	if (access(path, R_OK) == 0)
+-		return true;
+-
+-	snprintf(path, maxlen, "/sys/kernel/debug/dri/%d/i915_error_state", minor);
+-	if (access(path, R_OK) == 0)
+-		return true;
+-
+-	snprintf(path, maxlen, "/debug/dri/%d/i915_error_state", minor);
+-	if (access(path, R_OK) == 0)
+-		return true;
+-
+-	path[0] = '\0';
+-	return false;
+-}
+-
+ void kgem_throttle(struct kgem *kgem)
+ {
+-	if (kgem->wedged)
++	if (unlikely(kgem->wedged))
+ 		return;
+ 
+ 	if (__kgem_throttle(kgem, true)) {
+-		static int once;
+-		char path[128];
+-
+ 		xf86DrvMsg(kgem_get_screen_index(kgem), X_ERROR,
+ 			   "Detected a hung GPU, disabling acceleration.\n");
+-		if (!once && find_hang_state(kgem, path, sizeof(path))) {
+-			xf86DrvMsg(kgem_get_screen_index(kgem), X_ERROR,
+-				   "When reporting this, please include %s and the full dmesg.\n",
+-				   path);
+-			once = 1;
+-		}
+-
+ 		__kgem_set_wedged(kgem);
+ 		kgem->need_throttle = false;
+ 	}
+@@ -3860,7 +4365,8 @@ bool kgem_expire_cache(struct kgem *kgem)
+ 	bool idle;
+ 	unsigned int i;
+ 
+-	time(&now);
++	if (!time(&now))
++		return false;
+ 
+ 	while (__kgem_freed_bo) {
+ 		bo = __kgem_freed_bo;
+@@ -3875,7 +4381,7 @@ bool kgem_expire_cache(struct kgem *kgem)
+ 	}
+ 
+ 	kgem_clean_large_cache(kgem);
+-	if (container_of(kgem, struct sna, kgem)->scrn->vtSema)
++	if (__to_sna(kgem)->scrn->vtSema)
+ 		kgem_clean_scanout_cache(kgem);
+ 
+ 	expire = 0;
+@@ -3885,6 +4391,7 @@ bool kgem_expire_cache(struct kgem *kgem)
+ 			break;
+ 		}
+ 
++		assert(now);
+ 		bo->delta = now;
+ 	}
+ 	if (expire) {
+@@ -3909,7 +4416,7 @@ bool kgem_expire_cache(struct kgem *kgem)
+ #endif
+ 
+ 	kgem_retire(kgem);
+-	if (kgem->wedged)
++	if (unlikely(kgem->wedged))
+ 		kgem_cleanup(kgem);
+ 
+ 	kgem->expire(kgem);
+@@ -3930,6 +4437,8 @@ bool kgem_expire_cache(struct kgem *kgem)
+ 				break;
+ 			}
+ 
++			assert(now);
++			kgem_bo_set_purgeable(kgem, bo);
+ 			bo->delta = now;
+ 		}
+ 	}
+@@ -3960,16 +4469,11 @@ bool kgem_expire_cache(struct kgem *kgem)
+ 				count++;
+ 				size += bytes(bo);
+ 				kgem_bo_free(kgem, bo);
+-				DBG(("%s: expiring %d\n",
++				DBG(("%s: expiring handle=%d\n",
+ 				     __FUNCTION__, bo->handle));
+ 			}
+ 		}
+-		if (!list_is_empty(&preserve)) {
+-			preserve.prev->next = kgem->inactive[i].next;
+-			kgem->inactive[i].next->prev = preserve.prev;
+-			kgem->inactive[i].next = preserve.next;
+-			preserve.next->prev = &kgem->inactive[i];
+-		}
++		list_splice_tail(&preserve, &kgem->inactive[i]);
+ 	}
+ 
+ #ifdef DEBUG_MEMORY
+@@ -3998,31 +4502,30 @@ bool kgem_cleanup_cache(struct kgem *kgem)
+ 	unsigned int i;
+ 	int n;
+ 
++	DBG(("%s\n", __FUNCTION__));
++
+ 	/* sync to the most recent request */
+ 	for (n = 0; n < ARRAY_SIZE(kgem->requests); n++) {
+ 		if (!list_is_empty(&kgem->requests[n])) {
+ 			struct kgem_request *rq;
+-			struct drm_i915_gem_set_domain set_domain;
+ 
+-			rq = list_first_entry(&kgem->requests[n],
+-					      struct kgem_request,
+-					      list);
++			rq = list_last_entry(&kgem->requests[n],
++					     struct kgem_request,
++					     list);
+ 
+ 			DBG(("%s: sync on cleanup\n", __FUNCTION__));
+-
+-			VG_CLEAR(set_domain);
+-			set_domain.handle = rq->bo->handle;
+-			set_domain.read_domains = I915_GEM_DOMAIN_GTT;
+-			set_domain.write_domain = I915_GEM_DOMAIN_GTT;
+-			(void)do_ioctl(kgem->fd,
+-				       DRM_IOCTL_I915_GEM_SET_DOMAIN,
+-				       &set_domain);
++			assert(rq->ring == n);
++			assert(rq->bo);
++			assert(RQ(rq->bo->rq) == rq);
++			kgem_bo_wait(kgem, rq->bo);
+ 		}
++		assert(list_is_empty(&kgem->requests[n]));
+ 	}
+ 
+ 	kgem_retire(kgem);
+ 	kgem_cleanup(kgem);
+ 
++	DBG(("%s: need_expire?=%d\n", __FUNCTION__, kgem->need_expire));
+ 	if (!kgem->need_expire)
+ 		return false;
+ 
+@@ -4049,6 +4552,8 @@ bool kgem_cleanup_cache(struct kgem *kgem)
+ 
+ 	kgem->need_purge = false;
+ 	kgem->need_expire = false;
++
++	DBG(("%s: complete\n", __FUNCTION__));
+ 	return true;
+ }
+ 
+@@ -4079,16 +4584,15 @@ retry_large:
+ 				goto discard;
+ 
+ 			if (bo->tiling != I915_TILING_NONE) {
+-				if (use_active)
++				if (use_active && kgem->gen < 040)
+ 					goto discard;
+ 
+-				if (!gem_set_tiling(kgem->fd, bo->handle,
++				if (!kgem_set_tiling(kgem, bo,
+ 						    I915_TILING_NONE, 0))
+ 					goto discard;
+-
+-				bo->tiling = I915_TILING_NONE;
+-				bo->pitch = 0;
+ 			}
++			assert(bo->tiling == I915_TILING_NONE);
++			bo->pitch = 0;
+ 
+ 			if (bo->purged && !kgem_bo_clear_purgeable(kgem, bo))
+ 				goto discard;
+@@ -4169,17 +4673,17 @@ discard:
+ 				break;
+ 			}
+ 
+-			if (I915_TILING_NONE != bo->tiling &&
+-			    !gem_set_tiling(kgem->fd, bo->handle,
+-					    I915_TILING_NONE, 0))
+-				continue;
++			if (!kgem_set_tiling(kgem, bo, I915_TILING_NONE, 0)) {
++				kgem_bo_free(kgem, bo);
++				break;
++			}
+ 
+ 			kgem_bo_remove_from_inactive(kgem, bo);
+ 			assert(list_is_empty(&bo->vma));
+ 			assert(list_is_empty(&bo->list));
+ 
+-			bo->tiling = I915_TILING_NONE;
+-			bo->pitch = 0;
++			assert(bo->tiling == I915_TILING_NONE);
++			assert(bo->pitch == 0);
+ 			bo->delta = 0;
+ 			DBG(("  %s: found handle=%d (num_pages=%d) in linear vma cache\n",
+ 			     __FUNCTION__, bo->handle, num_pages(bo)));
+@@ -4225,13 +4729,13 @@ discard:
+ 			if (first)
+ 				continue;
+ 
+-			if (!gem_set_tiling(kgem->fd, bo->handle,
+-					    I915_TILING_NONE, 0))
+-				continue;
+-
+-			bo->tiling = I915_TILING_NONE;
+-			bo->pitch = 0;
++			if (!kgem_set_tiling(kgem, bo, I915_TILING_NONE, 0)) {
++				kgem_bo_free(kgem, bo);
++				break;
++			}
+ 		}
++		assert(bo->tiling == I915_TILING_NONE);
++		bo->pitch = 0;
+ 
+ 		if (bo->map__gtt || bo->map__wc || bo->map__cpu) {
+ 			if (flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) {
+@@ -4269,7 +4773,7 @@ discard:
+ 			kgem_bo_remove_from_inactive(kgem, bo);
+ 
+ 		assert(bo->tiling == I915_TILING_NONE);
+-		bo->pitch = 0;
++		assert(bo->pitch == 0);
+ 		bo->delta = 0;
+ 		DBG(("  %s: found handle=%d (num_pages=%d) in linear %s cache\n",
+ 		     __FUNCTION__, bo->handle, num_pages(bo),
+@@ -4340,9 +4844,9 @@ struct kgem_bo *kgem_create_for_name(struct kgem *kgem, uint32_t name)
+ 
+ 	bo->unique_id = kgem_get_unique_id(kgem);
+ 	bo->tiling = tiling.tiling_mode;
+-	bo->reusable = false;
+ 	bo->prime = true;
+-	bo->purged = true; /* no coherency guarantees */
++	bo->reusable = false;
++	kgem_bo_unclean(kgem, bo);
+ 
+ 	debug_alloc__bo(kgem, bo);
+ 	return bo;
+@@ -4448,6 +4952,8 @@ int kgem_bo_export_to_prime(struct kgem *kgem, struct kgem_bo *bo)
+ #if defined(DRM_IOCTL_PRIME_HANDLE_TO_FD) && defined(O_CLOEXEC)
+ 	struct drm_prime_handle args;
+ 
++	assert(kgem_bo_is_fenced(kgem, bo));
++
+ 	VG_CLEAR(args);
+ 	args.handle = bo->handle;
+ 	args.flags = O_CLOEXEC;
+@@ -4479,6 +4985,8 @@ struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size, unsigned flags)
+ 	if ((flags & CREATE_UNCACHED) == 0) {
+ 		bo = search_linear_cache(kgem, size, CREATE_INACTIVE | flags);
+ 		if (bo) {
++			assert(!bo->purged);
++			assert(!bo->delta);
+ 			assert(bo->domain != DOMAIN_GPU);
+ 			ASSERT_IDLE(kgem, bo->handle);
+ 			bo->refcnt = 1;
+@@ -4760,8 +5268,7 @@ static void __kgem_bo_make_scanout(struct kgem *kgem,
+ 				   struct kgem_bo *bo,
+ 				   int width, int height)
+ {
+-	ScrnInfoPtr scrn =
+-		container_of(kgem, struct sna, kgem)->scrn;
++	ScrnInfoPtr scrn = __to_sna(kgem)->scrn;
+ 	struct drm_mode_fb_cmd arg;
+ 
+ 	assert(bo->proxy == NULL);
+@@ -4809,6 +5316,48 @@ static void __kgem_bo_make_scanout(struct kgem *kgem,
+ 	}
+ }
+ 
++static bool tiling_changed(struct kgem_bo *bo, int tiling, int pitch)
++{ /* Reports whether (tiling, pitch) differs from the values recorded on bo. */
++	if (tiling != bo->tiling)
++		return true;
++	/* Same tiling mode: a pitch difference only counts for tiled layouts. */
++	return tiling != I915_TILING_NONE && pitch != bo->pitch;
++}
++
++static void set_gpu_tiling(struct kgem *kgem,
++			   struct kgem_bo *bo,
++			   int tiling, int pitch)
++{ /* Records (tiling, pitch) on bo; this function itself issues no ioctl. */
++	DBG(("%s: handle=%d, tiling=%d, pitch=%d\n",
++	     __FUNCTION__, bo->handle, tiling, pitch));
++	/* If the layout changes, unmap bo->map__gtt and unlink bo from its vma list. */
++	if (tiling_changed(bo, tiling, pitch) && bo->map__gtt) {
++		if (!list_is_empty(&bo->vma)) {
++			list_del(&bo->vma);
++			kgem->vma[0].count--; /* NOTE(review): index 0 presumably MAP_GTT - confirm */
++		}
++		munmap(bo->map__gtt, bytes(bo));
++		bo->map__gtt = NULL;
++	}
++
++	bo->tiling = tiling;
++	bo->pitch = pitch;
++}
++
++bool kgem_bo_is_fenced(struct kgem *kgem, struct kgem_bo *bo)
++{ /* Compares the tiling mode reported by GET_TILING with bo->tiling. */
++	struct drm_i915_gem_get_tiling tiling;
++
++	assert(kgem);
++	assert(bo);
++	/* Seed tiling_mode with our own value; the ioctl's return code is discarded. */
++	VG_CLEAR(tiling);
++	tiling.handle = bo->handle;
++	tiling.tiling_mode = bo->tiling;
++	(void)do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_GET_TILING, &tiling);
++	return tiling.tiling_mode == bo->tiling; /* assume pitch is fine! */
++}
++
+ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
+ 			       int width,
+ 			       int height,
+@@ -4892,8 +5441,8 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
+ 			return last;
+ 		}
+ 
+-		if (container_of(kgem, struct sna, kgem)->scrn->vtSema) {
+-			ScrnInfoPtr scrn = container_of(kgem, struct sna, kgem)->scrn;
++		if (__to_sna(kgem)->scrn->vtSema) {
++			ScrnInfoPtr scrn = __to_sna(kgem)->scrn;
+ 
+ 			list_for_each_entry_reverse(bo, &kgem->scanout, list) {
+ 				struct drm_mode_fb_cmd arg;
+@@ -4915,11 +5464,8 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
+ 						bo->delta = 0;
+ 					}
+ 
+-					if (gem_set_tiling(kgem->fd, bo->handle,
+-							   tiling, pitch)) {
+-						bo->tiling = tiling;
+-						bo->pitch = pitch;
+-					} else {
++					if (!kgem_set_tiling(kgem, bo,
++							     tiling, pitch)) {
+ 						kgem_bo_free(kgem, bo);
+ 						break;
+ 					}
+@@ -4950,6 +5496,9 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
+ 			}
+ 		}
+ 
++		if (flags & CREATE_CACHED)
++			return NULL;
++
+ 		bo = __kgem_bo_create_as_display(kgem, size, tiling, pitch);
+ 		if (bo)
+ 			return bo;
+@@ -4987,14 +5536,9 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
+ 				if (num_pages(bo) < size)
+ 					continue;
+ 
+-				if (bo->pitch != pitch || bo->tiling != tiling) {
+-					if (!gem_set_tiling(kgem->fd, bo->handle,
+-							    tiling, pitch))
+-						continue;
+-
+-					bo->pitch = pitch;
+-					bo->tiling = tiling;
+-				}
++				if (!kgem_set_tiling(kgem, bo, tiling, pitch) &&
++				    !exact)
++					set_gpu_tiling(kgem, bo, tiling, pitch);
+ 			}
+ 
+ 			kgem_bo_remove_from_active(kgem, bo);
+@@ -5020,14 +5564,11 @@ large_inactive:
+ 			if (size > num_pages(bo))
+ 				continue;
+ 
+-			if (bo->tiling != tiling ||
+-			    (tiling != I915_TILING_NONE && bo->pitch != pitch)) {
+-				if (!gem_set_tiling(kgem->fd, bo->handle,
+-						    tiling, pitch))
++			if (!kgem_set_tiling(kgem, bo, tiling, pitch)) {
++				if (kgem->gen >= 040 && !exact)
++					set_gpu_tiling(kgem, bo, tiling, pitch);
++				else
+ 					continue;
+-
+-				bo->tiling = tiling;
+-				bo->pitch = pitch;
+ 			}
+ 
+ 			if (bo->purged && !kgem_bo_clear_purgeable(kgem, bo)) {
+@@ -5039,7 +5580,6 @@ large_inactive:
+ 
+ 			assert(bo->domain != DOMAIN_GPU);
+ 			bo->unique_id = kgem_get_unique_id(kgem);
+-			bo->pitch = pitch;
+ 			bo->delta = 0;
+ 			DBG(("  1:from large inactive: pitch=%d, tiling=%d, handle=%d, id=%d\n",
+ 			     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
+@@ -5088,14 +5628,13 @@ large_inactive:
+ 				if (bo->tiling != tiling ||
+ 				    (tiling != I915_TILING_NONE && bo->pitch != pitch)) {
+ 					if (bo->map__gtt ||
+-					    !gem_set_tiling(kgem->fd, bo->handle,
+-							    tiling, pitch)) {
++					    !kgem_set_tiling(kgem, bo,
++							     tiling, pitch)) {
+ 						DBG(("inactive GTT vma with wrong tiling: %d < %d\n",
+ 						     bo->tiling, tiling));
+-						continue;
++						kgem_bo_free(kgem, bo);
++						break;
+ 					}
+-					bo->tiling = tiling;
+-					bo->pitch = pitch;
+ 				}
+ 
+ 				if (bo->purged && !kgem_bo_clear_purgeable(kgem, bo)) {
+@@ -5103,8 +5642,11 @@ large_inactive:
+ 					break;
+ 				}
+ 
++				if (tiling == I915_TILING_NONE)
++					bo->pitch = pitch;
++
+ 				assert(bo->tiling == tiling);
+-				bo->pitch = pitch;
++				assert(bo->pitch >= pitch);
+ 				bo->delta = 0;
+ 				bo->unique_id = kgem_get_unique_id(kgem);
+ 
+@@ -5170,15 +5712,12 @@ search_active:
+ 				if (num_pages(bo) < size)
+ 					continue;
+ 
+-				if (bo->pitch != pitch) {
+-					if (!gem_set_tiling(kgem->fd,
+-							    bo->handle,
+-							    tiling, pitch))
+-						continue;
+-
+-					bo->pitch = pitch;
+-				}
++				if (!kgem_set_tiling(kgem, bo, tiling, pitch) &&
++				    !exact)
++					set_gpu_tiling(kgem, bo, tiling, pitch);
+ 			}
++			assert(bo->tiling == tiling);
++			assert(bo->pitch >= pitch);
+ 
+ 			kgem_bo_remove_from_active(kgem, bo);
+ 
+@@ -5233,19 +5772,21 @@ search_active:
+ 				if (num_pages(bo) < size)
+ 					continue;
+ 
+-				if (bo->tiling != tiling ||
+-				    (tiling != I915_TILING_NONE && bo->pitch != pitch)) {
+-					if (!gem_set_tiling(kgem->fd,
+-							    bo->handle,
+-							    tiling, pitch))
+-						continue;
++				if (!kgem_set_tiling(kgem, bo, tiling, pitch)) {
++					if (kgem->gen >= 040 && !exact) {
++						set_gpu_tiling(kgem, bo,
++							       tiling, pitch);
++					} else {
++						kgem_bo_free(kgem, bo);
++						break;
++					}
+ 				}
++				assert(bo->tiling == tiling);
++				assert(bo->pitch >= pitch);
+ 
+ 				kgem_bo_remove_from_active(kgem, bo);
+ 
+ 				bo->unique_id = kgem_get_unique_id(kgem);
+-				bo->pitch = pitch;
+-				bo->tiling = tiling;
+ 				bo->delta = 0;
+ 				DBG(("  1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n",
+ 				     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
+@@ -5323,11 +5864,13 @@ search_inactive:
+ 			continue;
+ 		}
+ 
+-		if (bo->tiling != tiling ||
+-		    (tiling != I915_TILING_NONE && bo->pitch != pitch)) {
+-			if (!gem_set_tiling(kgem->fd, bo->handle,
+-					    tiling, pitch))
+-				continue;
++		if (!kgem_set_tiling(kgem, bo, tiling, pitch)) {
++			if (kgem->gen >= 040 && !exact) {
++				set_gpu_tiling(kgem, bo, tiling, pitch);
++			} else {
++				kgem_bo_free(kgem, bo);
++				break;
++			}
+ 		}
+ 
+ 		if (bo->purged && !kgem_bo_clear_purgeable(kgem, bo)) {
+@@ -5338,9 +5881,8 @@ search_inactive:
+ 		kgem_bo_remove_from_inactive(kgem, bo);
+ 		assert(list_is_empty(&bo->list));
+ 		assert(list_is_empty(&bo->vma));
+-
+-		bo->pitch = pitch;
+-		bo->tiling = tiling;
++		assert(bo->tiling == tiling);
++		assert(bo->pitch >= pitch);
+ 
+ 		bo->delta = 0;
+ 		bo->unique_id = kgem_get_unique_id(kgem);
+@@ -5388,14 +5930,17 @@ search_inactive:
+ 			kgem_bo_remove_from_active(kgem, bo);
+ 			__kgem_bo_clear_busy(bo);
+ 
+-			if (tiling != I915_TILING_NONE && bo->pitch != pitch) {
+-				if (!gem_set_tiling(kgem->fd, bo->handle, tiling, pitch)) {
++			if (!kgem_set_tiling(kgem, bo, tiling, pitch)) {
++				if (kgem->gen >= 040 && !exact) {
++					set_gpu_tiling(kgem, bo, tiling, pitch);
++				} else {
+ 					kgem_bo_free(kgem, bo);
+ 					goto no_retire;
+ 				}
+ 			}
++			assert(bo->tiling == tiling);
++			assert(bo->pitch >= pitch);
+ 
+-			bo->pitch = pitch;
+ 			bo->unique_id = kgem_get_unique_id(kgem);
+ 			bo->delta = 0;
+ 			DBG(("  2:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n",
+@@ -5440,18 +5985,21 @@ create:
+ 	}
+ 
+ 	bo->unique_id = kgem_get_unique_id(kgem);
+-	if (tiling == I915_TILING_NONE ||
+-	    gem_set_tiling(kgem->fd, handle, tiling, pitch)) {
+-		bo->tiling = tiling;
+-		bo->pitch = pitch;
++	if (kgem_set_tiling(kgem, bo, tiling, pitch)) {
+ 		if (flags & CREATE_SCANOUT)
+ 			__kgem_bo_make_scanout(kgem, bo, width, height);
+ 	} else {
+-		if (flags & CREATE_EXACT) {
+-			DBG(("%s: failed to set exact tiling (gem_set_tiling)\n", __FUNCTION__));
+-			gem_close(kgem->fd, handle);
+-			free(bo);
+-			return NULL;
++		if (kgem->gen >= 040) {
++			assert(!kgem->can_fence);
++			bo->tiling = tiling;
++			bo->pitch = pitch;
++		} else {
++			if (flags & CREATE_EXACT) {
++				DBG(("%s: failed to set exact tiling (gem_set_tiling)\n", __FUNCTION__));
++				gem_close(kgem->fd, handle);
++				free(bo);
++				return NULL;
++			}
+ 		}
+ 	}
+ 
+@@ -5608,7 +6156,7 @@ static void __kgem_flush(struct kgem *kgem, struct kgem_bo *bo)
+ 
+ void kgem_scanout_flush(struct kgem *kgem, struct kgem_bo *bo)
+ {
+-	if (!bo->needs_flush)
++	if (!bo->needs_flush && !bo->gtt_dirty)
+ 		return;
+ 
+ 	kgem_bo_submit(kgem, bo);
+@@ -5621,18 +6169,24 @@ void kgem_scanout_flush(struct kgem *kgem, struct kgem_bo *bo)
+ 	if (bo->rq)
+ 		__kgem_flush(kgem, bo);
+ 
++	if (bo->scanout && kgem->needs_dirtyfb) {
++		struct drm_mode_fb_dirty_cmd cmd;
++		memset(&cmd, 0, sizeof(cmd));
++		cmd.fb_id = bo->delta;
++		(void)drmIoctl(kgem->fd, DRM_IOCTL_MODE_DIRTYFB, &cmd);
++	}
++
+ 	/* Whatever actually happens, we can regard the GTT write domain
+ 	 * as being flushed.
+ 	 */
+-	bo->gtt_dirty = false;
+-	bo->needs_flush = false;
+-	bo->domain = DOMAIN_NONE;
++	__kgem_bo_clear_dirty(bo);
+ }
+ 
+ inline static bool nearly_idle(struct kgem *kgem)
+ {
+ 	int ring = kgem->ring == KGEM_BLT;
+ 
++	assert(ring < ARRAY_SIZE(kgem->requests));
+ 	if (list_is_singular(&kgem->requests[ring]))
+ 		return true;
+ 
+@@ -5720,7 +6274,7 @@ static inline bool kgem_flush(struct kgem *kgem, bool flush)
+ 	if (kgem->nreloc == 0)
+ 		return true;
+ 
+-	if (container_of(kgem, struct sna, kgem)->flags & SNA_POWERSAVE)
++	if (__to_sna(kgem)->flags & SNA_POWERSAVE)
+ 		return true;
+ 
+ 	if (kgem->flush == flush && kgem->aperture < kgem->aperture_low)
+@@ -5982,6 +6536,55 @@ bool kgem_check_many_bo_fenced(struct kgem *kgem, ...)
+ 	return kgem_flush(kgem, flush);
+ }
+ 
++void __kgem_bcs_set_tiling(struct kgem *kgem,
++			   struct kgem_bo *src,
++			   struct kgem_bo *dst)
++{ /* Emits a BCS_SWCTRL register load when the src/dst Y-tiling bits change. */
++	uint32_t state, *b;
++
++	DBG(("%s: src handle=%d:tiling=%d, dst handle=%d:tiling=%d\n",
++	     __FUNCTION__,
++	     src ? src->handle : 0, src ? src->tiling : 0,
++	     dst ? dst->handle : 0, dst ? dst->tiling : 0));
++	assert(kgem->mode == KGEM_BLT);
++	assert(dst == NULL || kgem_bo_can_blt(kgem, dst));
++	assert(src == NULL || kgem_bo_can_blt(kgem, src));
++	/* Either bo may be NULL; only a Y-tiled bo contributes its BCS_*_Y bit. */
++	state = 0;
++	if (dst && dst->tiling == I915_TILING_Y)
++		state |= BCS_DST_Y;
++	if (src && src->tiling == I915_TILING_Y)
++		state |= BCS_SRC_Y;
++	/* Nothing to emit if the state cached in kgem->bcs_state already matches. */
++	if (kgem->bcs_state == state)
++		return;
++
++	DBG(("%s: updating SWCTRL %x -> %x\n", __FUNCTION__,
++	     kgem->bcs_state, state));
++
++	/* Over-estimate space in case we need to re-emit the cmd packet */
++	if (!kgem_check_batch(kgem, 24)) {
++		_kgem_submit(kgem);
++		_kgem_set_mode(kgem, KGEM_BLT);
++		if (state == 0) /* NOTE(review): assumes a new batch starts at state 0 - confirm */
++			return;
++	}
++	/* A non-empty batch gets a 4-dword MI_FLUSH_DW ahead of the register load. */
++	b = kgem->batch + kgem->nbatch;
++	if (kgem->nbatch) {
++		*b++ = MI_FLUSH_DW;
++		*b++ = 0;
++		*b++ = 0;
++		*b++ = 0;
++	}
++	*b++ = MI_LOAD_REGISTER_IMM;
++	*b++ = BCS_SWCTRL;
++	*b++ = (BCS_SRC_Y | BCS_DST_Y) << 16 | state;
++	kgem->nbatch = b - kgem->batch;
++
++	kgem->bcs_state = state;
++}
++
+ uint32_t kgem_add_reloc(struct kgem *kgem,
+ 			uint32_t pos,
+ 			struct kgem_bo *bo,
+@@ -6195,12 +6798,6 @@ static void kgem_trim_vma_cache(struct kgem *kgem, int type, int bucket)
+ 
+ 		list_del(&bo->vma);
+ 		kgem->vma[type].count--;
+-
+-		if (!bo->purged && !kgem_bo_set_purgeable(kgem, bo)) {
+-			DBG(("%s: freeing unpurgeable old mapping\n",
+-			     __FUNCTION__));
+-			kgem_bo_free(kgem, bo);
+-		}
+ 	}
+ }
+ 
+@@ -6216,8 +6813,8 @@ static void *__kgem_bo_map__gtt_or_wc(struct kgem *kgem, struct kgem_bo *bo)
+ 	kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo));
+ 
+ 	if (bo->tiling || !kgem->has_wc_mmap) {
+-		assert(num_pages(bo) <= kgem->aperture_mappable / 2);
+ 		assert(kgem->gen != 021 || bo->tiling != I915_TILING_Y);
++		warn_unless(num_pages(bo) <= kgem->aperture_mappable / 2);
+ 
+ 		ptr = bo->map__gtt;
+ 		if (ptr == NULL)
+@@ -6291,6 +6888,7 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo)
+ 			DBG(("%s: sync: GPU hang detected\n", __FUNCTION__));
+ 			kgem_throttle(kgem);
+ 		}
++		bo->needs_flush = false;
+ 		kgem_bo_retire(kgem, bo);
+ 		bo->domain = DOMAIN_GTT;
+ 		bo->gtt_dirty = true;
+@@ -6319,14 +6917,16 @@ void *kgem_bo_map__wc(struct kgem *kgem, struct kgem_bo *bo)
+ 	     bo->handle, (long)bo->presumed_offset, bo->tiling, bo->map__gtt, bo->map__cpu, bo->domain));
+ 
+ 	assert(bo->proxy == NULL);
+-	assert(bo->exec == NULL);
+ 	assert(list_is_empty(&bo->list));
+ 	assert_tiling(kgem, bo);
+ 	assert(!bo->purged || bo->reusable);
+ 
+ 	if (bo->map__wc)
+ 		return bo->map__wc;
++	if (!kgem->has_wc_mmap)
++		return NULL;
+ 
++	kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo));
+ 	return __kgem_bo_map__wc(kgem, bo);
+ }
+ 
+@@ -6373,6 +6973,8 @@ uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo)
+ {
+ 	struct drm_gem_flink flink;
+ 
++	assert(kgem_bo_is_fenced(kgem, bo));
++
+ 	VG_CLEAR(flink);
+ 	flink.handle = bo->handle;
+ 	if (do_ioctl(kgem->fd, DRM_IOCTL_GEM_FLINK, &flink))
+@@ -6387,7 +6989,6 @@ uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo)
+ 	 * party, we track the lifetime accurately.
+ 	 */
+ 	bo->reusable = false;
+-
+ 	kgem_bo_unclean(kgem, bo);
+ 
+ 	return flink.name;
+@@ -6411,16 +7012,34 @@ struct kgem_bo *kgem_create_map(struct kgem *kgem,
+ 	first_page = (uintptr_t)ptr;
+ 	last_page = first_page + size + PAGE_SIZE - 1;
+ 
+-	first_page &= ~(PAGE_SIZE-1);
+-	last_page &= ~(PAGE_SIZE-1);
++	first_page &= ~(uintptr_t)(PAGE_SIZE-1);
++	last_page &= ~(uintptr_t)(PAGE_SIZE-1);
+ 	assert(last_page > first_page);
+ 
+ 	handle = gem_userptr(kgem->fd,
+ 			     (void *)first_page, last_page-first_page,
+ 			     read_only);
+ 	if (handle == 0) {
+-		DBG(("%s: import failed, errno=%d\n", __FUNCTION__, errno));
+-		return NULL;
++		if (read_only && kgem->has_wc_mmap) {
++			struct drm_i915_gem_set_domain set_domain;
++
++			handle = gem_userptr(kgem->fd,
++					     (void *)first_page, last_page-first_page,
++					     false);
++
++			VG_CLEAR(set_domain);
++			set_domain.handle = handle;
++			set_domain.read_domains = I915_GEM_DOMAIN_GTT;
++			set_domain.write_domain = 0;
++			if (do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain)) {
++				gem_close(kgem->fd, handle);
++				handle = 0;
++			}
++		}
++		if (handle == 0) {
++			DBG(("%s: import failed, errno=%d\n", __FUNCTION__, errno));
++			return NULL;
++		}
+ 	}
+ 
+ 	bo = __kgem_bo_alloc(handle, (last_page - first_page) / PAGE_SIZE);
+@@ -6483,8 +7102,10 @@ void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo)
+ 			DBG(("%s: sync: GPU hang detected\n", __FUNCTION__));
+ 			kgem_throttle(kgem);
+ 		}
++		bo->needs_flush = false;
+ 		kgem_bo_retire(kgem, bo);
+ 		bo->domain = DOMAIN_CPU;
++		bo->gtt_dirty = true;
+ 	}
+ }
+ 
+@@ -6505,6 +7126,9 @@ void kgem_bo_sync__cpu_full(struct kgem *kgem, struct kgem_bo *bo, bool write)
+ 	assert(bo->refcnt);
+ 	assert(!bo->purged);
+ 
++	if (bo->rq == NULL && (kgem->has_llc || bo->snoop) && !write)
++		return;
++
+ 	if (bo->domain != DOMAIN_CPU || FORCE_MMAP_SYNC & (1 << DOMAIN_CPU)) {
+ 		struct drm_i915_gem_set_domain set_domain;
+ 
+@@ -6522,9 +7146,11 @@ void kgem_bo_sync__cpu_full(struct kgem *kgem, struct kgem_bo *bo, bool write)
+ 			DBG(("%s: sync: GPU hang detected\n", __FUNCTION__));
+ 			kgem_throttle(kgem);
+ 		}
++		bo->needs_flush = false;
+ 		if (write) {
+ 			kgem_bo_retire(kgem, bo);
+ 			bo->domain = DOMAIN_CPU;
++			bo->gtt_dirty = true;
+ 		} else {
+ 			if (bo->exec == NULL)
+ 				kgem_bo_maybe_retire(kgem, bo);
+@@ -6539,6 +7165,7 @@ void kgem_bo_sync__gtt(struct kgem *kgem, struct kgem_bo *bo)
+ 	assert(bo->refcnt);
+ 	assert(bo->proxy == NULL);
+ 	assert_tiling(kgem, bo);
++	assert(!bo->snoop);
+ 
+ 	kgem_bo_submit(kgem, bo);
+ 
+@@ -6559,6 +7186,7 @@ void kgem_bo_sync__gtt(struct kgem *kgem, struct kgem_bo *bo)
+ 			DBG(("%s: sync: GPU hang detected\n", __FUNCTION__));
+ 			kgem_throttle(kgem);
+ 		}
++		bo->needs_flush = false;
+ 		kgem_bo_retire(kgem, bo);
+ 		bo->domain = DOMAIN_GTT;
+ 		bo->gtt_dirty = true;
+@@ -7485,6 +8113,7 @@ kgem_replace_bo(struct kgem *kgem,
+ 		}
+ 		_kgem_set_mode(kgem, KGEM_BLT);
+ 	}
++	kgem_bcs_set_tiling(kgem, src, dst);
+ 
+ 	br00 = XY_SRC_COPY_BLT_CMD;
+ 	br13 = pitch;
+@@ -7553,6 +8182,9 @@ bool kgem_bo_convert_to_gpu(struct kgem *kgem,
+ 	     __FUNCTION__, bo->handle, flags, __kgem_bo_is_busy(kgem, bo)));
+ 	assert(bo->tiling == I915_TILING_NONE);
+ 
++	if (flags & (__MOVE_PRIME | __MOVE_SCANOUT))
++		return false;
++
+ 	if (kgem->has_llc)
+ 		return true;
+ 
+diff --git a/src/sna/kgem.h b/src/sna/kgem.h
+index 2267bacf..08b4eb20 100644
+--- a/src/sna/kgem.h
++++ b/src/sna/kgem.h
+@@ -42,6 +42,7 @@ struct kgem_bo {
+ #define RQ(rq) ((struct kgem_request *)((uintptr_t)(rq) & ~3))
+ #define RQ_RING(rq) ((uintptr_t)(rq) & 3)
+ #define RQ_IS_BLT(rq) (RQ_RING(rq) == KGEM_BLT)
++#define RQ_IS_RENDER(rq) (RQ_RING(rq) == KGEM_RENDER)
+ #define MAKE_REQUEST(rq, ring) ((struct kgem_request *)((uintptr_t)(rq) | (ring)))
+ 
+ 	struct drm_i915_gem_exec_object2 *exec;
+@@ -103,7 +104,7 @@ struct kgem_request {
+ 	struct list list;
+ 	struct kgem_bo *bo;
+ 	struct list buffers;
+-	int ring;
++	unsigned ring;
+ };
+ 
+ enum {
+@@ -112,6 +113,12 @@ enum {
+ 	NUM_MAP_TYPES,
+ };
+ 
++typedef void (*memcpy_box_func)(const void *src, void *dst, int bpp,
++				int32_t src_stride, int32_t dst_stride,
++				int16_t src_x, int16_t src_y,
++				int16_t dst_x, int16_t dst_y,
++				uint16_t width, uint16_t height);
++
+ struct kgem {
+ 	unsigned wedged;
+ 	int fd;
+@@ -157,6 +164,8 @@ struct kgem {
+ 		int16_t count;
+ 	} vma[NUM_MAP_TYPES];
+ 
++	uint32_t bcs_state;
++
+ 	uint32_t batch_flags;
+ 	uint32_t batch_flags_base;
+ #define I915_EXEC_SECURE (1<<9)
+@@ -186,9 +195,15 @@ struct kgem {
+ 	uint32_t has_no_reloc :1;
+ 	uint32_t has_handle_lut :1;
+ 	uint32_t has_wc_mmap :1;
++	uint32_t has_dirtyfb :1;
+ 
++	uint32_t can_fence :1;
+ 	uint32_t can_blt_cpu :1;
++	uint32_t can_blt_y :1;
+ 	uint32_t can_render_y :1;
++	uint32_t can_scanout_y :1;
++
++	uint32_t needs_dirtyfb :1;
+ 
+ 	uint16_t fence_max;
+ 	uint16_t half_cpu_cache_pages;
+@@ -203,16 +218,9 @@ struct kgem {
+ 	void (*retire)(struct kgem *kgem);
+ 	void (*expire)(struct kgem *kgem);
+ 
+-	void (*memcpy_to_tiled_x)(const void *src, void *dst, int bpp,
+-				  int32_t src_stride, int32_t dst_stride,
+-				  int16_t src_x, int16_t src_y,
+-				  int16_t dst_x, int16_t dst_y,
+-				  uint16_t width, uint16_t height);
+-	void (*memcpy_from_tiled_x)(const void *src, void *dst, int bpp,
+-				    int32_t src_stride, int32_t dst_stride,
+-				    int16_t src_x, int16_t src_y,
+-				    int16_t dst_x, int16_t dst_y,
+-				    uint16_t width, uint16_t height);
++	memcpy_box_func memcpy_to_tiled_x;
++	memcpy_box_func memcpy_from_tiled_x;
++	memcpy_box_func memcpy_between_tiled_x;
+ 
+ 	struct kgem_bo *batch_bo;
+ 
+@@ -230,7 +238,7 @@ struct kgem {
+ 
+ #define KGEM_MAX_DEFERRED_VBO 16
+ 
+-#define KGEM_BATCH_RESERVED 1
++#define KGEM_BATCH_RESERVED 8 /* LRI(SWCTRL) + END */
+ #define KGEM_RELOC_RESERVED (KGEM_MAX_DEFERRED_VBO)
+ #define KGEM_EXEC_RESERVED (1+KGEM_MAX_DEFERRED_VBO)
+ 
+@@ -317,6 +325,7 @@ bool kgem_bo_convert_to_gpu(struct kgem *kgem,
+ 			    struct kgem_bo *bo,
+ 			    unsigned flags);
+ 
++bool kgem_bo_is_fenced(struct kgem *kgem, struct kgem_bo *bo);
+ uint32_t kgem_bo_get_binding(struct kgem_bo *bo, uint32_t format);
+ void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset);
+ 
+@@ -342,6 +351,11 @@ static inline bool kgem_ring_is_idle(struct kgem *kgem, int ring)
+ {
+ 	ring = ring == KGEM_BLT;
+ 
++	if (kgem->needs_semaphore &&
++	    !list_is_empty(&kgem->requests[!ring]) &&
++	    !__kgem_ring_is_idle(kgem, !ring))
++		return false;
++
+ 	if (list_is_empty(&kgem->requests[ring]))
+ 		return true;
+ 
+@@ -390,6 +404,7 @@ void _kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo);
+ static inline void kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
+ {
+ 	assert(bo->refcnt);
++	assert(bo->refcnt > bo->active_scanout);
+ 	if (--bo->refcnt == 0)
+ 		_kgem_bo_destroy(kgem, bo);
+ }
+@@ -400,13 +415,13 @@ static inline void kgem_set_mode(struct kgem *kgem,
+ 				 enum kgem_mode mode,
+ 				 struct kgem_bo *bo)
+ {
+-	assert(!kgem->wedged);
++	warn_unless(!kgem->wedged);
+ 
+ #if DEBUG_FLUSH_BATCH
+ 	kgem_submit(kgem);
+ #endif
+ 
+-	if (kgem->nreloc && bo->exec == NULL && kgem_ring_is_idle(kgem, kgem->ring)) {
++	if (kgem->nreloc && bo->rq == NULL && kgem_ring_is_idle(kgem, kgem->ring)) {
+ 		DBG(("%s: flushing before new bo\n", __FUNCTION__));
+ 		_kgem_submit(kgem);
+ 	}
+@@ -422,7 +437,7 @@ static inline void _kgem_set_mode(struct kgem *kgem, enum kgem_mode mode)
+ {
+ 	assert(kgem->mode == KGEM_NONE);
+ 	assert(kgem->nbatch == 0);
+-	assert(!kgem->wedged);
++	warn_unless(!kgem->wedged);
+ 	kgem->context_switch(kgem, mode);
+ 	kgem->mode = mode;
+ }
+@@ -566,7 +581,7 @@ static inline bool kgem_bo_can_blt(struct kgem *kgem,
+ {
+ 	assert(bo->refcnt);
+ 
+-	if (bo->tiling == I915_TILING_Y) {
++	if (bo->tiling == I915_TILING_Y && !kgem->can_blt_y) {
+ 		DBG(("%s: can not blt to handle=%d, tiling=Y\n",
+ 		     __FUNCTION__, bo->handle));
+ 		return false;
+@@ -581,6 +596,22 @@ static inline bool kgem_bo_can_blt(struct kgem *kgem,
+ 	return kgem_bo_blt_pitch_is_ok(kgem, bo);
+ }
+ 
++void __kgem_bcs_set_tiling(struct kgem *kgem,
++			   struct kgem_bo *src,
++			   struct kgem_bo *dst);
++
++inline static void kgem_bcs_set_tiling(struct kgem *kgem,
++				       struct kgem_bo *src,
++				       struct kgem_bo *dst)
++{ /* Inline gate in front of __kgem_bcs_set_tiling(). */
++	assert(kgem->mode == KGEM_BLT);
++	/* Skip the out-of-line call entirely when can_blt_y is clear. */
++	if (!kgem->can_blt_y)
++		return;
++
++	__kgem_bcs_set_tiling(kgem, src, dst);
++}
++
+ static inline bool kgem_bo_is_snoop(struct kgem_bo *bo)
+ {
+ 	assert(bo->refcnt);
+@@ -607,17 +638,24 @@ static inline void kgem_bo_mark_busy(struct kgem *kgem, struct kgem_bo *bo, int
+ 	}
+ }
+ 
+-inline static void __kgem_bo_clear_busy(struct kgem_bo *bo)
++static inline void __kgem_bo_clear_dirty(struct kgem_bo *bo)
+ {
+ 	DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle));
+-	bo->rq = NULL;
+-	list_del(&bo->request);
+ 
+ 	bo->domain = DOMAIN_NONE;
+ 	bo->needs_flush = false;
+ 	bo->gtt_dirty = false;
+ }
+ 
++inline static void __kgem_bo_clear_busy(struct kgem_bo *bo)
++{
++	DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle));
++	bo->rq = NULL;
++	list_del(&bo->request);
++
++	__kgem_bo_clear_dirty(bo);
++}
++
+ static inline bool kgem_bo_is_busy(struct kgem_bo *bo)
+ {
+ 	DBG(("%s: handle=%d, domain: %d exec? %d, rq? %d\n", __FUNCTION__,
+@@ -626,7 +664,7 @@ static inline bool kgem_bo_is_busy(struct kgem_bo *bo)
+ 	return bo->rq;
+ }
+ 
+-void __kgem_retire_requests_upto(struct kgem *kgem, struct kgem_bo *bo);
++bool __kgem_retire_requests_upto(struct kgem *kgem, struct kgem_bo *bo);
+ static inline bool __kgem_bo_is_busy(struct kgem *kgem, struct kgem_bo *bo)
+ {
+ 	DBG(("%s: handle=%d, domain: %d exec? %d, rq? %d\n", __FUNCTION__,
+@@ -636,14 +674,13 @@ static inline bool __kgem_bo_is_busy(struct kgem *kgem, struct kgem_bo *bo)
+ 	if (bo->exec)
+ 		return true;
+ 
+-	if (bo->rq && !__kgem_busy(kgem, bo->handle)) {
+-		__kgem_retire_requests_upto(kgem, bo);
+-		assert(list_is_empty(&bo->request));
+-		assert(bo->rq == NULL);
+-		assert(bo->domain == DOMAIN_NONE);
+-	}
++	if (bo->rq == NULL)
++		return false;
++
++	if (__kgem_busy(kgem, bo->handle))
++		return true;
+ 
+-	return kgem_bo_is_busy(bo);
++	return __kgem_retire_requests_upto(kgem, bo);
+ }
+ 
+ static inline bool kgem_bo_is_render(struct kgem_bo *bo)
+@@ -651,7 +688,15 @@ static inline bool kgem_bo_is_render(struct kgem_bo *bo)
+ 	DBG(("%s: handle=%d, rq? %d [%d]\n", __FUNCTION__,
+ 	     bo->handle, bo->rq != NULL, (int)RQ_RING(bo->rq)));
+ 	assert(bo->refcnt);
+-	return bo->rq && RQ_RING(bo->rq) == I915_EXEC_RENDER;
++	return bo->rq && RQ_RING(bo->rq) != KGEM_BLT;
++}
++
++static inline bool kgem_bo_is_blt(struct kgem_bo *bo)
++{ /* True when the ring bits packed into bo->rq equal KGEM_BLT. */
++	DBG(("%s: handle=%d, rq? %d [%d]\n", __FUNCTION__,
++	     bo->handle, bo->rq != NULL, (int)RQ_RING(bo->rq)));
++	assert(bo->refcnt);
++	return RQ_RING(bo->rq) == KGEM_BLT; /* RQ_RING() masks the low two pointer bits */
+ }
+ 
+ static inline void kgem_bo_mark_unreusable(struct kgem_bo *bo)
+@@ -852,6 +897,6 @@ memcpy_from_tiled_x(struct kgem *kgem,
+ 					 width, height);
+ }
+ 
+-void choose_memcpy_tiled_x(struct kgem *kgem, int swizzling);
++void choose_memcpy_tiled_x(struct kgem *kgem, int swizzling, unsigned cpu);
+ 
+ #endif /* KGEM_H */
+diff --git a/src/sna/kgem_debug_gen4.c b/src/sna/kgem_debug_gen4.c
+index 9b80dc88..8e6e47b6 100644
+--- a/src/sna/kgem_debug_gen4.c
++++ b/src/sna/kgem_debug_gen4.c
+@@ -598,7 +598,7 @@ int kgem_gen4_decode_3d(struct kgem *kgem, uint32_t offset)
+ 		assert(len == 7);
+ 		kgem_debug_print(data, offset, 0,
+ 			  "3DSTATE_DEPTH_BUFFER\n");
+-		kgem_debug_print(data, offset, 1, "%s, %s, pitch = %d bytes, %stiled, HiZ %d, Seperate Stencil %d\n",
++		kgem_debug_print(data, offset, 1, "%s, %s, pitch = %d bytes, %stiled, HiZ %d, Separate Stencil %d\n",
+ 			  get_965_surfacetype(data[1] >> 29),
+ 			  get_965_depthformat((data[1] >> 18) & 0x7),
+ 			  (data[1] & 0x0001ffff) + 1,
+diff --git a/src/sna/kgem_debug_gen5.c b/src/sna/kgem_debug_gen5.c
+index 8b55dd91..f1b1275f 100644
+--- a/src/sna/kgem_debug_gen5.c
++++ b/src/sna/kgem_debug_gen5.c
+@@ -573,7 +573,7 @@ int kgem_gen5_decode_3d(struct kgem *kgem, uint32_t offset)
+ 		assert(len == 7);
+ 		kgem_debug_print(data, offset, 0,
+ 			  "3DSTATE_DEPTH_BUFFER\n");
+-		kgem_debug_print(data, offset, 1, "%s, %s, pitch = %d bytes, %stiled, HiZ %d, Seperate Stencil %d\n",
++		kgem_debug_print(data, offset, 1, "%s, %s, pitch = %d bytes, %stiled, HiZ %d, Separate Stencil %d\n",
+ 			  get_965_surfacetype(data[1] >> 29),
+ 			  get_965_depthformat((data[1] >> 18) & 0x7),
+ 			  (data[1] & 0x0001ffff) + 1,
+diff --git a/src/sna/kgem_debug_gen6.c b/src/sna/kgem_debug_gen6.c
+index 7ef55d38..579c5d54 100644
+--- a/src/sna/kgem_debug_gen6.c
++++ b/src/sna/kgem_debug_gen6.c
+@@ -985,7 +985,7 @@ int kgem_gen6_decode_3d(struct kgem *kgem, uint32_t offset)
+ 		assert(len == 7);
+ 		kgem_debug_print(data, offset, 0,
+ 			  "3DSTATE_DEPTH_BUFFER\n");
+-		kgem_debug_print(data, offset, 1, "%s, %s, pitch = %d bytes, %stiled, HiZ %d, Seperate Stencil %d\n",
++		kgem_debug_print(data, offset, 1, "%s, %s, pitch = %d bytes, %stiled, HiZ %d, Separate Stencil %d\n",
+ 			  get_965_surfacetype(data[1] >> 29),
+ 			  get_965_depthformat((data[1] >> 18) & 0x7),
+ 			  (data[1] & 0x0001ffff) + 1,
+diff --git a/src/sna/sna.h b/src/sna/sna.h
+index 18425e30..7861110a 100644
+--- a/src/sna/sna.h
++++ b/src/sna/sna.h
+@@ -154,6 +154,8 @@ struct sna_pixmap {
+ #define MAPPED_GTT 1
+ #define MAPPED_CPU 2
+ 	uint8_t flush :2;
++#define FLUSH_READ 1
++#define FLUSH_WRITE 2
+ 	uint8_t shm :1;
+ 	uint8_t clear :1;
+ 	uint8_t header :1;
+@@ -179,18 +181,31 @@ static inline WindowPtr get_root_window(ScreenPtr screen)
+ #endif
+ }
+ 
++#if !NDEBUG
++static PixmapPtr check_pixmap(PixmapPtr pixmap)
++{ /* Debug builds: sanity-check a non-NULL pixmap, then return it unchanged. */
++	if (pixmap != NULL) {
++		assert(pixmap->refcnt >= 1);
++		assert(pixmap->devKind != 0xdeadbeef);
++	}
++	return pixmap;
++}
++#else
++#define check_pixmap(p) (p) /* parenthesized so the expansion stays one operand */
++#endif
++
+ static inline PixmapPtr get_window_pixmap(WindowPtr window)
+ {
+ 	assert(window);
+ 	assert(window->drawable.type != DRAWABLE_PIXMAP);
+-	return fbGetWindowPixmap(window);
++	return check_pixmap(fbGetWindowPixmap(window));
+ }
+ 
+ static inline PixmapPtr get_drawable_pixmap(DrawablePtr drawable)
+ {
+ 	assert(drawable);
+ 	if (drawable->type == DRAWABLE_PIXMAP)
+-		return (PixmapPtr)drawable;
++		return check_pixmap((PixmapPtr)drawable);
+ 	else
+ 		return get_window_pixmap((WindowPtr)drawable);
+ }
+@@ -244,11 +259,12 @@ struct sna {
+ #define SNA_NO_VSYNC		0x40
+ #define SNA_TRIPLE_BUFFER	0x80
+ #define SNA_TEAR_FREE		0x100
+-#define SNA_FORCE_SHADOW	0x200
+-#define SNA_FLUSH_GTT		0x400
++#define SNA_WANT_TEAR_FREE	0x200
++#define SNA_FORCE_SHADOW	0x400
++#define SNA_FLUSH_GTT		0x800
+ #define SNA_PERFORMANCE		0x1000
+ #define SNA_POWERSAVE		0x2000
+-#define SNA_REMOVE_OUTPUTS	0x4000
++#define SNA_NO_DPMS		0x4000
+ #define SNA_HAS_FLIP		0x10000
+ #define SNA_HAS_ASYNC_FLIP	0x20000
+ #define SNA_LINEAR_FB		0x40000
+@@ -265,7 +281,13 @@ struct sna {
+ #define AVX 0x80
+ #define AVX2 0x100
+ 
+-	unsigned watch_flush;
++	bool ignore_copy_area : 1;
++
++	unsigned watch_shm_flush;
++	unsigned watch_dri_flush;
++	unsigned damage_event;
++	bool needs_shm_flush;
++	bool needs_dri_flush;
+ 
+ 	struct timeval timer_tv;
+ 	uint32_t timer_expire[NUM_TIMERS];
+@@ -284,9 +306,17 @@ struct sna {
+ 		struct kgem_bo *shadow;
+ 		unsigned front_active;
+ 		unsigned shadow_active;
++		unsigned rr_active;
+ 		unsigned flip_active;
++		unsigned hidden;
++		bool shadow_enabled;
++		bool shadow_wait;
+ 		bool dirty;
+ 
++		struct drm_event_vblank *shadow_events;
++		int shadow_nevent;
++		int shadow_size;
++
+ 		int max_crtc_width, max_crtc_height;
+ 		RegionRec shadow_region;
+ 		RegionRec shadow_cancel;
+@@ -318,7 +348,8 @@ struct sna {
+ 		uint32_t fg, bg;
+ 		int size;
+ 
+-		int active;
++		bool disable;
++		bool active;
+ 		int last_x;
+ 		int last_y;
+ 
+@@ -331,8 +362,9 @@ struct sna {
+ 	} cursor;
+ 
+ 	struct sna_dri2 {
+-		bool available;
+-		bool open;
++		bool available : 1;
++		bool enable : 1;
++		bool open : 1;
+ 
+ #if HAVE_DRI2
+ 		void *flip_pending;
+@@ -341,8 +373,11 @@ struct sna {
+ 	} dri2;
+ 
+ 	struct sna_dri3 {
+-		bool available;
+-		bool open;
++		bool available :1;
++		bool override : 1;
++		bool enable : 1;
++		bool open :1;
++
+ #if HAVE_DRI3
+ 		SyncScreenCreateFenceFunc create_fence;
+ 		struct list pixmaps;
+@@ -353,6 +388,9 @@ struct sna {
+ 		bool available;
+ 		bool open;
+ #if HAVE_PRESENT
++		struct list vblank_queue;
++		uint64_t unflip;
++		void *freed_info;
+ #endif
+ 	} present;
+ 
+@@ -364,8 +402,10 @@ struct sna {
+ 	EntityInfoPtr pEnt;
+ 	const struct intel_device_info *info;
+ 
++#if !HAVE_NOTIFY_FD
+ 	ScreenBlockHandlerProcPtr BlockHandler;
+ 	ScreenWakeupHandlerProcPtr WakeupHandler;
++#endif
+ 	CloseScreenProcPtr CloseScreen;
+ 
+ 	PicturePtr clear;
+@@ -383,6 +423,7 @@ struct sna {
+ 		struct gen6_render_state gen6;
+ 		struct gen7_render_state gen7;
+ 		struct gen8_render_state gen8;
++		struct gen9_render_state gen9;
+ 	} render_state;
+ 
+ 	/* Broken-out options. */
+@@ -420,7 +461,7 @@ bool sna_mode_pre_init(ScrnInfoPtr scrn, struct sna *sna);
+ bool sna_mode_fake_init(struct sna *sna, int num_fake);
+ bool sna_mode_wants_tear_free(struct sna *sna);
+ void sna_mode_adjust_frame(struct sna *sna, int x, int y);
+-extern void sna_mode_discover(struct sna *sna);
++extern void sna_mode_discover(struct sna *sna, bool tell);
+ extern void sna_mode_check(struct sna *sna);
+ extern bool sna_mode_disable(struct sna *sna);
+ extern void sna_mode_enable(struct sna *sna);
+@@ -434,6 +475,7 @@ extern void sna_shadow_unset_crtc(struct sna *sna, xf86CrtcPtr crtc);
+ extern bool sna_pixmap_discard_shadow_damage(struct sna_pixmap *priv,
+ 					     const RegionRec *region);
+ extern void sna_mode_set_primary(struct sna *sna);
++extern bool sna_mode_find_hotplug_connector(struct sna *sna, unsigned id);
+ extern void sna_mode_close(struct sna *sna);
+ extern void sna_mode_fini(struct sna *sna);
+ 
+@@ -444,6 +486,7 @@ extern bool sna_cursors_init(ScreenPtr screen, struct sna *sna);
+ typedef void (*sna_flip_handler_t)(struct drm_event_vblank *e,
+ 				   void *data);
+ 
++extern bool sna_needs_page_flip(struct sna *sna, struct kgem_bo *bo);
+ extern int sna_page_flip(struct sna *sna,
+ 			 struct kgem_bo *bo,
+ 			 sna_flip_handler_t handler,
+@@ -461,6 +504,11 @@ to_sna_from_screen(ScreenPtr screen)
+ 	return to_sna(xf86ScreenToScrn(screen));
+ }
+ 
++pure static inline ScreenPtr to_screen_from_sna(struct sna *sna)
++{
++	return xf86ScrnToScreen(sna->scrn); /* ScreenPtr for sna->scrn; counterpart of to_sna_from_screen() */
++}
++
+ pure static inline struct sna *
+ to_sna_from_pixmap(PixmapPtr pixmap)
+ {
+@@ -498,12 +546,11 @@ to_sna_from_kgem(struct kgem *kgem)
+ extern xf86CrtcPtr sna_covering_crtc(struct sna *sna,
+ 				     const BoxRec *box,
+ 				     xf86CrtcPtr desired);
++extern xf86CrtcPtr sna_primary_crtc(struct sna *sna);
+ 
+ extern bool sna_wait_for_scanline(struct sna *sna, PixmapPtr pixmap,
+ 				  xf86CrtcPtr crtc, const BoxRec *clip);
+ 
+-xf86CrtcPtr sna_mode_first_crtc(struct sna *sna);
+-
+ const struct ust_msc {
+ 	uint64_t msc;
+ 	int tv_sec;
+@@ -536,6 +583,11 @@ static inline uint64_t ust64(int tv_sec, int tv_usec)
+ 	return (uint64_t)tv_sec * 1000000 + tv_usec;
+ }
+ 
++static inline uint64_t swap_ust(const struct ust_msc *swap)
++{
++	return ust64(swap->tv_sec, swap->tv_usec); /* ust64() yields tv_sec * 1000000 + tv_usec */
++}
++
+ #if HAVE_DRI2
+ bool sna_dri2_open(struct sna *sna, ScreenPtr pScreen);
+ void sna_dri2_page_flip_handler(struct sna *sna, struct drm_event_vblank *event);
+@@ -567,20 +619,59 @@ bool sna_present_open(struct sna *sna, ScreenPtr pScreen);
+ void sna_present_update(struct sna *sna);
+ void sna_present_close(struct sna *sna, ScreenPtr pScreen);
+ void sna_present_vblank_handler(struct drm_event_vblank *event);
++void sna_present_cancel_flip(struct sna *sna);
+ #else
+ static inline bool sna_present_open(struct sna *sna, ScreenPtr pScreen) { return false; }
+ static inline void sna_present_update(struct sna *sna) { }
+ static inline void sna_present_close(struct sna *sna, ScreenPtr pScreen) { }
+ static inline void sna_present_vblank_handler(struct drm_event_vblank *event) { }
++static inline void sna_present_cancel_flip(struct sna *sna) { }
+ #endif
+ 
+-extern bool sna_crtc_set_sprite_rotation(xf86CrtcPtr crtc, uint32_t rotation);
+-extern int sna_crtc_to_pipe(xf86CrtcPtr crtc);
+-extern uint32_t sna_crtc_to_sprite(xf86CrtcPtr crtc);
+-extern uint32_t sna_crtc_id(xf86CrtcPtr crtc);
+-extern bool sna_crtc_is_on(xf86CrtcPtr crtc);
++extern unsigned sna_crtc_count_sprites(xf86CrtcPtr crtc);
++extern bool sna_crtc_set_sprite_rotation(xf86CrtcPtr crtc, unsigned idx, uint32_t rotation);
++extern uint32_t sna_crtc_to_sprite(xf86CrtcPtr crtc, unsigned idx);
+ extern bool sna_crtc_is_transformed(xf86CrtcPtr crtc);
+ 
++#define CRTC_VBLANK 0x3
++#define CRTC_ON 0x80000000
++
++uint32_t sna_crtc_id(xf86CrtcPtr crtc);
++
++static inline unsigned long *sna_crtc_flags(xf86CrtcPtr crtc)
++{
++	unsigned long *flags = crtc->driver_private; /* driver_private is read as a pointer to the flags word */
++	assert(flags); /* driver_private must be non-NULL here */
++	return flags;
++}
++
++static inline unsigned sna_crtc_pipe(xf86CrtcPtr crtc)
++{
++	return (*sna_crtc_flags(crtc) >> 8) & 0xff; /* bits 8-15 of the flags word */
++}
++
++static inline bool sna_crtc_is_on(xf86CrtcPtr crtc)
++{
++	return (*sna_crtc_flags(crtc) & CRTC_ON) != 0; /* true when the CRTC_ON bit is set */
++}
++
++static inline void sna_crtc_set_vblank(xf86CrtcPtr crtc)
++{
++	assert((*sna_crtc_flags(crtc) & CRTC_VBLANK) < 3);
++	*sna_crtc_flags(crtc) += 1; /* bump the count held in the CRTC_VBLANK bits */
++}
++
++static inline void sna_crtc_clear_vblank(xf86CrtcPtr crtc)
++{
++	assert(*sna_crtc_flags(crtc) & CRTC_VBLANK);
++	*sna_crtc_flags(crtc) -= 1; /* drop one from the count held in the CRTC_VBLANK bits */
++}
++
++static inline bool sna_crtc_has_vblank(xf86CrtcPtr crtc)
++{
++	return (*sna_crtc_flags(crtc) & CRTC_VBLANK) != 0; /* true while the CRTC_VBLANK bits are non-zero */
++}
++
+ CARD32 sna_format_for_depth(int depth);
+ CARD32 sna_render_format_for_depth(int depth);
+ 
+@@ -998,15 +1089,14 @@ static inline uint32_t pixmap_size(PixmapPtr pixmap)
+ 
+ bool sna_accel_init(ScreenPtr sreen, struct sna *sna);
+ void sna_accel_create(struct sna *sna);
+-void sna_accel_block_handler(struct sna *sna, struct timeval **tv);
+-void sna_accel_wakeup_handler(struct sna *sna);
+-void sna_accel_watch_flush(struct sna *sna, int enable);
++void sna_accel_block(struct sna *sna, struct timeval **tv);
+ void sna_accel_flush(struct sna *sna);
+ void sna_accel_enter(struct sna *sna);
+ void sna_accel_leave(struct sna *sna);
+ void sna_accel_close(struct sna *sna);
+ void sna_accel_free(struct sna *sna);
+ 
++void sna_watch_flush(struct sna *sna, int enable);
+ void sna_copy_fbcon(struct sna *sna);
+ 
+ bool sna_composite_create(struct sna *sna);
+@@ -1127,6 +1217,16 @@ memcpy_blt(const void *src, void *dst, int bpp,
+ 	   uint16_t width, uint16_t height);
+ 
+ void
++affine_blt(const void *src, void *dst, int bpp,
++	   int16_t src_x, int16_t src_y,
++	   int16_t src_width, int16_t src_height,
++	   int32_t src_stride,
++	   int16_t dst_x, int16_t dst_y,
++	   uint16_t dst_width, uint16_t dst_height,
++	   int32_t dst_stride,
++	   const struct pixman_f_transform *t);
++
++void
+ memmove_box(const void *src, void *dst,
+ 	    int bpp, int32_t stride,
+ 	    const BoxRec *box,
+@@ -1182,6 +1282,31 @@ box_intersect(BoxPtr a, const BoxRec *b)
+ 	return true;
+ }
+ 
++const BoxRec *
++__find_clip_box_for_y(const BoxRec *begin, const BoxRec *end, int16_t y);
++inline static const BoxRec *
++find_clip_box_for_y(const BoxRec *begin, const BoxRec *end, int16_t y)
++{
++	/* An empty range (incremental trapezoid clipping) has nothing to search */
++	if (begin == end)
++		return end;
++
++	if (begin->y2 > y) {
++		/* Shortcut: y is already before the first box's y2 */
++		assert(end == begin + 1 ||
++		       __find_clip_box_for_y(begin, end, y) == begin);
++		return begin;
++	} else if (y >= end[-1].y2) {
++		/* Shortcut: y is at or past the last box's y2 */
++		assert(end == begin + 1 ||
++		       __find_clip_box_for_y(begin, end, y) == end);
++		return end;
++	}
++
++	/* No shortcut applies: defer to the out-of-line bisection */
++	return __find_clip_box_for_y(begin, end, y);
++}
++
+ unsigned sna_cpu_detect(void);
+ char *sna_cpu_features_to_string(unsigned features, char *line);
+ 
+@@ -1237,4 +1362,17 @@ static inline void sigtrap_put(void)
+ extern int getline(char **line, size_t *len, FILE *file);
+ #endif
+ 
++static inline void add_shm_flush(struct sna *sna, struct sna_pixmap *priv)
++{
++	/* Only pixmaps with priv->shm set are queued; all others are left alone */
++	if (priv->shm) {
++		DBG(("%s: marking handle=%d for SHM flush\n",
++		     __FUNCTION__, priv->cpu_bo->handle));
++
++		assert(!priv->flush);
++		sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
++		sna->needs_shm_flush = true;
++	}
++}
++
+ #endif /* _SNA_H */
+diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
+index baf5f609..25a075cf 100644
+--- a/src/sna/sna_accel.c
++++ b/src/sna/sna_accel.c
+@@ -50,8 +50,11 @@
+ #endif
+ #include <shmint.h>
+ 
++#include <X11/extensions/damageproto.h>
++
+ #include <sys/time.h>
+ #include <sys/mman.h>
++#include <sys/ioctl.h>
+ #include <unistd.h>
+ 
+ #ifdef HAVE_VALGRIND
+@@ -66,7 +69,8 @@
+ #define FORCE_FLUSH 0
+ #define FORCE_FULL_SYNC 0 /* https://bugs.freedesktop.org/show_bug.cgi?id=61628 */
+ 
+-#define DEFAULT_TILING I915_TILING_X
++#define DEFAULT_PIXMAP_TILING I915_TILING_X
++#define DEFAULT_SCANOUT_TILING I915_TILING_X
+ 
+ #define USE_INPLACE 1
+ #define USE_SPANS 0 /* -1 force CPU, 1 force GPU */
+@@ -115,6 +119,11 @@
+ #define RECTILINEAR	0x4
+ #define OVERWRITES	0x8
+ 
++#if XFONT2_CLIENT_FUNCS_VERSION >= 1
++#define AllocateFontPrivateIndex() xfont2_allocate_font_private_index()
++#define FontSetPrivate(font, idx, data) xfont2_font_set_private(font, idx, data)
++#endif
++
+ #if 0
+ static void __sna_fallback_flush(DrawablePtr d)
+ {
+@@ -213,6 +222,7 @@ static GCOps sna_gc_ops__tmp;
+ static const GCFuncs sna_gc_funcs;
+ static const GCFuncs sna_gc_funcs__cpu;
+ 
++static void sna_shm_watch_flush(struct sna *sna, int enable);
+ static void
+ sna_poly_fill_rect__gpu(DrawablePtr draw, GCPtr gc, int n, xRectangle *rect);
+ 
+@@ -527,10 +537,10 @@ sna_pixmap_alloc_cpu(struct sna *sna,
+ 		DBG(("%s: allocating CPU buffer (%dx%d)\n", __FUNCTION__,
+ 		     pixmap->drawable.width, pixmap->drawable.height));
+ 
+-		hint = 0;
+-		if ((flags & MOVE_ASYNC_HINT) == 0 &&
+-		    ((flags & MOVE_READ) == 0 || (priv->gpu_damage && !priv->clear && !sna->kgem.has_llc)))
+-			hint = CREATE_CPU_MAP | CREATE_INACTIVE | CREATE_NO_THROTTLE;
++		hint = CREATE_CPU_MAP | CREATE_INACTIVE | CREATE_NO_THROTTLE;
++		if ((flags & MOVE_ASYNC_HINT) ||
++		    (priv->gpu_damage && !priv->clear && kgem_bo_is_busy(priv->gpu_bo) && sna->kgem.can_blt_cpu))
++			hint = 0;
+ 
+ 		priv->cpu_bo = kgem_create_cpu_2d(&sna->kgem,
+ 						  pixmap->drawable.width,
+@@ -580,7 +590,7 @@ static void __sna_pixmap_free_cpu(struct sna *sna, struct sna_pixmap *priv)
+ 		if (priv->cpu_bo->flush) {
+ 			assert(!priv->cpu_bo->reusable);
+ 			kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
+-			sna_accel_watch_flush(sna, -1);
++			sna_shm_watch_flush(sna, -1);
+ 		}
+ 		kgem_bo_destroy(&sna->kgem, priv->cpu_bo);
+ 	} else if (!IS_STATIC_PTR(priv->ptr))
+@@ -612,9 +622,9 @@ static bool sna_pixmap_free_cpu(struct sna *sna, struct sna_pixmap *priv, bool a
+ 
+ static inline uint32_t default_tiling(struct sna *sna, PixmapPtr pixmap)
+ {
+-#if DEFAULT_TILING == I915_TILING_NONE
++#if DEFAULT_PIXMAP_TILING == I915_TILING_NONE
+ 	return I915_TILING_NONE;
+-#elif DEFAULT_TILING == I915_TILING_X
++#elif DEFAULT_PIXMAP_TILING == I915_TILING_X
+ 	return I915_TILING_X;
+ #else
+ 	/* Try to avoid hitting the Y-tiling GTT mapping bug on 855GM */
+@@ -630,15 +640,6 @@ static inline uint32_t default_tiling(struct sna *sna, PixmapPtr pixmap)
+ 	     pixmap->drawable.height > sna->render.max_3d_size))
+ 		return I915_TILING_X;
+ 
+-	if (sna_damage_is_all(&sna_pixmap(pixmap)->cpu_damage,
+-			      pixmap->drawable.width,
+-			      pixmap->drawable.height)) {
+-		DBG(("%s: entire source is damaged, using Y-tiling\n",
+-		     __FUNCTION__));
+-		sna_damage_destroy(&sna_pixmap(priv)->gpu_damage);
+-		return I915_TILING_Y;
+-	}
+-
+ 	return I915_TILING_Y;
+ #endif
+ }
+@@ -666,6 +667,7 @@ struct kgem_bo *sna_pixmap_change_tiling(PixmapPtr pixmap, uint32_t tiling)
+ 	     __FUNCTION__, priv->gpu_bo->tiling, tiling,
+ 	     pixmap->drawable.width, pixmap->drawable.height));
+ 	assert(priv->gpu_damage == NULL || priv->gpu_bo);
++	assert(priv->gpu_bo->tiling != tiling);
+ 
+ 	if (priv->pinned) {
+ 		DBG(("%s: can't convert pinned bo\n", __FUNCTION__));
+@@ -690,6 +692,12 @@ struct kgem_bo *sna_pixmap_change_tiling(PixmapPtr pixmap, uint32_t tiling)
+ 		return NULL;
+ 	}
+ 
++	if (bo->tiling == priv->gpu_bo->tiling) {
++		DBG(("%s: tiling request failed\n", __FUNCTION__));
++		kgem_bo_destroy(&sna->kgem, bo);
++		return NULL;
++	}
++
+ 	box.x1 = box.y1 = 0;
+ 	box.x2 = pixmap->drawable.width;
+ 	box.y2 = pixmap->drawable.height;
+@@ -824,8 +832,8 @@ create_pixmap(struct sna *sna, ScreenPtr screen,
+ 		datasize += adjust;
+ 	}
+ 
+-	DBG(("%s: allocating pixmap %dx%d, depth=%d, size=%ld\n",
+-	     __FUNCTION__, width, height, depth, (long)datasize));
++	DBG(("%s: allocating pixmap %dx%d, depth=%d/%d, size=%ld\n",
++	     __FUNCTION__, width, height, depth, bpp, (long)datasize));
+ 	pixmap = AllocatePixmap(screen, datasize);
+ 	if (!pixmap)
+ 		return NullPixmap;
+@@ -878,7 +886,11 @@ __pop_freed_pixmap(struct sna *sna)
+ 	pixmap = sna->freed_pixmap;
+ 	sna->freed_pixmap = pixmap->devPrivate.ptr;
+ 
++	DBG(("%s: reusing freed pixmap=%ld header\n",
++	     __FUNCTION__, pixmap->drawable.serialNumber));
++
+ 	assert(pixmap->refcnt == 0);
++	assert(pixmap->devKind == 0xdeadbeef); /* compare (not assign) against the poison stored by __sna_free_pixmap */
+ 	assert(sna_pixmap(pixmap));
+ 	assert(sna_pixmap(pixmap)->header);
+ 
+@@ -990,7 +1002,7 @@ fallback:
+ 	}
+ 	priv->cpu_bo->pitch = pitch;
+ 	kgem_bo_mark_unreusable(priv->cpu_bo);
+-	sna_accel_watch_flush(sna, 1);
++	sna_shm_watch_flush(sna, 1);
+ #ifdef DEBUG_MEMORY
+ 	sna->debug_memory.cpu_bo_allocs++;
+ 	sna->debug_memory.cpu_bo_bytes += kgem_bo_size(priv->cpu_bo);
+@@ -1081,6 +1093,18 @@ sna_pixmap_create_scratch(ScreenPtr screen,
+ 	return pixmap;
+ }
+ 
++static unsigned small_copy(const RegionRec *region)
++{
++	/* COPY_SMALL is returned when the extents cover fewer than 1024 pixels */
++	const int w = region->extents.x2 - region->extents.x1;
++	const int h = region->extents.y2 - region->extents.y1;
++
++	if (w * h >= 1024)
++		return 0;
++	DBG(("%s: region:%dx%d\n", __FUNCTION__, w, h));
++	return COPY_SMALL;
++}
++
+ #ifdef CREATE_PIXMAP_USAGE_SHARED
+ static Bool
+ sna_share_pixmap_backing(PixmapPtr pixmap, ScreenPtr slave, void **fd_handle)
+@@ -1124,7 +1148,7 @@ sna_share_pixmap_backing(PixmapPtr pixmap, ScreenPtr slave, void **fd_handle)
+ 				    pixmap->drawable.height,
+ 				    pixmap->drawable.bitsPerPixel,
+ 				    I915_TILING_NONE,
+-				    CREATE_GTT_MAP | CREATE_PRIME | CREATE_EXACT);
++				    CREATE_GTT_MAP | CREATE_SCANOUT | CREATE_PRIME | CREATE_EXACT);
+ 		if (bo == NULL) {
+ 			DBG(("%s: allocation failed\n", __FUNCTION__));
+ 			return FALSE;
+@@ -1243,7 +1267,7 @@ sna_create_pixmap_shared(struct sna *sna, ScreenPtr screen,
+ 					      width, height,
+ 					      pixmap->drawable.bitsPerPixel,
+ 					      I915_TILING_NONE,
+-					      CREATE_GTT_MAP | CREATE_PRIME | CREATE_EXACT);
++					      CREATE_GTT_MAP | CREATE_SCANOUT | CREATE_PRIME | CREATE_EXACT);
+ 		if (priv->gpu_bo == NULL) {
+ 			free(priv);
+ 			FreePixmap(pixmap);
+@@ -1311,7 +1335,7 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
+ 
+ 	if (unlikely((sna->render.prefer_gpu & PREFER_GPU_RENDER) == 0))
+ 		flags &= ~KGEM_CAN_CREATE_GPU;
+-	if (wedged(sna))
++	if (wedged(sna) && usage != SNA_CREATE_FB)
+ 		flags &= ~KGEM_CAN_CREATE_GTT;
+ 
+ 	DBG(("%s: usage=%d, flags=%x\n", __FUNCTION__, usage, flags));
+@@ -1417,10 +1441,13 @@ static void __sna_free_pixmap(struct sna *sna,
+ 	__sna_pixmap_free_cpu(sna, priv);
+ 
+ 	if (priv->flush)
+-		sna_accel_watch_flush(sna, -1);
++		sna_watch_flush(sna, -1);
+ 
++#if !NDEBUG
++	pixmap->devKind = 0xdeadbeef;
++#endif
+ 	if (priv->header) {
+-		assert(pixmap->drawable.pScreen == sna->scrn->pScreen);
++		assert(pixmap->drawable.pScreen == to_screen_from_sna(sna));
+ 		assert(!priv->shm);
+ 		pixmap->devPrivate.ptr = sna->freed_pixmap;
+ 		sna->freed_pixmap = pixmap;
+@@ -1485,7 +1512,7 @@ static Bool sna_destroy_pixmap(PixmapPtr pixmap)
+ 	if (priv->shm && kgem_bo_is_busy(priv->cpu_bo)) {
+ 		DBG(("%s: deferring release of active SHM pixmap=%ld\n",
+ 		     __FUNCTION__, pixmap->drawable.serialNumber));
+-		sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
++		add_shm_flush(sna, priv);
+ 		kgem_bo_submit(&sna->kgem, priv->cpu_bo); /* XXX ShmDetach */
+ 	} else
+ 		__sna_free_pixmap(sna, pixmap, priv);
+@@ -1529,7 +1556,7 @@ static inline bool has_coherent_ptr(struct sna *sna, struct sna_pixmap *priv, un
+ 		if (!priv->cpu_bo)
+ 			return true;
+ 
+-		assert(!priv->cpu_bo->needs_flush);
++		assert(!priv->cpu_bo->needs_flush || (flags & MOVE_WRITE) == 0);
+ 		assert(priv->pixmap->devKind == priv->cpu_bo->pitch);
+ 		return priv->pixmap->devPrivate.ptr == MAP(priv->cpu_bo->map__cpu);
+ 	}
+@@ -1557,6 +1584,11 @@ static inline bool has_coherent_ptr(struct sna *sna, struct sna_pixmap *priv, un
+ 		return true;
+ 	}
+ 
++	if (priv->pixmap->devPrivate.ptr == MAP(priv->gpu_bo->map__wc)) {
++		assert(priv->mapped == MAPPED_GTT);
++		return true;
++	}
++
+ 	return false;
+ }
+ 
+@@ -1577,6 +1609,16 @@ static inline bool pixmap_inplace(struct sna *sna,
+ 		return false;
+ 
+ 	if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo)) {
++		if (priv->clear) {
++			DBG(("%s: no, clear GPU bo is busy\n", __FUNCTION__));
++			return false;
++		}
++
++		if (flags & MOVE_ASYNC_HINT) {
++			DBG(("%s: no, async hint and GPU bo is busy\n", __FUNCTION__));
++			return false;
++		}
++
+ 		if ((flags & (MOVE_WRITE | MOVE_READ)) == (MOVE_WRITE | MOVE_READ)) {
+ 			DBG(("%s: no, GPU bo is busy\n", __FUNCTION__));
+ 			return false;
+@@ -1624,7 +1666,7 @@ static bool sna_pixmap_alloc_gpu(struct sna *sna,
+ 	if (pixmap->usage_hint == SNA_CREATE_FB && (sna->flags & SNA_LINEAR_FB) == 0) {
+ 		flags |= CREATE_SCANOUT;
+ 		tiling = kgem_choose_tiling(&sna->kgem,
+-					    -I915_TILING_X,
++					    -DEFAULT_SCANOUT_TILING,
+ 					    pixmap->drawable.width,
+ 					    pixmap->drawable.height,
+ 					    pixmap->drawable.bitsPerPixel);
+@@ -1861,7 +1903,9 @@ sna_pixmap_undo_cow(struct sna *sna, struct sna_pixmap *priv, unsigned flags)
+ 	assert(priv->gpu_bo == cow->bo);
+ 	assert(cow->refcnt);
+ 
+-	if (flags && (flags & MOVE_WRITE) == 0 && IS_COW_OWNER(priv->cow))
++	if (flags && /* flags == 0 => force decouple */
++	    (flags & MOVE_WRITE) == 0 &&
++	    (((flags & __MOVE_FORCE) == 0) || IS_COW_OWNER(priv->cow)))
+ 		return true;
+ 
+ 	if (!IS_COW_OWNER(priv->cow))
+@@ -1933,7 +1977,7 @@ sna_pixmap_undo_cow(struct sna *sna, struct sna_pixmap *priv, unsigned flags)
+ 			box.y2 = pixmap->drawable.height;
+ 
+ 			if (flags & __MOVE_PRIME) {
+-				create = CREATE_GTT_MAP | CREATE_PRIME | CREATE_EXACT;
++				create = CREATE_GTT_MAP | CREATE_SCANOUT | CREATE_PRIME | CREATE_EXACT;
+ 				tiling = I915_TILING_NONE;
+ 			} else {
+ 				create = 0;
+@@ -2021,6 +2065,10 @@ sna_pixmap_make_cow(struct sna *sna,
+ 		     cow->bo->handle));
+ 
+ 		src_priv->cow = MAKE_COW_OWNER(cow);
++		if (src_priv->flush & FLUSH_WRITE) {
++			assert(!src_priv->shm);
++			sna_add_flush_pixmap(sna, src_priv, src_priv->gpu_bo);
++		}
+ 	}
+ 
+ 	if (cow == COW(dst_priv->cow)) {
+@@ -2267,6 +2315,7 @@ skip_inplace_map:
+ 	    (flags & MOVE_WRITE ? (void *)priv->gpu_bo : (void *)priv->gpu_damage) && priv->cpu_damage == NULL &&
+ 	    priv->gpu_bo->tiling == I915_TILING_NONE &&
+ 	    (flags & MOVE_READ || kgem_bo_can_map__cpu(&sna->kgem, priv->gpu_bo, flags & MOVE_WRITE)) &&
++	    (!priv->clear || !kgem_bo_is_busy(priv->gpu_bo)) &&
+ 	    ((flags & (MOVE_WRITE | MOVE_ASYNC_HINT)) == 0 ||
+ 	     (!priv->cow && !priv->move_to_gpu && !__kgem_bo_is_busy(&sna->kgem, priv->gpu_bo)))) {
+ 		void *ptr;
+@@ -2330,7 +2379,9 @@ skip_inplace_map:
+ 			     pixmap->devKind, pixmap->devKind * pixmap->drawable.height));
+ 
+ 			if (priv->cpu_bo) {
++				kgem_bo_undo(&sna->kgem, priv->cpu_bo);
+ 				if ((flags & MOVE_ASYNC_HINT || priv->cpu_bo->exec) &&
++				    sna->kgem.can_blt_cpu &&
+ 				    sna->render.fill_one(sna,
+ 							  pixmap, priv->cpu_bo, priv->clear_color,
+ 							  0, 0,
+@@ -2344,21 +2395,26 @@ skip_inplace_map:
+ 				assert(pixmap->devPrivate.ptr == MAP(priv->cpu_bo->map__cpu));
+ 			}
+ 
+-			assert(pixmap->devKind);
+-			if (priv->clear_color == 0 ||
+-			    pixmap->drawable.bitsPerPixel == 8 ||
+-			    priv->clear_color == (1 << pixmap->drawable.depth) - 1) {
+-				memset(pixmap->devPrivate.ptr, priv->clear_color,
+-				       (size_t)pixmap->devKind * pixmap->drawable.height);
+-			} else {
+-				pixman_fill(pixmap->devPrivate.ptr,
+-					    pixmap->devKind/sizeof(uint32_t),
+-					    pixmap->drawable.bitsPerPixel,
+-					    0, 0,
+-					    pixmap->drawable.width,
+-					    pixmap->drawable.height,
+-					    priv->clear_color);
+-			}
++			if (sigtrap_get() == 0) {
++				assert(pixmap->devKind);
++				sigtrap_assert_active();
++				if (priv->clear_color == 0 ||
++				    pixmap->drawable.bitsPerPixel == 8 ||
++				    priv->clear_color == (1 << pixmap->drawable.depth) - 1) {
++					memset(pixmap->devPrivate.ptr, priv->clear_color,
++					       (size_t)pixmap->devKind * pixmap->drawable.height);
++				} else {
++					pixman_fill(pixmap->devPrivate.ptr,
++						    pixmap->devKind/sizeof(uint32_t),
++						    pixmap->drawable.bitsPerPixel,
++						    0, 0,
++						    pixmap->drawable.width,
++						    pixmap->drawable.height,
++						    priv->clear_color);
++				}
++				sigtrap_put();
++			} else
++				return false;
+ 
+ clear_done:
+ 			sna_damage_all(&priv->cpu_damage, pixmap);
+@@ -2414,6 +2470,10 @@ done:
+ 			DBG(("%s: discarding idle GPU bo\n", __FUNCTION__));
+ 			sna_pixmap_free_gpu(sna, priv);
+ 		}
++		if (priv->flush) {
++			assert(!priv->shm);
++			sna_add_flush_pixmap(sna, priv, priv->gpu_bo);
++		}
+ 		priv->source_count = SOURCE_BIAS;
+ 	}
+ 
+@@ -2531,6 +2591,9 @@ static bool cpu_clear_boxes(struct sna *sna,
+ {
+ 	struct sna_fill_op fill;
+ 
++	if (!sna->kgem.can_blt_cpu)
++		return false;
++
+ 	if (!sna_fill_init_blt(&fill, sna,
+ 			       pixmap, priv->cpu_bo,
+ 			       GXcopy, priv->clear_color,
+@@ -2659,6 +2722,10 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
+ 					}
+ 				}
+ 				sna_damage_add_to_pixmap(&priv->cpu_damage, region, pixmap);
++				if (priv->flush) {
++					assert(!priv->shm);
++					sna_add_flush_pixmap(sna, priv, priv->gpu_bo);
++				}
+ 
+ 				if (dx | dy)
+ 					RegionTranslate(region, -dx, -dy);
+@@ -2904,17 +2971,22 @@ move_to_cpu:
+ 			assert(pixmap->devPrivate.ptr == MAP(priv->cpu_bo->map__cpu));
+ 		}
+ 
+-		assert(pixmap->devKind);
+-		do {
+-			pixman_fill(pixmap->devPrivate.ptr,
+-				    pixmap->devKind/sizeof(uint32_t),
+-				    pixmap->drawable.bitsPerPixel,
+-				    box->x1, box->y1,
+-				    box->x2 - box->x1,
+-				    box->y2 - box->y1,
+-				    priv->clear_color);
+-			box++;
+-		} while (--n);
++		if (sigtrap_get() == 0) {
++			assert(pixmap->devKind);
++			sigtrap_assert_active();
++			do {
++				pixman_fill(pixmap->devPrivate.ptr,
++					    pixmap->devKind/sizeof(uint32_t),
++					    pixmap->drawable.bitsPerPixel,
++					    box->x1, box->y1,
++					    box->x2 - box->x1,
++					    box->y2 - box->y1,
++					    priv->clear_color);
++				box++;
++			} while (--n);
++			sigtrap_put();
++		} else
++			return false;
+ 
+ clear_done:
+ 		if (flags & MOVE_WRITE ||
+@@ -3209,13 +3281,14 @@ __sna_pixmap_for_gpu(struct sna *sna, PixmapPtr pixmap, unsigned flags)
+ {
+ 	struct sna_pixmap *priv;
+ 
++	assert(flags & (MOVE_READ | MOVE_WRITE | __MOVE_FORCE));
+ 	if ((flags & __MOVE_FORCE) == 0 && wedged(sna))
+ 		return NULL;
+ 
+ 	priv = sna_pixmap(pixmap);
+ 	if (priv == NULL) {
+ 		DBG(("%s: not attached\n", __FUNCTION__));
+-		if ((flags & __MOVE_DRI) == 0)
++		if ((flags & (__MOVE_DRI | __MOVE_SCANOUT)) == 0)
+ 			return NULL;
+ 
+ 		if (pixmap->usage_hint == -1) {
+@@ -3238,6 +3311,44 @@ __sna_pixmap_for_gpu(struct sna *sna, PixmapPtr pixmap, unsigned flags)
+ 	return priv;
+ }
+ 
++inline static void sna_pixmap_unclean(struct sna *sna,
++				      struct sna_pixmap *priv,
++				      unsigned flags)
++{
++	struct drm_i915_gem_busy busy;
++	/* For an all-GPU-damaged pixmap: drop its CPU damage, then re-query the busyness of a flush bo */
++	assert(DAMAGE_IS_ALL(priv->gpu_damage));
++	assert(priv->gpu_bo);
++	assert(priv->gpu_bo->proxy == NULL);
++	assert_pixmap_map(priv->pixmap, priv);
++
++	sna_damage_destroy(&priv->cpu_damage);
++	list_del(&priv->flush_list);
++	/* __MOVE_DRI / __MOVE_SCANOUT requests stop here and skip the busy query */
++	if (flags & (__MOVE_DRI | __MOVE_SCANOUT))
++		return;
++	/* Only pixmaps with priv->flush set, whose bo has no exec entry, are re-queried */
++	if (!priv->flush || priv->gpu_bo->exec)
++		return;
++	/* busy.busy is pre-cleared and the ioctl result is unchecked: a failed query reads as idle */
++	busy.handle = priv->gpu_bo->handle;
++	busy.busy = 0;
++	ioctl(sna->kgem.fd, DRM_IOCTL_I915_GEM_BUSY, &busy);
++
++	DBG(("%s(pixmap=%ld): cleaning foreign bo handle=%u, busy=%x [ring=%d]\n",
++	     __FUNCTION__,
++	     priv->pixmap->drawable.serialNumber,
++	     busy.handle, busy.busy, !!(busy.busy & (0xfffeu << 16))));
++
++	if (busy.busy) {
++		unsigned mode = KGEM_RENDER;
++		if (busy.busy & (0xfffeu << 16)) /* unsigned literal: 0xfffe << 16 does not fit in a signed int */
++			mode = KGEM_BLT;
++		kgem_bo_mark_busy(&sna->kgem, priv->gpu_bo, mode);
++	} else
++		__kgem_bo_clear_busy(priv->gpu_bo);
++}
++
+ struct sna_pixmap *
+ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, const BoxRec *box, unsigned int flags)
+ {
+@@ -3287,12 +3398,14 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, const BoxRec *box, unsigned int fl
+ 	if (priv->cow) {
+ 		unsigned cow = flags & (MOVE_READ | MOVE_WRITE | __MOVE_FORCE);
+ 
++		assert(cow);
++
+ 		if ((flags & MOVE_READ) == 0) {
+ 			if (priv->gpu_damage) {
+ 				r.extents = *box;
+ 				r.data = NULL;
+ 				if (!region_subsumes_damage(&r, priv->gpu_damage))
+-					cow |= MOVE_READ;
++					cow |= MOVE_READ | __MOVE_FORCE;
+ 			}
+ 		} else {
+ 			if (priv->cpu_damage) {
+@@ -3303,22 +3416,18 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, const BoxRec *box, unsigned int fl
+ 			}
+ 		}
+ 
+-		if (cow) {
+-			if (!sna_pixmap_undo_cow(sna, priv, cow))
+-				return NULL;
++		if (!sna_pixmap_undo_cow(sna, priv, cow))
++			return NULL;
+ 
+-			if (priv->gpu_bo == NULL)
+-				sna_damage_destroy(&priv->gpu_damage);
+-		}
++		if (priv->gpu_bo == NULL)
++			sna_damage_destroy(&priv->gpu_damage);
+ 	}
+ 
+ 	if (sna_damage_is_all(&priv->gpu_damage,
+ 			      pixmap->drawable.width,
+ 			      pixmap->drawable.height)) {
+-		assert(priv->gpu_bo);
+-		assert(priv->gpu_bo->proxy == NULL);
+-		sna_damage_destroy(&priv->cpu_damage);
+-		list_del(&priv->flush_list);
++		DBG(("%s: already all-damaged\n", __FUNCTION__));
++		sna_pixmap_unclean(sna, priv, flags);
+ 		goto done;
+ 	}
+ 
+@@ -3360,10 +3469,7 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, const BoxRec *box, unsigned int fl
+ 		return priv;
+ 	}
+ 
+-	if (priv->shm) {
+-		assert(!priv->flush);
+-		sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
+-	}
++	add_shm_flush(sna, priv);
+ 
+ 	assert(priv->cpu_damage);
+ 	region_set(&r, box);
+@@ -3527,7 +3633,8 @@ sna_drawable_use_bo(DrawablePtr drawable, unsigned flags, const BoxRec *box,
+ 	}
+ 
+ 	if (priv->cow) {
+-		unsigned cow = MOVE_WRITE | MOVE_READ;
++		unsigned cow = MOVE_WRITE | MOVE_READ | __MOVE_FORCE;
++		assert(cow);
+ 
+ 		if (flags & IGNORE_DAMAGE) {
+ 			if (priv->gpu_damage) {
+@@ -3717,8 +3824,11 @@ create_gpu_bo:
+ 				else
+ 					move = MOVE_WRITE | MOVE_READ | MOVE_ASYNC_HINT;
+ 
+-				if (sna_pixmap_move_to_gpu(pixmap, move))
++				if (sna_pixmap_move_to_gpu(pixmap, move)) {
++					sna_damage_all(&priv->gpu_damage,
++						       pixmap);
+ 					goto use_gpu_bo;
++				}
+ 			}
+ 
+ 			if (DAMAGE_IS_ALL(priv->gpu_damage) ||
+@@ -3934,26 +4044,28 @@ prefer_gpu_bo:
+ 			goto move_to_gpu;
+ 		}
+ 
+-		if ((priv->cpu_damage == NULL || flags & IGNORE_DAMAGE)) {
+-			if (priv->gpu_bo && priv->gpu_bo->tiling) {
+-				DBG(("%s: prefer to use GPU bo for rendering large pixmaps\n", __FUNCTION__));
+-				goto prefer_gpu_bo;
++		if (!priv->shm) {
++			if ((priv->cpu_damage == NULL || flags & IGNORE_DAMAGE)) {
++				if (priv->gpu_bo && priv->gpu_bo->tiling) {
++					DBG(("%s: prefer to use GPU bo for rendering large pixmaps\n", __FUNCTION__));
++					goto prefer_gpu_bo;
++				}
++
++				if (priv->cpu_bo->pitch >= 4096) {
++					DBG(("%s: prefer to use GPU bo for rendering wide pixmaps\n", __FUNCTION__));
++					goto prefer_gpu_bo;
++				}
+ 			}
+ 
+-			if (priv->cpu_bo->pitch >= 4096) {
+-				DBG(("%s: prefer to use GPU bo for rendering wide pixmaps\n", __FUNCTION__));
++			if ((flags & IGNORE_DAMAGE) == 0 && priv->cpu_bo->snoop) {
++				DBG(("%s: prefer to use GPU bo for reading from snooped target bo\n", __FUNCTION__));
+ 				goto prefer_gpu_bo;
+ 			}
+-		}
+-
+-		if ((flags & IGNORE_DAMAGE) == 0 && priv->cpu_bo->snoop) {
+-			DBG(("%s: prefer to use GPU bo for reading from snooped target bo\n", __FUNCTION__));
+-			goto prefer_gpu_bo;
+-		}
+ 
+-		if (!sna->kgem.can_blt_cpu) {
+-			DBG(("%s: can't render to CPU bo, try to use GPU bo\n", __FUNCTION__));
+-			goto prefer_gpu_bo;
++			if (!sna->kgem.can_blt_cpu) {
++				DBG(("%s: can't render to CPU bo, try to use GPU bo\n", __FUNCTION__));
++				goto prefer_gpu_bo;
++			}
+ 		}
+ 	}
+ 
+@@ -3967,9 +4079,7 @@ prefer_gpu_bo:
+ 	}
+ 
+ 	if (priv->shm) {
+-		assert(!priv->flush);
+-		sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
+-
++		add_shm_flush(sna, priv);
+ 		/* As we may have flushed and retired,, recheck for busy bo */
+ 		if ((flags & FORCE_GPU) == 0 && !kgem_bo_is_busy(priv->cpu_bo))
+ 			return NULL;
+@@ -4019,7 +4129,7 @@ sna_pixmap_create_upload(ScreenPtr screen,
+ 	assert(width);
+ 	assert(height);
+ 
+-	if (depth == 1)
++	if (depth < 8)
+ 		return create_pixmap(sna, screen, width, height, depth,
+ 				     CREATE_PIXMAP_USAGE_SCRATCH);
+ 
+@@ -4121,27 +4231,21 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
+ 
+ 	if (priv->cow) {
+ 		unsigned cow = flags & (MOVE_READ | MOVE_WRITE | __MOVE_FORCE);
++		assert(cow);
+ 		if (flags & MOVE_READ && priv->cpu_damage)
+ 			cow |= MOVE_WRITE;
+-		if (cow) {
+-			if (!sna_pixmap_undo_cow(sna, priv, cow))
+-				return NULL;
++		if (!sna_pixmap_undo_cow(sna, priv, cow))
++			return NULL;
+ 
+-			if (priv->gpu_bo == NULL)
+-				sna_damage_destroy(&priv->gpu_damage);
+-		}
++		if (priv->gpu_bo == NULL)
++			sna_damage_destroy(&priv->gpu_damage);
+ 	}
+ 
+ 	if (sna_damage_is_all(&priv->gpu_damage,
+ 			      pixmap->drawable.width,
+ 			      pixmap->drawable.height)) {
+ 		DBG(("%s: already all-damaged\n", __FUNCTION__));
+-		assert(DAMAGE_IS_ALL(priv->gpu_damage));
+-		assert(priv->gpu_bo);
+-		assert(priv->gpu_bo->proxy == NULL);
+-		assert_pixmap_map(pixmap, priv);
+-		sna_damage_destroy(&priv->cpu_damage);
+-		list_del(&priv->flush_list);
++		sna_pixmap_unclean(sna, priv, flags);
+ 		goto active;
+ 	}
+ 
+@@ -4206,7 +4310,7 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
+ 				if (flags & MOVE_INPLACE_HINT || (priv->cpu_damage && priv->cpu_bo == NULL))
+ 					create = CREATE_GTT_MAP | CREATE_INACTIVE;
+ 				if (flags & __MOVE_PRIME)
+-					create |= CREATE_GTT_MAP | CREATE_PRIME | CREATE_EXACT;
++					create |= CREATE_GTT_MAP | CREATE_SCANOUT | CREATE_PRIME | CREATE_EXACT;
+ 
+ 				sna_pixmap_alloc_gpu(sna, pixmap, priv, create);
+ 			}
+@@ -4282,10 +4386,7 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
+ 		goto done;
+ 	}
+ 
+-	if (priv->shm) {
+-		assert(!priv->flush);
+-		sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
+-	}
++	add_shm_flush(sna, priv);
+ 
+ 	n = sna_damage_get_boxes(priv->cpu_damage, &box);
+ 	assert(n);
+@@ -4534,7 +4635,7 @@ static inline bool box32_trim_and_translate(Box32Rec *box, DrawablePtr d, GCPtr
+ 	return box32_clip(box, gc);
+ }
+ 
+-static inline void box_add_pt(BoxPtr box, int16_t x, int16_t y)
++static inline void box_add_xy(BoxPtr box, int16_t x, int16_t y)
+ {
+ 	if (box->x1 > x)
+ 		box->x1 = x;
+@@ -4547,6 +4648,11 @@ static inline void box_add_pt(BoxPtr box, int16_t x, int16_t y)
+ 		box->y2 = y;
+ }
+ 
++static inline void box_add_pt(BoxPtr box, const DDXPointRec *pt)
++{
++	box_add_xy(box, pt->x, pt->y); /* forward the point's coordinates to box_add_xy() */
++}
++
+ static inline bool box32_to_box16(const Box32Rec *b32, BoxRec *b16)
+ {
+ 	b16->x1 = b32->x1;
+@@ -4864,6 +4970,7 @@ try_upload__inplace(PixmapPtr pixmap, RegionRec *region,
+ 	pixmap->devPrivate.ptr = dst;
+ 	pixmap->devKind = priv->gpu_bo->pitch;
+ 	priv->mapped = dst == MAP(priv->gpu_bo->map__cpu) ? MAPPED_CPU : MAPPED_GTT;
++	priv->cpu &= priv->mapped == MAPPED_CPU;
+ 	assert(has_coherent_ptr(sna, priv, MOVE_WRITE));
+ 
+ 	box = region_rects(region);
+@@ -4923,8 +5030,7 @@ done:
+ 			sna_damage_all(&priv->gpu_damage, pixmap);
+ 		}
+ 
+-		if (priv->shm)
+-			sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
++		add_shm_flush(sna, priv);
+ 	}
+ 
+ 	assert(!priv->clear);
+@@ -5172,6 +5278,16 @@ static inline uint8_t blt_depth(int depth)
+ 	}
+ }
+ 
++inline static void blt_done(struct sna *sna)
++{
++	/* Reset blt_state.fill_bo, then submit a non-empty batch if the ring is empty */
++	sna->blt_state.fill_bo = 0;
++	if (!sna->kgem.nbatch || !__kgem_ring_empty(&sna->kgem))
++		return;
++	DBG(("%s: flushing BLT operation on empty ring\n", __FUNCTION__));
++	_kgem_submit(&sna->kgem);
++}
++
+ static bool
+ sna_put_xybitmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
+ 		     int x, int y, int w, int  h, char *bits)
+@@ -5217,6 +5333,7 @@ sna_put_xybitmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
+ 
+ 	kgem_set_mode(&sna->kgem, KGEM_BLT, bo);
+ 	assert(kgem_bo_can_blt(&sna->kgem, bo));
++	kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 	/* Region is pre-clipped and translated into pixmap space */
+ 	box = region_rects(region);
+@@ -5238,6 +5355,7 @@ sna_put_xybitmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
+ 				return false;
+ 			_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 		}
++		kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 		upload = kgem_create_buffer(&sna->kgem,
+ 					    bstride*bh,
+@@ -5331,7 +5449,7 @@ sna_put_xybitmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
+ 		box++;
+ 	} while (--n);
+ 
+-	sna->blt_state.fill_bo = 0;
++	blt_done(sna);
+ 	return true;
+ }
+ 
+@@ -5381,6 +5499,7 @@ sna_put_xypixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
+ 
+ 	kgem_set_mode(&sna->kgem, KGEM_BLT, bo);
+ 	assert(kgem_bo_can_blt(&sna->kgem, bo));
++	kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 	skip = h * BitmapBytePad(w + left);
+ 	for (i = 1 << (gc->depth-1); i; i >>= 1, bits += skip) {
+@@ -5408,6 +5527,7 @@ sna_put_xypixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
+ 					return false;
+ 				_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 			}
++			kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 			upload = kgem_create_buffer(&sna->kgem,
+ 						    bstride*bh,
+@@ -5509,7 +5629,7 @@ sna_put_xypixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
+ 		} while (--n);
+ 	}
+ 
+-	sna->blt_state.fill_bo = 0;
++	blt_done(sna);
+ 	return true;
+ }
+ 
+@@ -5837,7 +5957,7 @@ sna_self_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
+ 		if (!sna->render.copy_boxes(sna, alu,
+ 					    &pixmap->drawable, priv->gpu_bo, sx, sy,
+ 					    &pixmap->drawable, priv->gpu_bo, tx, ty,
+-					    box, n, 0)) {
++					    box, n, small_copy(region))) {
+ 			DBG(("%s: fallback - accelerated copy boxes failed\n",
+ 			     __FUNCTION__));
+ 			goto fallback;
+@@ -6098,6 +6218,9 @@ sna_copy_boxes__inplace(struct sna *sna, RegionPtr region, int alu,
+ 
+ 	kgem_bo_sync__cpu_full(&sna->kgem, src_priv->gpu_bo, FORCE_FULL_SYNC);
+ 
++	if (sigtrap_get())
++		return false;
++
+ 	box = region_rects(region);
+ 	n = region_num_rects(region);
+ 	if (src_priv->gpu_bo->tiling) {
+@@ -6137,6 +6260,8 @@ sna_copy_boxes__inplace(struct sna *sna, RegionPtr region, int alu,
+ 		}
+ 	}
+ 
++	sigtrap_put();
++
+ 	return true;
+ 
+ upload_inplace:
+@@ -6234,6 +6359,9 @@ upload_inplace:
+ 
+ 	assert(has_coherent_ptr(sna, src_priv, MOVE_READ));
+ 
++	if (sigtrap_get())
++		return false;
++
+ 	box = region_rects(region);
+ 	n = region_num_rects(region);
+ 	if (dst_priv->gpu_bo->tiling) {
+@@ -6265,15 +6393,19 @@ upload_inplace:
+ 		} while (--n);
+ 
+ 		if (!dst_priv->shm) {
+-			assert(ptr == MAP(dst_priv->gpu_bo->map__cpu));
+ 			dst_pixmap->devPrivate.ptr = ptr;
+ 			dst_pixmap->devKind = dst_priv->gpu_bo->pitch;
+-			dst_priv->mapped = MAPPED_CPU;
++			if (ptr == MAP(dst_priv->gpu_bo->map__cpu)) {
++				dst_priv->mapped = MAPPED_CPU;
++				dst_priv->cpu = true;
++			} else
++				dst_priv->mapped = MAPPED_GTT;
+ 			assert_pixmap_map(dst_pixmap, dst_priv);
+-			dst_priv->cpu = true;
+ 		}
+ 	}
+ 
++	sigtrap_put();
++
+ 	return true;
+ }
+ 
+@@ -6326,6 +6458,16 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
+ 
+ 	assert(region_num_rects(region));
+ 
++	if (src_priv &&
++	    src_priv->gpu_bo == NULL &&
++	    src_priv->cpu_bo == NULL &&
++	    src_priv->ptr == NULL) {
++		/* Rare but still happens, nothing to copy */
++		DBG(("%s: src pixmap=%ld is empty\n",
++		     __FUNCTION__, src_pixmap->drawable.serialNumber));
++		return;
++	}
++
+ 	if (src_pixmap == dst_pixmap)
+ 		return sna_self_copy_boxes(src, dst, gc,
+ 					   region, dx, dy,
+@@ -6491,15 +6633,14 @@ discard_cow:
+ 					sna_damage_all(&dst_priv->gpu_damage, dst_pixmap);
+ 					sna_damage_destroy(&dst_priv->cpu_damage);
+ 					list_del(&dst_priv->flush_list);
+-					if (dst_priv->shm)
+-						sna_add_flush_pixmap(sna, dst_priv, dst_priv->cpu_bo);
++					add_shm_flush(sna, dst_priv);
+ 					return;
+ 				}
+ 			}
+ 			if (!sna->render.copy_boxes(sna, alu,
+ 						    &src_pixmap->drawable, src_priv->gpu_bo, src_dx, src_dy,
+ 						    &dst_pixmap->drawable, bo, 0, 0,
+-						    box, n, 0)) {
++						    box, n, small_copy(region))) {
+ 				DBG(("%s: fallback - accelerated copy boxes failed\n",
+ 				     __FUNCTION__));
+ 				goto fallback;
+@@ -6536,7 +6677,7 @@ discard_cow:
+ 			if (!sna->render.copy_boxes(sna, alu,
+ 						    &src_pixmap->drawable, src_priv->gpu_bo, src_dx, src_dy,
+ 						    &dst_pixmap->drawable, bo, 0, 0,
+-						    box, n, 0)) {
++						    box, n, small_copy(region))) {
+ 				DBG(("%s: fallback - accelerated copy boxes failed\n",
+ 				     __FUNCTION__));
+ 				goto fallback;
+@@ -6571,15 +6712,12 @@ discard_cow:
+ 			if (replaces && UNDO)
+ 				kgem_bo_pair_undo(&sna->kgem, dst_priv->gpu_bo, dst_priv->cpu_bo);
+ 
+-			if (src_priv->shm) {
+-				assert(!src_priv->flush);
+-				sna_add_flush_pixmap(sna, src_priv, src_priv->cpu_bo);
+-			}
++			add_shm_flush(sna, src_priv);
+ 
+ 			if (!sna->render.copy_boxes(sna, alu,
+ 						    &src_pixmap->drawable, src_priv->cpu_bo, src_dx, src_dy,
+ 						    &dst_pixmap->drawable, bo, 0, 0,
+-						    box, n, src_priv->shm ? COPY_LAST : 0)) {
++						    box, n, small_copy(region) | (src_priv->shm ? COPY_LAST : 0))) {
+ 				DBG(("%s: fallback - accelerated copy boxes failed\n",
+ 				     __FUNCTION__));
+ 				goto fallback;
+@@ -6631,8 +6769,7 @@ discard_cow:
+ 				ok = sna->render.copy_boxes(sna, alu,
+ 							    &src_pixmap->drawable, src_bo, src_dx, src_dy,
+ 							    &dst_pixmap->drawable, bo, 0, 0,
+-							    box, n, COPY_LAST);
+-
++							    box, n, small_copy(region) |  COPY_LAST);
+ 				kgem_bo_sync__cpu(&sna->kgem, src_bo);
+ 				assert(src_bo->rq == NULL);
+ 				kgem_bo_destroy(&sna->kgem, src_bo);
+@@ -6780,18 +6917,22 @@ fallback:
+ 				return;
+ 		}
+ 
+-		assert(dst_pixmap->devPrivate.ptr);
+-		assert(dst_pixmap->devKind);
+-		do {
+-			pixman_fill(dst_pixmap->devPrivate.ptr,
+-				    dst_pixmap->devKind/sizeof(uint32_t),
+-				    dst_pixmap->drawable.bitsPerPixel,
+-				    box->x1, box->y1,
+-				    box->x2 - box->x1,
+-				    box->y2 - box->y1,
+-				    src_priv->clear_color);
+-			box++;
+-		} while (--n);
++		if (sigtrap_get() == 0) {
++			assert(dst_pixmap->devPrivate.ptr);
++			assert(dst_pixmap->devKind);
++			sigtrap_assert_active();
++			do {
++				pixman_fill(dst_pixmap->devPrivate.ptr,
++					    dst_pixmap->devKind/sizeof(uint32_t),
++					    dst_pixmap->drawable.bitsPerPixel,
++					    box->x1, box->y1,
++					    box->x2 - box->x1,
++					    box->y2 - box->y1,
++					    src_priv->clear_color);
++				box++;
++			} while (--n);
++			sigtrap_put();
++		}
+ 	} else if (!sna_copy_boxes__inplace(sna, region, alu,
+ 					    src_pixmap, src_priv,
+ 					    src_dx, src_dy,
+@@ -6848,36 +6989,39 @@ fallback:
+ 				((char *)src_pixmap->devPrivate.ptr +
+ 				 src_dy * src_stride + src_dx * bpp / 8);
+ 
+-			do {
+-				DBG(("%s: memcpy_blt(box=(%d, %d), (%d, %d), src=(%d, %d), pitches=(%d, %d))\n",
+-				     __FUNCTION__,
+-				     box->x1, box->y1,
+-				     box->x2 - box->x1,
+-				     box->y2 - box->y1,
+-				     src_dx, src_dy,
+-				     src_stride, dst_stride));
+-
+-				assert(box->x1 >= 0);
+-				assert(box->y1 >= 0);
+-				assert(box->x2 <= dst_pixmap->drawable.width);
+-				assert(box->y2 <= dst_pixmap->drawable.height);
+-
+-				assert(box->x1 + src_dx >= 0);
+-				assert(box->y1 + src_dy >= 0);
+-				assert(box->x2 + src_dx <= src_pixmap->drawable.width);
+-				assert(box->y2 + src_dy <= src_pixmap->drawable.height);
+-				assert(has_coherent_ptr(sna, src_priv, MOVE_READ));
+-				assert(has_coherent_ptr(sna, dst_priv, MOVE_WRITE));
+-				assert(src_stride);
+-				assert(dst_stride);
+-				memcpy_blt(src_bits, dst_bits, bpp,
+-					   src_stride, dst_stride,
+-					   box->x1, box->y1,
+-					   box->x1, box->y1,
+-					   box->x2 - box->x1,
+-					   box->y2 - box->y1);
+-				box++;
+-			} while (--n);
++			if (sigtrap_get() == 0) {
++				do {
++					DBG(("%s: memcpy_blt(box=(%d, %d), (%d, %d), src=(%d, %d), pitches=(%d, %d))\n",
++					     __FUNCTION__,
++					     box->x1, box->y1,
++					     box->x2 - box->x1,
++					     box->y2 - box->y1,
++					     src_dx, src_dy,
++					     src_stride, dst_stride));
++
++					assert(box->x1 >= 0);
++					assert(box->y1 >= 0);
++					assert(box->x2 <= dst_pixmap->drawable.width);
++					assert(box->y2 <= dst_pixmap->drawable.height);
++
++					assert(box->x1 + src_dx >= 0);
++					assert(box->y1 + src_dy >= 0);
++					assert(box->x2 + src_dx <= src_pixmap->drawable.width);
++					assert(box->y2 + src_dy <= src_pixmap->drawable.height);
++					assert(has_coherent_ptr(sna, src_priv, MOVE_READ));
++					assert(has_coherent_ptr(sna, dst_priv, MOVE_WRITE));
++					assert(src_stride);
++					assert(dst_stride);
++					memcpy_blt(src_bits, dst_bits, bpp,
++						   src_stride, dst_stride,
++						   box->x1, box->y1,
++						   box->x1, box->y1,
++						   box->x2 - box->x1,
++						   box->y2 - box->y1);
++					box++;
++				} while (--n);
++				sigtrap_put();
++			}
+ 		} else {
+ 			DBG(("%s: fallback -- miCopyRegion\n", __FUNCTION__));
+ 
+@@ -6931,7 +7075,8 @@ sna_do_copy(DrawablePtr src, DrawablePtr dst, GCPtr gc,
+ 
+ 	/* Short cut for unmapped windows */
+ 	if (dst->type == DRAWABLE_WINDOW && !((WindowPtr)dst)->realized) {
+-		DBG(("%s: unmapped\n", __FUNCTION__));
++		DBG(("%s: unmapped/unrealized dst (pixmap=%ld)\n",
++		     __FUNCTION__, get_window_pixmap((WindowPtr)dst)));
+ 		return NULL;
+ 	}
+ 
+@@ -7115,19 +7260,28 @@ sna_copy_area(DrawablePtr src, DrawablePtr dst, GCPtr gc,
+ 	if (gc->planemask == 0)
+ 		return NULL;
+ 
+-	DBG(("%s: src=(%d, %d)x(%d, %d)+(%d, %d) -> dst=(%d, %d)+(%d, %d); alu=%d, pm=%lx, depth=%d\n",
++	if (sna->ignore_copy_area)
++		return NULL;
++
++	DBG(("%s: src=pixmap=%ld:(%d, %d)x(%d, %d)+(%d, %d) -> dst=pixmap=%ld:(%d, %d)+(%d, %d); alu=%d, pm=%lx, depth=%d\n",
+ 	     __FUNCTION__,
++	     get_drawable_pixmap(src)->drawable.serialNumber,
+ 	     src_x, src_y, width, height, src->x, src->y,
++	     get_drawable_pixmap(dst)->drawable.serialNumber,
+ 	     dst_x, dst_y, dst->x, dst->y,
+ 	     gc->alu, gc->planemask, gc->depth));
+ 
+ 	if (FORCE_FALLBACK || !ACCEL_COPY_AREA || wedged(sna) ||
+-	    !PM_IS_SOLID(dst, gc->planemask) || gc->depth < 8)
++	    !PM_IS_SOLID(dst, gc->planemask) || gc->depth < 8) {
++		DBG(("%s: fallback copy\n", __FUNCTION__));
+ 		copy = sna_fallback_copy_boxes;
+-	else if (src == dst)
++	} else if (src == dst) {
++		DBG(("%s: self copy\n", __FUNCTION__));
+ 		copy = sna_self_copy_boxes;
+-	else
++	} else {
++		DBG(("%s: normal copy\n", __FUNCTION__));
+ 		copy = sna_copy_boxes;
++	}
+ 
+ 	return sna_do_copy(src, dst, gc,
+ 			   src_x, src_y,
+@@ -7136,30 +7290,21 @@ sna_copy_area(DrawablePtr src, DrawablePtr dst, GCPtr gc,
+ 			   copy, 0, NULL);
+ }
+ 
+-static const BoxRec *
+-find_clip_box_for_y(const BoxRec *begin, const BoxRec *end, int16_t y)
++const BoxRec *
++__find_clip_box_for_y(const BoxRec *begin, const BoxRec *end, int16_t y)
+ {
+-    const BoxRec *mid;
+-
+-    if (end == begin)
+-	return end;
+-
+-    if (end - begin == 1) {
++	assert(end - begin > 1); /* ranges of fewer than two boxes are not handled here */
++	do { /* iterative bisection: narrow [begin, end] until the two are adjacent */
++		const BoxRec *mid = begin + (end - begin) / 2;
++		if (mid->y2 > y) /* mid ends past y: continue in [begin, mid] (presumes boxes ordered by y2 -- confirm) */
++			end = mid;
++		else
++			begin = mid;
++	} while (end > begin + 1);
+ 	if (begin->y2 > y)
+-	    return begin;
++		return begin;
+ 	else
+-	    return end;
+-    }
+-
+-    mid = begin + (end - begin) / 2;
+-    if (mid->y2 > y)
+-	/* If no box is found in [begin, mid], the function
+-	 * will return @mid, which is then known to be the
+-	 * correct answer.
+-	 */
+-	return find_clip_box_for_y(begin, mid, y);
+-    else
+-	return find_clip_box_for_y(mid, end, y);
++		return end; /* begin does not extend past y, so report end */
+ }
+ 
+ struct sna_fill_spans {
+@@ -8223,6 +8368,8 @@ sna_copy_bitmap_blt(DrawablePtr _bitmap, DrawablePtr drawable, GCPtr gc,
+ 	}
+ 	br13 |= blt_depth(drawable->depth) << 24;
+ 	br13 |= copy_ROP[gc->alu] << 16;
++	DBG(("%s: target-depth=%d, alu=%d, bg=%08x, fg=%08x\n",
++	     __FUNCTION__, drawable->depth, gc->alu, gc->bgPixel, gc->fgPixel));
+ 
+ 	kgem_set_mode(&sna->kgem, KGEM_BLT, arg->bo);
+ 	assert(kgem_bo_can_blt(&sna->kgem, arg->bo));
+@@ -8255,6 +8402,7 @@ sna_copy_bitmap_blt(DrawablePtr _bitmap, DrawablePtr drawable, GCPtr gc,
+ 					return; /* XXX fallback? */
+ 				_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 			}
++			kgem_bcs_set_tiling(&sna->kgem, NULL, arg->bo);
+ 
+ 			assert(sna->kgem.mode == KGEM_BLT);
+ 			if (sna->kgem.gen >= 0100) {
+@@ -8270,8 +8418,8 @@ sna_copy_bitmap_blt(DrawablePtr _bitmap, DrawablePtr drawable, GCPtr gc,
+ 							 I915_GEM_DOMAIN_RENDER |
+ 							 KGEM_RELOC_FENCED,
+ 							 0);
+-				b[5] = gc->bgPixel;
+-				b[6] = gc->fgPixel;
++				b[6] = gc->bgPixel;
++				b[7] = gc->fgPixel;
+ 
+ 				dst = (uint8_t *)&b[8];
+ 				sna->kgem.nbatch += 8 + src_stride;
+@@ -8322,6 +8470,7 @@ sna_copy_bitmap_blt(DrawablePtr _bitmap, DrawablePtr drawable, GCPtr gc,
+ 					return; /* XXX fallback? */
+ 				_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 			}
++			kgem_bcs_set_tiling(&sna->kgem, NULL, arg->bo);
+ 
+ 			upload = kgem_create_buffer(&sna->kgem,
+ 						    bstride*bh,
+@@ -8408,7 +8557,7 @@ sna_copy_bitmap_blt(DrawablePtr _bitmap, DrawablePtr drawable, GCPtr gc,
+ 		sna_damage_add_to_pixmap(arg->damage, region, pixmap);
+ 	}
+ 	assert_pixmap_damage(pixmap);
+-	sna->blt_state.fill_bo = 0;
++	blt_done(sna);
+ }
+ 
+ static void
+@@ -8472,6 +8621,7 @@ sna_copy_plane_blt(DrawablePtr source, DrawablePtr drawable, GCPtr gc,
+ 				return; /* XXX fallback? */
+ 			_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 		}
++		kgem_bcs_set_tiling(&sna->kgem, NULL, arg->bo);
+ 
+ 		upload = kgem_create_buffer(&sna->kgem,
+ 					    bstride*bh,
+@@ -8588,6 +8738,8 @@ sna_copy_plane_blt(DrawablePtr source, DrawablePtr drawable, GCPtr gc,
+ 				}
+ 			}
+ 
++			kgem_bcs_set_tiling(&sna->kgem, upload, arg->bo);
++
+ 			assert(sna->kgem.mode == KGEM_BLT);
+ 			b = sna->kgem.batch + sna->kgem.nbatch;
+ 			if (sna->kgem.gen >= 0100) {
+@@ -8641,7 +8793,7 @@ sna_copy_plane_blt(DrawablePtr source, DrawablePtr drawable, GCPtr gc,
+ 		sna_damage_add_to_pixmap(arg->damage, region, dst_pixmap);
+ 	}
+ 	assert_pixmap_damage(dst_pixmap);
+-	sna->blt_state.fill_bo = 0;
++	blt_done(sna);
+ }
+ 
+ static RegionPtr
+@@ -8895,36 +9047,11 @@ sna_poly_point_extents(DrawablePtr drawable, GCPtr gc,
+ 			last.x += pt->x;
+ 			last.y += pt->y;
+ 			pt++;
+-			box_add_pt(&box, last.x, last.y);
++			box_add_xy(&box, last.x, last.y);
+ 		}
+ 	} else {
+-		--n; ++pt;
+-		while (n >= 8) {
+-			box_add_pt(&box, pt[0].x, pt[0].y);
+-			box_add_pt(&box, pt[1].x, pt[1].y);
+-			box_add_pt(&box, pt[2].x, pt[2].y);
+-			box_add_pt(&box, pt[3].x, pt[3].y);
+-			box_add_pt(&box, pt[4].x, pt[4].y);
+-			box_add_pt(&box, pt[5].x, pt[5].y);
+-			box_add_pt(&box, pt[6].x, pt[6].y);
+-			box_add_pt(&box, pt[7].x, pt[7].y);
+-			pt += 8;
+-			n -= 8;
+-		}
+-		if (n & 4) {
+-			box_add_pt(&box, pt[0].x, pt[0].y);
+-			box_add_pt(&box, pt[1].x, pt[1].y);
+-			box_add_pt(&box, pt[2].x, pt[2].y);
+-			box_add_pt(&box, pt[3].x, pt[3].y);
+-			pt += 4;
+-		}
+-		if (n & 2) {
+-			box_add_pt(&box, pt[0].x, pt[0].y);
+-			box_add_pt(&box, pt[1].x, pt[1].y);
+-			pt += 2;
+-		}
+-		if (n & 1)
+-			box_add_pt(&box, pt[0].x, pt[0].y);
++		while (--n)
++			box_add_pt(&box, ++pt);
+ 	}
+ 	box.x2++;
+ 	box.y2++;
+@@ -9636,7 +9763,7 @@ sna_poly_line_extents(DrawablePtr drawable, GCPtr gc,
+ 			y += pt->y;
+ 			if (blt)
+ 				blt &= pt->x == 0 || pt->y == 0;
+-			box_add_pt(&box, x, y);
++			box_add_xy(&box, x, y);
+ 		}
+ 	} else {
+ 		int x = box.x1;
+@@ -9648,7 +9775,7 @@ sna_poly_line_extents(DrawablePtr drawable, GCPtr gc,
+ 				x = pt->x;
+ 				y = pt->y;
+ 			}
+-			box_add_pt(&box, pt->x, pt->y);
++			box_add_pt(&box, pt);
+ 		}
+ 	}
+ 	box.x2++;
+@@ -10037,7 +10164,7 @@ out:
+ 	RegionUninit(&data.region);
+ }
+ 
+-static inline void box_from_seg(BoxPtr b, const xSegment *seg, GCPtr gc)
++static inline bool box_from_seg(BoxPtr b, const xSegment *seg, GCPtr gc)
+ {
+ 	if (seg->x1 == seg->x2) {
+ 		if (seg->y1 > seg->y2) {
+@@ -10051,6 +10178,9 @@ static inline void box_from_seg(BoxPtr b, const xSegment *seg, GCPtr gc)
+ 			if (gc->capStyle != CapNotLast)
+ 				b->y2++;
+ 		}
++		if (b->y1 >= b->y2)
++			return false;
++
+ 		b->x1 = seg->x1;
+ 		b->x2 = seg->x1 + 1;
+ 	} else {
+@@ -10065,6 +10195,9 @@ static inline void box_from_seg(BoxPtr b, const xSegment *seg, GCPtr gc)
+ 			if (gc->capStyle != CapNotLast)
+ 				b->x2++;
+ 		}
++		if (b->x1 >= b->x2)
++			return false;
++
+ 		b->y1 = seg->y1;
+ 		b->y2 = seg->y1 + 1;
+ 	}
+@@ -10073,6 +10206,7 @@ static inline void box_from_seg(BoxPtr b, const xSegment *seg, GCPtr gc)
+ 	     __FUNCTION__,
+ 	     seg->x1, seg->y1, seg->x2, seg->y2,
+ 	     b->x1, b->y1, b->x2, b->y2));
++	return true;
+ }
+ 
+ static bool
+@@ -10107,12 +10241,13 @@ sna_poly_segment_blt(DrawablePtr drawable,
+ 					nbox = ARRAY_SIZE(boxes);
+ 				n -= nbox;
+ 				do {
+-					box_from_seg(b, seg++, gc);
+-					if (b->y2 > b->y1 && b->x2 > b->x1) {
++					if (box_from_seg(b, seg++, gc)) {
++						assert(!box_empty(b));
+ 						b->x1 += dx;
+ 						b->x2 += dx;
+ 						b->y1 += dy;
+ 						b->y2 += dy;
++						assert(!box_empty(b));
+ 						b++;
+ 					}
+ 				} while (--nbox);
+@@ -10131,7 +10266,10 @@ sna_poly_segment_blt(DrawablePtr drawable,
+ 					nbox = ARRAY_SIZE(boxes);
+ 				n -= nbox;
+ 				do {
+-					box_from_seg(b++, seg++, gc);
++					if (box_from_seg(b, seg++, gc)) {
++						assert(!box_empty(b));
++						b++;
++					}
+ 				} while (--nbox);
+ 
+ 				if (b != boxes) {
+@@ -10156,7 +10294,10 @@ sna_poly_segment_blt(DrawablePtr drawable,
+ 			do {
+ 				BoxRec box;
+ 
+-				box_from_seg(&box, seg++, gc);
++				if (!box_from_seg(&box, seg++, gc))
++					continue;
++
++				assert(!box_empty(&box));
+ 				box.x1 += drawable->x;
+ 				box.x2 += drawable->x;
+ 				box.y1 += drawable->y;
+@@ -10174,6 +10315,7 @@ sna_poly_segment_blt(DrawablePtr drawable,
+ 						b->x2 += dx;
+ 						b->y1 += dy;
+ 						b->y2 += dy;
++						assert(!box_empty(b));
+ 						if (++b == last_box) {
+ 							fill.boxes(sna, &fill, boxes, last_box-boxes);
+ 							if (damage)
+@@ -10185,7 +10327,10 @@ sna_poly_segment_blt(DrawablePtr drawable,
+ 			} while (--n);
+ 		} else {
+ 			do {
+-				box_from_seg(b, seg++, gc);
++				if (!box_from_seg(b, seg++, gc))
++					continue;
++
++				assert(!box_empty(b));
+ 				b->x1 += drawable->x;
+ 				b->x2 += drawable->x;
+ 				b->y1 += drawable->y;
+@@ -10195,6 +10340,7 @@ sna_poly_segment_blt(DrawablePtr drawable,
+ 					b->x2 += dx;
+ 					b->y1 += dy;
+ 					b->y2 += dy;
++					assert(!box_empty(b));
+ 					if (++b == last_box) {
+ 						fill.boxes(sna, &fill, boxes, last_box-boxes);
+ 						if (damage)
+@@ -10319,8 +10465,11 @@ sna_poly_zero_segment_blt(DrawablePtr drawable,
+ 				}
+ 				b->x2++;
+ 				b->y2++;
+-				if (oc1 | oc2)
+-					box_intersect(b, extents);
++
++				if ((oc1 | oc2) && !box_intersect(b, extents))
++					continue;
++
++				assert(!box_empty(b));
+ 				if (++b == last_box) {
+ 					ret = &&rectangle_continue;
+ 					goto *jump;
+@@ -10383,6 +10532,7 @@ rectangle_continue:
+ 						     __FUNCTION__, x1, y1,
+ 						     b->x1, b->y1, b->x2, b->y2));
+ 
++						assert(!box_empty(b));
+ 						if (++b == last_box) {
+ 							ret = &&X_continue;
+ 							goto *jump;
+@@ -10407,6 +10557,7 @@ X_continue:
+ 						b->x2 = x1 + 1;
+ 					b->y2 = b->y1 + 1;
+ 
++					assert(!box_empty(b));
+ 					if (++b == last_box) {
+ 						ret = &&X2_continue;
+ 						goto *jump;
+@@ -10468,6 +10619,7 @@ X2_continue:
+ 							b->y2 = y1 + 1;
+ 						b->x2 = x1 + 1;
+ 
++						assert(!box_empty(b));
+ 						if (++b == last_box) {
+ 							ret = &&Y_continue;
+ 							goto *jump;
+@@ -10491,6 +10643,7 @@ Y_continue:
+ 						b->y2 = y1 + 1;
+ 					b->x2 = x1 + 1;
+ 
++					assert(!box_empty(b));
+ 					if (++b == last_box) {
+ 						ret = &&Y2_continue;
+ 						goto *jump;
+@@ -11785,14 +11938,29 @@ sna_poly_fill_rect_blt(DrawablePtr drawable,
+ 				if (nbox > ARRAY_SIZE(boxes))
+ 					nbox = ARRAY_SIZE(boxes);
+ 				n -= nbox;
+-				do {
++				while (nbox >= 2) {
++					b[0].x1 = rect[0].x + dx;
++					b[0].y1 = rect[0].y + dy;
++					b[0].x2 = b[0].x1 + rect[0].width;
++					b[0].y2 = b[0].y1 + rect[0].height;
++
++					b[1].x1 = rect[1].x + dx;
++					b[1].y1 = rect[1].y + dy;
++					b[1].x2 = b[1].x1 + rect[1].width;
++					b[1].y2 = b[1].y1 + rect[1].height;
++
++					b += 2;
++					rect += 2;
++					nbox -= 2;
++				}
++				if (nbox) {
+ 					b->x1 = rect->x + dx;
+ 					b->y1 = rect->y + dy;
+ 					b->x2 = b->x1 + rect->width;
+ 					b->y2 = b->y1 + rect->height;
+ 					b++;
+ 					rect++;
+-				} while (--nbox);
++				}
+ 				fill.boxes(sna, &fill, boxes, b-boxes);
+ 				b = boxes;
+ 			} while (n);
+@@ -11802,14 +11970,29 @@ sna_poly_fill_rect_blt(DrawablePtr drawable,
+ 				if (nbox > ARRAY_SIZE(boxes))
+ 					nbox = ARRAY_SIZE(boxes);
+ 				n -= nbox;
+-				do {
++				while (nbox >= 2) {
++					b[0].x1 = rect[0].x;
++					b[0].y1 = rect[0].y;
++					b[0].x2 = b[0].x1 + rect[0].width;
++					b[0].y2 = b[0].y1 + rect[0].height;
++
++					b[1].x1 = rect[1].x;
++					b[1].y1 = rect[1].y;
++					b[1].x2 = b[1].x1 + rect[1].width;
++					b[1].y2 = b[1].y1 + rect[1].height;
++
++					b += 2;
++					rect += 2;
++					nbox -= 2;
++				}
++				if (nbox) {
+ 					b->x1 = rect->x;
+ 					b->y1 = rect->y;
+ 					b->x2 = b->x1 + rect->width;
+ 					b->y2 = b->y1 + rect->height;
+ 					b++;
+ 					rect++;
+-				} while (--nbox);
++				}
+ 				fill.boxes(sna, &fill, boxes, b-boxes);
+ 				b = boxes;
+ 			} while (n);
+@@ -12192,6 +12375,7 @@ sna_poly_fill_rect_tiled_8x8_blt(DrawablePtr drawable,
+ 			return false;
+ 		_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 	}
++	kgem_bcs_set_tiling(&sna->kgem, tile_bo, bo);
+ 
+ 	get_drawable_deltas(drawable, pixmap, &dx, &dy);
+ 	assert(extents->x1 + dx >= 0);
+@@ -12335,6 +12519,7 @@ sna_poly_fill_rect_tiled_8x8_blt(DrawablePtr drawable,
+ 
+ 			_kgem_submit(&sna->kgem);
+ 			_kgem_set_mode(&sna->kgem, KGEM_BLT);
++			kgem_bcs_set_tiling(&sna->kgem, tile_bo, bo);
+ 		} while (1);
+ 	} else {
+ 		RegionRec clip;
+@@ -12403,6 +12588,7 @@ sna_poly_fill_rect_tiled_8x8_blt(DrawablePtr drawable,
+ 					if (!kgem_check_batch(&sna->kgem, 3)) {
+ 						_kgem_submit(&sna->kgem);
+ 						_kgem_set_mode(&sna->kgem, KGEM_BLT);
++						kgem_bcs_set_tiling(&sna->kgem, tile_bo, bo);
+ 
+ 						unwind_batch = sna->kgem.nbatch;
+ 						unwind_reloc = sna->kgem.nreloc;
+@@ -12499,6 +12685,7 @@ sna_poly_fill_rect_tiled_8x8_blt(DrawablePtr drawable,
+ 							DBG(("%s: emitting split batch\n", __FUNCTION__));
+ 							_kgem_submit(&sna->kgem);
+ 							_kgem_set_mode(&sna->kgem, KGEM_BLT);
++							kgem_bcs_set_tiling(&sna->kgem, tile_bo, bo);
+ 
+ 							unwind_batch = sna->kgem.nbatch;
+ 							unwind_reloc = sna->kgem.nreloc;
+@@ -12572,7 +12759,7 @@ sna_poly_fill_rect_tiled_8x8_blt(DrawablePtr drawable,
+ 	}
+ done:
+ 	assert_pixmap_damage(pixmap);
+-	sna->blt_state.fill_bo = 0;
++	blt_done(sna);
+ 	return true;
+ }
+ 
+@@ -13128,6 +13315,7 @@ sna_poly_fill_rect_stippled_8x8_blt(DrawablePtr drawable,
+ 			return false;
+ 		_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 	}
++	kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 	if (!clipped) {
+ 		dx += drawable->x;
+@@ -13240,6 +13428,7 @@ sna_poly_fill_rect_stippled_8x8_blt(DrawablePtr drawable,
+ 
+ 			_kgem_submit(&sna->kgem);
+ 			_kgem_set_mode(&sna->kgem, KGEM_BLT);
++			kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 		} while (1);
+ 	} else {
+ 		RegionRec clip;
+@@ -13297,6 +13486,7 @@ sna_poly_fill_rect_stippled_8x8_blt(DrawablePtr drawable,
+ 					if (!kgem_check_batch(&sna->kgem, 3)) {
+ 						_kgem_submit(&sna->kgem);
+ 						_kgem_set_mode(&sna->kgem, KGEM_BLT);
++						kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 						assert(sna->kgem.mode == KGEM_BLT);
+ 						b = sna->kgem.batch + sna->kgem.nbatch;
+@@ -13369,6 +13559,7 @@ sna_poly_fill_rect_stippled_8x8_blt(DrawablePtr drawable,
+ 						if (!kgem_check_batch(&sna->kgem, 3)) {
+ 							_kgem_submit(&sna->kgem);
+ 							_kgem_set_mode(&sna->kgem, KGEM_BLT);
++							kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 							assert(sna->kgem.mode == KGEM_BLT);
+ 							b = sna->kgem.batch + sna->kgem.nbatch;
+@@ -13419,7 +13610,7 @@ sna_poly_fill_rect_stippled_8x8_blt(DrawablePtr drawable,
+ 	}
+ 
+ 	assert_pixmap_damage(pixmap);
+-	sna->blt_state.fill_bo = 0;
++	blt_done(sna);
+ 	return true;
+ }
+ 
+@@ -13499,6 +13690,7 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
+ 	get_drawable_deltas(drawable, pixmap, &dx, &dy);
+ 	kgem_set_mode(&sna->kgem, KGEM_BLT, bo);
+ 	assert(kgem_bo_can_blt(&sna->kgem, bo));
++	kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 	br00 = 3 << 20;
+ 	br13 = bo->pitch;
+@@ -13543,6 +13735,7 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
+ 						return false;
+ 					_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 				}
++				kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 				assert(sna->kgem.mode == KGEM_BLT);
+ 				b = sna->kgem.batch + sna->kgem.nbatch;
+@@ -13606,6 +13799,7 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
+ 						return false;
+ 					_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 				}
++				kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 				upload = kgem_create_buffer(&sna->kgem,
+ 							    bstride*bh,
+@@ -13736,6 +13930,7 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
+ 							return false;
+ 						_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 					}
++					kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 					assert(sna->kgem.mode == KGEM_BLT);
+ 					b = sna->kgem.batch + sna->kgem.nbatch;
+@@ -13797,6 +13992,7 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
+ 							return false;
+ 						_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 					}
++					kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 					upload = kgem_create_buffer(&sna->kgem,
+ 								    bstride*bh,
+@@ -13927,6 +14123,7 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
+ 								return false;
+ 							_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 						}
++						kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 						assert(sna->kgem.mode == KGEM_BLT);
+ 						b = sna->kgem.batch + sna->kgem.nbatch;
+@@ -13987,6 +14184,7 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
+ 								return false;
+ 							_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 						}
++						kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 						upload = kgem_create_buffer(&sna->kgem,
+ 									    bstride*bh,
+@@ -14064,7 +14262,7 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
+ 		}
+ 	}
+ 
+-	sna->blt_state.fill_bo = 0;
++	blt_done(sna);
+ 	return true;
+ }
+ 
+@@ -14126,6 +14324,7 @@ sna_poly_fill_rect_stippled_n_box__imm(struct sna *sna,
+ 					return; /* XXX fallback? */
+ 				_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 			}
++			kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 			assert(sna->kgem.mode == KGEM_BLT);
+ 			b = sna->kgem.batch + sna->kgem.nbatch;
+@@ -14251,6 +14450,7 @@ sna_poly_fill_rect_stippled_n_box(struct sna *sna,
+ 					return; /* XXX fallback? */
+ 				_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 			}
++			kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 			assert(sna->kgem.mode == KGEM_BLT);
+ 			b = sna->kgem.batch + sna->kgem.nbatch;
+@@ -14414,6 +14614,7 @@ sna_poly_fill_rect_stippled_n_blt__imm(DrawablePtr drawable,
+ 	get_drawable_deltas(drawable, pixmap, &dx, &dy);
+ 	kgem_set_mode(&sna->kgem, KGEM_BLT, bo);
+ 	assert(kgem_bo_can_blt(&sna->kgem, bo));
++	kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 	br00 = XY_MONO_SRC_COPY_IMM | 3 << 20;
+ 	br13 = bo->pitch;
+@@ -14526,7 +14727,7 @@ sna_poly_fill_rect_stippled_n_blt__imm(DrawablePtr drawable,
+ 	}
+ 
+ 	assert_pixmap_damage(pixmap);
+-	sna->blt_state.fill_bo = 0;
++	blt_done(sna);
+ 	return true;
+ }
+ 
+@@ -14559,6 +14760,7 @@ sna_poly_fill_rect_stippled_n_blt(DrawablePtr drawable,
+ 	get_drawable_deltas(drawable, pixmap, &dx, &dy);
+ 	kgem_set_mode(&sna->kgem, KGEM_BLT, bo);
+ 	assert(kgem_bo_can_blt(&sna->kgem, bo));
++	kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 	br00 = XY_MONO_SRC_COPY | 3 << 20;
+ 	br13 = bo->pitch;
+@@ -14673,7 +14875,7 @@ sna_poly_fill_rect_stippled_n_blt(DrawablePtr drawable,
+ 	assert_pixmap_damage(pixmap);
+ 	if (tile)
+ 		kgem_bo_destroy(&sna->kgem, tile);
+-	sna->blt_state.fill_bo = 0;
++	blt_done(sna);
+ 	return true;
+ }
+ 
+@@ -15281,6 +15483,7 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
+ 		}
+ 		_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 	}
++	kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 	DBG(("%s: glyph clip box (%d, %d), (%d, %d)\n",
+ 	     __FUNCTION__,
+@@ -15368,6 +15571,7 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
+ 			if (!kgem_check_batch(&sna->kgem, 3+len)) {
+ 				_kgem_submit(&sna->kgem);
+ 				_kgem_set_mode(&sna->kgem, KGEM_BLT);
++				kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 				DBG(("%s: new batch, glyph clip box (%d, %d), (%d, %d)\n",
+ 				     __FUNCTION__,
+@@ -15479,7 +15683,7 @@ skip:
+ 	}
+ 
+ 	assert_pixmap_damage(pixmap);
+-	sna->blt_state.fill_bo = 0;
++	blt_done(sna);
+ 	return true;
+ }
+ 
+@@ -16002,6 +16206,7 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
+ 		}
+ 		_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 	}
++	kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 	unwind_batch = sna->kgem.nbatch;
+ 	unwind_reloc = sna->kgem.nreloc;
+@@ -16111,6 +16316,7 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
+ 			if (!kgem_check_batch(&sna->kgem, 3+len)) {
+ 				_kgem_submit(&sna->kgem);
+ 				_kgem_set_mode(&sna->kgem, KGEM_BLT);
++				kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 				unwind_batch = sna->kgem.nbatch;
+ 				unwind_reloc = sna->kgem.nreloc;
+@@ -16229,7 +16435,7 @@ skip:
+ 	}
+ 
+ 	assert_pixmap_damage(pixmap);
+-	sna->blt_state.fill_bo = 0;
++	blt_done(sna);
+ 	return true;
+ }
+ 
+@@ -16450,6 +16656,7 @@ sna_push_pixels_solid_blt(GCPtr gc,
+ 
+ 	kgem_set_mode(&sna->kgem, KGEM_BLT, bo);
+ 	assert(kgem_bo_can_blt(&sna->kgem, bo));
++	kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 	/* Region is pre-clipped and translated into pixmap space */
+ 	box = region_rects(region);
+@@ -16471,6 +16678,7 @@ sna_push_pixels_solid_blt(GCPtr gc,
+ 				return false;
+ 			_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 		}
++		kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 		upload = kgem_create_buffer(&sna->kgem,
+ 					    bstride*bh,
+@@ -16564,7 +16772,7 @@ sna_push_pixels_solid_blt(GCPtr gc,
+ 		box++;
+ 	} while (--n);
+ 
+-	sna->blt_state.fill_bo = 0;
++	blt_done(sna);
+ 	return true;
+ }
+ 
+@@ -16754,7 +16962,9 @@ static int sna_create_gc(GCPtr gc)
+ 
+ 	gc->freeCompClip = 0;
+ 	gc->pCompositeClip = 0;
++#if XORG_VERSION_CURRENT < XORG_VERSION_NUMERIC(1,19,99,1,0)
+ 	gc->pRotatedPixmap = 0;
++#endif
+ 
+ 	fb_gc(gc)->bpp = bits_per_pixel(gc->depth);
+ 
+@@ -16789,7 +16999,8 @@ sna_get_image__inplace(PixmapPtr pixmap,
+ 		break;
+ 	}
+ 
+-	if (!kgem_bo_can_map__cpu(&sna->kgem, priv->gpu_bo, FORCE_FULL_SYNC))
++	if ((flags & MOVE_INPLACE_HINT) == 0 &&
++	    !kgem_bo_can_map__cpu(&sna->kgem, priv->gpu_bo, FORCE_FULL_SYNC))
+ 		return false;
+ 
+ 	if (idle && __kgem_bo_is_busy(&sna->kgem, priv->gpu_bo))
+@@ -16801,11 +17012,19 @@ sna_get_image__inplace(PixmapPtr pixmap,
+ 	assert(sna_damage_contains_box(&priv->gpu_damage, &region->extents) == PIXMAN_REGION_IN);
+ 	assert(sna_damage_contains_box(&priv->cpu_damage, &region->extents) == PIXMAN_REGION_OUT);
+ 
+-	src = kgem_bo_map__cpu(&sna->kgem, priv->gpu_bo);
+-	if (src == NULL)
+-		return false;
++	if (kgem_bo_can_map__cpu(&sna->kgem, priv->gpu_bo, FORCE_FULL_SYNC)) {
++		src = kgem_bo_map__cpu(&sna->kgem, priv->gpu_bo);
++		if (src == NULL)
++			return false;
+ 
+-	kgem_bo_sync__cpu_full(&sna->kgem, priv->gpu_bo, FORCE_FULL_SYNC);
++		kgem_bo_sync__cpu_full(&sna->kgem, priv->gpu_bo, FORCE_FULL_SYNC);
++	} else {
++		src = kgem_bo_map__wc(&sna->kgem, priv->gpu_bo);
++		if (src == NULL)
++			return false;
++
++		kgem_bo_sync__gtt(&sna->kgem, priv->gpu_bo);
++	}
+ 
+ 	if (sigtrap_get())
+ 		return false;
+@@ -16833,12 +17052,11 @@ sna_get_image__inplace(PixmapPtr pixmap,
+ 			   region->extents.x2 - region->extents.x1,
+ 			   region->extents.y2 - region->extents.y1);
+ 		if (!priv->shm) {
+-			assert(src == MAP(priv->gpu_bo->map__cpu));
+ 			pixmap->devPrivate.ptr = src;
+ 			pixmap->devKind = priv->gpu_bo->pitch;
+-			priv->mapped = MAPPED_CPU;
++			priv->mapped = src == MAP(priv->gpu_bo->map__cpu) ? MAPPED_CPU : MAPPED_GTT;
+ 			assert_pixmap_map(pixmap, priv);
+-			priv->cpu = true;
++			priv->cpu &= priv->mapped == MAPPED_CPU;
+ 		}
+ 	}
+ 
+@@ -16930,7 +17148,7 @@ sna_get_image__fast(PixmapPtr pixmap,
+ 	if (priv == NULL || priv->gpu_damage == NULL)
+ 		return false;
+ 
+-	if (priv->clear) {
++	if (priv->clear && sigtrap_get() == 0) {
+ 		int w = region->extents.x2 - region->extents.x1;
+ 		int h = region->extents.y2 - region->extents.y1;
+ 		int pitch = PixmapBytePad(w, pixmap->drawable.depth);
+@@ -16939,6 +17157,7 @@ sna_get_image__fast(PixmapPtr pixmap,
+ 		     __FUNCTION__, priv->clear_color));
+ 		assert(DAMAGE_IS_ALL(priv->gpu_damage));
+ 		assert(priv->cpu_damage == NULL);
++		sigtrap_assert_active();
+ 
+ 		if (priv->clear_color == 0 ||
+ 		    pixmap->drawable.bitsPerPixel == 8 ||
+@@ -16955,6 +17174,7 @@ sna_get_image__fast(PixmapPtr pixmap,
+ 				    priv->clear_color);
+ 		}
+ 
++		sigtrap_put();
+ 		return true;
+ 	}
+ 
+@@ -17001,8 +17221,7 @@ sna_get_image(DrawablePtr drawable,
+ 	if (ACCEL_GET_IMAGE &&
+ 	    !FORCE_FALLBACK &&
+ 	    format == ZPixmap &&
+-	    drawable->bitsPerPixel >= 8 &&
+-	    PM_IS_SOLID(drawable, mask)) {
++	    drawable->bitsPerPixel >= 8) {
+ 		PixmapPtr pixmap = get_drawable_pixmap(drawable);
+ 		int16_t dx, dy;
+ 
+@@ -17014,7 +17233,7 @@ sna_get_image(DrawablePtr drawable,
+ 		region.data = NULL;
+ 
+ 		if (sna_get_image__fast(pixmap, &region, dst, flags))
+-			return;
++			goto apply_planemask;
+ 
+ 		if (!sna_drawable_move_region_to_cpu(&pixmap->drawable,
+ 						     &region, flags))
+@@ -17032,6 +17251,16 @@ sna_get_image(DrawablePtr drawable,
+ 				   region.extents.x1, region.extents.y1, 0, 0, w, h);
+ 			sigtrap_put();
+ 		}
++
++apply_planemask:
++		if (!PM_IS_SOLID(drawable, mask)) {
++			FbStip pm = fbReplicatePixel(mask, drawable->bitsPerPixel);
++			FbStip *d = (FbStip *)dst;
++			int i, n = PixmapBytePad(w, drawable->depth) / sizeof(FbStip) * h;
++
++			for (i = 0; i < n; i++)
++				d[i] &= pm;
++		}
+ 	} else {
+ 		region.extents.x1 = x + drawable->x;
+ 		region.extents.y1 = y + drawable->y;
+@@ -17162,17 +17391,19 @@ void sna_accel_flush(struct sna *sna)
+ 				__sna_free_pixmap(sna, priv->pixmap, priv);
+ 			}
+ 		} else {
++			unsigned hints;
+ 			DBG(("%s: flushing DRI pixmap=%ld\n", __FUNCTION__,
+ 			     priv->pixmap->drawable.serialNumber));
+ 			assert(priv->flush);
+-			if (sna_pixmap_move_to_gpu(priv->pixmap,
+-						   MOVE_READ | __MOVE_FORCE)) {
+-				if (priv->flush & IS_CLIPPED) {
++			hints = MOVE_READ | __MOVE_FORCE;
++			if (priv->flush & FLUSH_WRITE)
++				hints |= MOVE_WRITE;
++			if (sna_pixmap_move_to_gpu(priv->pixmap, hints)) {
++				if (priv->flush & FLUSH_WRITE) {
+ 					kgem_bo_unclean(&sna->kgem, priv->gpu_bo);
+ 					sna_damage_all(&priv->gpu_damage, priv->pixmap);
+ 					assert(priv->cpu_damage == NULL);
+-					priv->clear = false;
+-					priv->cpu = false;
++					assert(priv->clear == false);
+ 				}
+ 			}
+ 		}
+@@ -17184,10 +17415,46 @@ void sna_accel_flush(struct sna *sna)
+ }
+ 
+ static void
+-sna_accel_flush_callback(CallbackListPtr *list,
+-			 pointer user_data, pointer call_data)
++sna_shm_flush_callback(CallbackListPtr *list,
++		       pointer user_data, pointer call_data)
+ {
+-	sna_accel_flush(user_data);
++	struct sna *sna = user_data;
++
++	if (!sna->needs_shm_flush)
++		return;
++
++	sna_accel_flush(sna);
++	sna->needs_shm_flush = false;
++}
++
++static void
++sna_flush_callback(CallbackListPtr *list, pointer user_data, pointer call_data)
++{
++	struct sna *sna = user_data;
++
++	if (!sna->needs_dri_flush)
++		return;
++
++	sna_accel_flush(sna);
++	sna->needs_dri_flush = false;
++}
++
++static void
++sna_event_callback(CallbackListPtr *list, pointer user_data, pointer call_data)
++{
++	EventInfoRec *eventinfo = call_data;
++	struct sna *sna = user_data;
++	int i;
++
++	if (sna->needs_dri_flush)
++		return;
++
++	for (i = 0; i < eventinfo->count; i++) {
++		if (eventinfo->events[i].u.u.type == sna->damage_event) {
++			sna->needs_dri_flush = true;
++			return;
++		}
++	}
+ }
+ 
+ static struct sna_pixmap *sna_accel_scanout(struct sna *sna)
+@@ -17199,6 +17466,7 @@ static struct sna_pixmap *sna_accel_scanout(struct sna *sna)
+ 
+ 	assert(sna->vblank_interval);
+ 	assert(sna->front);
++	assert(!sna->mode.hidden);
+ 
+ 	priv = sna_pixmap(sna->front);
+ 	if (priv->gpu_bo == NULL)
+@@ -17217,7 +17485,7 @@ static void sna_accel_disarm_timer(struct sna *sna, int id)
+ static bool has_offload_slaves(struct sna *sna)
+ {
+ #if HAS_PIXMAP_SHARING
+-	ScreenPtr screen = sna->scrn->pScreen;
++	ScreenPtr screen = to_screen_from_sna(sna);
+ 	PixmapDirtyUpdatePtr dirty;
+ 
+ 	xorg_list_for_each_entry(dirty, &screen->pixmap_dirty_list, ent) {
+@@ -17231,11 +17499,14 @@ static bool has_offload_slaves(struct sna *sna)
+ 
+ static bool has_shadow(struct sna *sna)
+ {
+-	DamagePtr damage = sna->mode.shadow_damage;
++	DamagePtr damage;
+ 
+-	if (damage == NULL)
++	if (!sna->mode.shadow_enabled)
+ 		return false;
+ 
++	damage = sna->mode.shadow_damage;
++	assert(damage);
++
+ 	DBG(("%s: has pending damage? %d, outstanding flips: %d\n",
+ 	     __FUNCTION__,
+ 	     RegionNotEmpty(DamageRegion(damage)),
+@@ -17365,9 +17636,8 @@ static bool sna_accel_do_expire(struct sna *sna)
+ static void sna_accel_post_damage(struct sna *sna)
+ {
+ #if HAS_PIXMAP_SHARING
+-	ScreenPtr screen = sna->scrn->pScreen;
++	ScreenPtr screen = to_screen_from_sna(sna);
+ 	PixmapDirtyUpdatePtr dirty;
+-	bool flush = false;
+ 
+ 	xorg_list_for_each_entry(dirty, &screen->pixmap_dirty_list, ent) {
+ 		RegionRec region, *damage;
+@@ -17376,8 +17646,6 @@ static void sna_accel_post_damage(struct sna *sna)
+ 		int16_t dx, dy;
+ 		int n;
+ 
+-		assert(dirty->src == sna->front);
+-
+ 		damage = DamageRegion(dirty->damage);
+ 		if (RegionNil(damage))
+ 			continue;
+@@ -17477,7 +17745,14 @@ fallback:
+ 						    box, n, COPY_LAST))
+ 				goto fallback;
+ 
+-			flush = true;
++			/* Before signalling the slave via ProcessPending,
++			 * ensure not only the batch is submitted as the
++			 * slave may be using the Damage callback to perform
++			 * its copy, but also that the memory must be coherent
++			 * - we need to treat it as uncached for the PCI slave
++			 * will bypass LLC.
++			 */
++			kgem_bo_sync__gtt(&sna->kgem, __sna_pixmap_get_bo(dst));
+ 		}
+ 
+ 		DamageRegionProcessPending(&dirty->slave_dst->drawable);
+@@ -17485,8 +17760,6 @@ skip:
+ 		RegionUninit(&region);
+ 		DamageEmpty(dirty->damage);
+ 	}
+-	if (flush)
+-		kgem_submit(&sna->kgem);
+ #endif
+ }
+ 
+@@ -17689,6 +17962,7 @@ sna_set_screen_pixmap(PixmapPtr pixmap)
+ static Bool
+ sna_create_window(WindowPtr win)
+ {
++	DBG(("%s: window=%ld\n", __FUNCTION__, win->drawable.id));
+ 	sna_set_window_pixmap(win, win->drawable.pScreen->devPrivate);
+ 	return TRUE;
+ }
+@@ -17714,6 +17988,7 @@ sna_unmap_window(WindowPtr win)
+ static Bool
+ sna_destroy_window(WindowPtr win)
+ {
++	DBG(("%s: window=%ld\n", __FUNCTION__, win->drawable.id));
+ 	sna_video_destroy_window(win);
+ 	sna_dri2_destroy_window(win);
+ 	return TRUE;
+@@ -17790,20 +18065,34 @@ static bool sna_option_accel_none(struct sna *sna)
+ 	if (wedged(sna))
+ 		return true;
+ 
+-	if (xf86ReturnOptValBool(sna->Options, OPTION_ACCEL_DISABLE, FALSE))
++	if (!xf86ReturnOptValBool(sna->Options, OPTION_ACCEL_ENABLE, TRUE))
+ 		return true;
+ 
++	if (sna->kgem.gen >= 0120)
++		return true;
++
++	if (!intel_option_cast_to_bool(sna->Options,
++				       OPTION_ACCEL_METHOD,
++				       !IS_DEFAULT_ACCEL_METHOD(NOACCEL)))
++		return false;
++
++#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,7,99,901,0)
+ 	s = xf86GetOptValString(sna->Options, OPTION_ACCEL_METHOD);
+ 	if (s == NULL)
+ 		return IS_DEFAULT_ACCEL_METHOD(NOACCEL);
+ 
+ 	return strcasecmp(s, "none") == 0;
++#else
++	return IS_DEFAULT_ACCEL_METHOD(NOACCEL);
++#endif
+ }
+ 
+ static bool sna_option_accel_blt(struct sna *sna)
+ {
+ 	const char *s;
+ 
++	assert(sna->kgem.gen < 0120);
++
+ 	s = xf86GetOptValString(sna->Options, OPTION_ACCEL_METHOD);
+ 	if (s == NULL)
+ 		return false;
+@@ -17811,6 +18100,13 @@ static bool sna_option_accel_blt(struct sna *sna)
+ 	return strcasecmp(s, "blt") == 0;
+ }
+ 
++#if HAVE_NOTIFY_FD
++static void sna_accel_notify(int fd, int ready, void *data)
++{
++	sna_mode_wakeup(data);
++}
++#endif
++
+ bool sna_accel_init(ScreenPtr screen, struct sna *sna)
+ {
+ 	const char *backend;
+@@ -17822,7 +18118,7 @@ bool sna_accel_init(ScreenPtr screen, struct sna *sna)
+ 	list_init(&sna->flush_pixmaps);
+ 	list_init(&sna->active_pixmaps);
+ 
+-	AddGeneralSocket(sna->kgem.fd);
++	SetNotifyFd(sna->kgem.fd, sna_accel_notify, X_NOTIFY_READ, sna);
+ 
+ #ifdef DEBUG_MEMORY
+ 	sna->timer_expire[DEBUG_MEMORY_TIMER] = GetTimeInMillis()+ 10 * 1000;
+@@ -17892,21 +18188,23 @@ bool sna_accel_init(ScreenPtr screen, struct sna *sna)
+ 		backend = "disabled";
+ 		sna->kgem.wedged = true;
+ 		sna_render_mark_wedged(sna);
+-	} else if (sna_option_accel_blt(sna) || sna->info->gen >= 0110)
++	} else if (sna_option_accel_blt(sna))
+ 		(void)backend;
+-	else if (sna->info->gen >= 0100)
++	else if (sna->kgem.gen >= 0110)
++		backend = gen9_render_init(sna, backend);
++	else if (sna->kgem.gen >= 0100)
+ 		backend = gen8_render_init(sna, backend);
+-	else if (sna->info->gen >= 070)
++	else if (sna->kgem.gen >= 070)
+ 		backend = gen7_render_init(sna, backend);
+-	else if (sna->info->gen >= 060)
++	else if (sna->kgem.gen >= 060)
+ 		backend = gen6_render_init(sna, backend);
+-	else if (sna->info->gen >= 050)
++	else if (sna->kgem.gen >= 050)
+ 		backend = gen5_render_init(sna, backend);
+-	else if (sna->info->gen >= 040)
++	else if (sna->kgem.gen >= 040)
+ 		backend = gen4_render_init(sna, backend);
+-	else if (sna->info->gen >= 030)
++	else if (sna->kgem.gen >= 030)
+ 		backend = gen3_render_init(sna, backend);
+-	else if (sna->info->gen >= 020)
++	else if (sna->kgem.gen >= 020)
+ 		backend = gen2_render_init(sna, backend);
+ 
+ 	DBG(("%s(backend=%s, prefer_gpu=%x)\n",
+@@ -17924,8 +18222,14 @@ bool sna_accel_init(ScreenPtr screen, struct sna *sna)
+ 
+ void sna_accel_create(struct sna *sna)
+ {
++	ExtensionEntry *damage;
++
+ 	DBG(("%s\n", __FUNCTION__));
+ 
++	damage = CheckExtension("DAMAGE");
++	if (damage)
++		sna->damage_event = damage->eventBase + XDamageNotify;
++
+ 	if (!sna_glyphs_create(sna))
+ 		goto fail;
+ 
+@@ -17943,27 +18247,59 @@ fail:
+ 	no_render_init(sna);
+ }
+ 
+-void sna_accel_watch_flush(struct sna *sna, int enable)
++static void sna_shm_watch_flush(struct sna *sna, int enable)
+ {
+ 	DBG(("%s: enable=%d\n", __FUNCTION__, enable));
+ 	assert(enable);
+ 
+-	if (sna->watch_flush == 0) {
++	if (sna->watch_shm_flush == 0) {
++		DBG(("%s: installing shm watchers\n", __FUNCTION__));
++		assert(enable > 0);
++
++		if (!AddCallback(&FlushCallback, sna_shm_flush_callback, sna))
++			return;
++
++		sna->watch_shm_flush++;
++	}
++
++	sna->watch_shm_flush += enable;
++}
++
++void sna_watch_flush(struct sna *sna, int enable)
++{
++	DBG(("%s: enable=%d\n", __FUNCTION__, enable));
++	assert(enable);
++
++	if (sna->watch_dri_flush == 0) {
++		int err = 0;
++
+ 		DBG(("%s: installing watchers\n", __FUNCTION__));
+ 		assert(enable > 0);
+-		if (!AddCallback(&FlushCallback, sna_accel_flush_callback, sna)) {
++
++		if (!sna->damage_event)
++			return;
++
++		if (!AddCallback(&EventCallback, sna_event_callback, sna))
++			err = 1;
++
++		if (!AddCallback(&FlushCallback, sna_flush_callback, sna))
++			err = 1;
++
++		if (err) {
+ 			xf86DrvMsg(sna->scrn->scrnIndex, X_Error,
+ 				   "Failed to attach ourselves to the flush callbacks, expect missing synchronisation with DRI clients (e.g a compositor)\n");
+ 		}
+-		sna->watch_flush++;
++
++		sna->watch_dri_flush++;
+ 	}
+ 
+-	sna->watch_flush += enable;
++	sna->watch_dri_flush += enable;
+ }
+ 
+ void sna_accel_leave(struct sna *sna)
+ {
+ 	DBG(("%s\n", __FUNCTION__));
++	sna_scanout_flush(sna);
+ 
+ 	/* as root we always have permission to render */
+ 	if (geteuid() == 0)
+@@ -17997,13 +18333,15 @@ void sna_accel_close(struct sna *sna)
+ 
+ 	sna_pixmap_expire(sna);
+ 
+-	DeleteCallback(&FlushCallback, sna_accel_flush_callback, sna);
+-	RemoveGeneralSocket(sna->kgem.fd);
++	DeleteCallback(&FlushCallback, sna_shm_flush_callback, sna);
++	DeleteCallback(&FlushCallback, sna_flush_callback, sna);
++	DeleteCallback(&EventCallback, sna_event_callback, sna);
++	RemoveNotifyFd(sna->kgem.fd);
+ 
+ 	kgem_cleanup_cache(&sna->kgem);
+ }
+ 
+-void sna_accel_block_handler(struct sna *sna, struct timeval **tv)
++void sna_accel_block(struct sna *sna, struct timeval **tv)
+ {
+ 	sigtrap_assert_inactive();
+ 
+@@ -18044,10 +18382,17 @@ restart:
+ 	if (sna_accel_do_debug_memory(sna))
+ 		sna_accel_debug_memory(sna);
+ 
+-	if (sna->watch_flush == 1) {
+-		DBG(("%s: removing watchers\n", __FUNCTION__));
+-		DeleteCallback(&FlushCallback, sna_accel_flush_callback, sna);
+-		sna->watch_flush = 0;
++	if (sna->watch_shm_flush == 1) {
++		DBG(("%s: removing shm watchers\n", __FUNCTION__));
++		DeleteCallback(&FlushCallback, sna_shm_flush_callback, sna);
++		sna->watch_shm_flush = 0;
++	}
++
++	if (sna->watch_dri_flush == 1) {
++		DBG(("%s: removing dri watchers\n", __FUNCTION__));
++		DeleteCallback(&FlushCallback, sna_flush_callback, sna);
++		DeleteCallback(&EventCallback, sna_event_callback, sna);
++		sna->watch_dri_flush = 0;
+ 	}
+ 
+ 	if (sna->timer_active & 1) {
+@@ -18083,22 +18428,6 @@ set_tv:
+ 	}
+ }
+ 
+-void sna_accel_wakeup_handler(struct sna *sna)
+-{
+-	DBG(("%s: nbatch=%d, need_retire=%d, need_purge=%d\n", __FUNCTION__,
+-	     sna->kgem.nbatch, sna->kgem.need_retire, sna->kgem.need_purge));
+-
+-	if (!sna->kgem.nbatch)
+-		return;
+-
+-	if (kgem_is_idle(&sna->kgem)) {
+-		DBG(("%s: GPU idle, flushing\n", __FUNCTION__));
+-		_kgem_submit(&sna->kgem);
+-	}
+-
+-	sigtrap_assert_inactive();
+-}
+-
+ void sna_accel_free(struct sna *sna)
+ {
+ 	DBG(("%s\n", __FUNCTION__));
+diff --git a/src/sna/sna_acpi.c b/src/sna/sna_acpi.c
+index dcc0287b..643d04af 100644
+--- a/src/sna/sna_acpi.c
++++ b/src/sna/sna_acpi.c
+@@ -92,7 +92,7 @@ void _sna_acpi_wakeup(struct sna *sna)
+ 		DBG(("%s: error [%d], detaching from acpid\n", __FUNCTION__, n));
+ 
+ 		/* XXX reattach later? */
+-		RemoveGeneralSocket(sna->acpi.fd);
++		RemoveNotifyFd(sna->acpi.fd);
+ 		sna_acpi_fini(sna);
+ 		return;
+ 	}
+@@ -136,6 +136,13 @@ void _sna_acpi_wakeup(struct sna *sna)
+ 	} while (n);
+ }
+ 
++#if HAVE_NOTIFY_FD
++static void sna_acpi_notify(int fd, int read, void *data)
++{
++	_sna_acpi_wakeup(data);
++}
++#endif
++
+ static int read_power_state(const char *path)
+ {
+ 	DIR *dir;
+@@ -200,7 +207,7 @@ void sna_acpi_init(struct sna *sna)
+ 
+ 	DBG(("%s: attaching to acpid\n", __FUNCTION__));
+ 
+-	AddGeneralSocket(sna->acpi.fd);
++	SetNotifyFd(sna->acpi.fd, sna_acpi_notify, X_NOTIFY_READ, sna);
+ 	sna->acpi.remain = sizeof(sna->acpi.event) - 1;
+ 	sna->acpi.offset = 0;
+ 
+diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
+index de8f6ec3..ddd2586d 100644
+--- a/src/sna/sna_blt.c
++++ b/src/sna/sna_blt.c
+@@ -86,6 +86,11 @@ static const uint8_t fill_ROP[] = {
+ 	ROP_1
+ };
+ 
++static void sig_done(struct sna *sna, const struct sna_composite_op *op)
++{
++	sigtrap_put();
++}
++
+ static void nop_done(struct sna *sna, const struct sna_composite_op *op)
+ {
+ 	assert(sna->kgem.nbatch <= KGEM_BATCH_SIZE(&sna->kgem));
+@@ -129,7 +134,6 @@ static bool sna_blt_fill_init(struct sna *sna,
+ 	struct kgem *kgem = &sna->kgem;
+ 
+ 	assert(kgem_bo_can_blt (kgem, bo));
+-	assert(bo->tiling != I915_TILING_Y);
+ 	blt->bo[0] = bo;
+ 
+ 	blt->br13 = bo->pitch;
+@@ -183,6 +187,7 @@ static bool sna_blt_fill_init(struct sna *sna,
+ 				return false;
+ 			_kgem_set_mode(kgem, KGEM_BLT);
+ 		}
++		kgem_bcs_set_tiling(kgem, NULL, bo);
+ 
+ 		assert(sna->kgem.mode == KGEM_BLT);
+ 		b = kgem->batch + kgem->nbatch;
+@@ -237,17 +242,13 @@ static bool sna_blt_fill_init(struct sna *sna,
+ 	return true;
+ }
+ 
+-noinline static void sna_blt_fill_begin(struct sna *sna,
+-					const struct sna_blt_state *blt)
++noinline static void __sna_blt_fill_begin(struct sna *sna,
++					  const struct sna_blt_state *blt)
+ {
+ 	struct kgem *kgem = &sna->kgem;
+ 	uint32_t *b;
+ 
+-	if (kgem->nreloc) {
+-		_kgem_submit(kgem);
+-		_kgem_set_mode(kgem, KGEM_BLT);
+-		assert(kgem->nbatch == 0);
+-	}
++	kgem_bcs_set_tiling(&sna->kgem, NULL, blt->bo[0]);
+ 
+ 	assert(kgem->mode == KGEM_BLT);
+ 	b = kgem->batch + kgem->nbatch;
+@@ -293,6 +294,21 @@ noinline static void sna_blt_fill_begin(struct sna *sna,
+ 	}
+ }
+ 
++inline static void sna_blt_fill_begin(struct sna *sna,
++				      const struct sna_blt_state *blt)
++{
++	struct kgem *kgem = &sna->kgem;
++
++	if (kgem->nreloc) {
++		_kgem_submit(kgem);
++		_kgem_set_mode(kgem, KGEM_BLT);
++		kgem_bcs_set_tiling(kgem, NULL, blt->bo[0]);
++		assert(kgem->nbatch == 0);
++	}
++
++	__sna_blt_fill_begin(sna, blt);
++}
++
+ inline static void sna_blt_fill_one(struct sna *sna,
+ 				    const struct sna_blt_state *blt,
+ 				    int16_t x, int16_t y,
+@@ -330,8 +346,8 @@ static bool sna_blt_copy_init(struct sna *sna,
+ {
+ 	struct kgem *kgem = &sna->kgem;
+ 
+-	assert(kgem_bo_can_blt (kgem, src));
+-	assert(kgem_bo_can_blt (kgem, dst));
++	assert(kgem_bo_can_blt(kgem, src));
++	assert(kgem_bo_can_blt(kgem, dst));
+ 
+ 	blt->bo[0] = src;
+ 	blt->bo[1] = dst;
+@@ -370,6 +386,7 @@ static bool sna_blt_copy_init(struct sna *sna,
+ 			return false;
+ 		_kgem_set_mode(kgem, KGEM_BLT);
+ 	}
++	kgem_bcs_set_tiling(&sna->kgem, src, dst);
+ 
+ 	sna->blt_state.fill_bo = 0;
+ 	return true;
+@@ -424,6 +441,7 @@ static bool sna_blt_alpha_fixup_init(struct sna *sna,
+ 			return false;
+ 		_kgem_set_mode(kgem, KGEM_BLT);
+ 	}
++	kgem_bcs_set_tiling(&sna->kgem, src, dst);
+ 
+ 	sna->blt_state.fill_bo = 0;
+ 	return true;
+@@ -454,6 +472,7 @@ static void sna_blt_alpha_fixup_one(struct sna *sna,
+ 	    !kgem_check_reloc(kgem, 2)) {
+ 		_kgem_submit(kgem);
+ 		_kgem_set_mode(kgem, KGEM_BLT);
++		kgem_bcs_set_tiling(&sna->kgem, blt->bo[0], blt->bo[1]);
+ 	}
+ 
+ 	assert(sna->kgem.mode == KGEM_BLT);
+@@ -582,6 +601,7 @@ static void sna_blt_copy_one(struct sna *sna,
+ 	    !kgem_check_reloc(kgem, 2)) {
+ 		_kgem_submit(kgem);
+ 		_kgem_set_mode(kgem, KGEM_BLT);
++		kgem_bcs_set_tiling(&sna->kgem, blt->bo[0], blt->bo[1]);
+ 	}
+ 
+ 	assert(sna->kgem.mode == KGEM_BLT);
+@@ -912,8 +932,27 @@ sna_composite_mask_is_opaque(PicturePtr mask)
+ 		return is_solid(mask) && is_white(mask);
+ 	else if (!PICT_FORMAT_A(mask->format))
+ 		return true;
+-	else
+-		return is_solid(mask) && is_opaque_solid(mask);
++	else if (mask->pSourcePict) {
++		PictSolidFill *fill = (PictSolidFill *) mask->pSourcePict;
++		return (fill->color >> 24) == 0xff;
++	} else {
++		struct sna_pixmap *priv;
++		assert(mask->pDrawable);
++
++		if (mask->pDrawable->width  == 1 &&
++		    mask->pDrawable->height == 1 &&
++		    mask->repeat)
++			return pixel_is_opaque(get_pixel(mask), mask->format);
++
++		if (mask->transform)
++			return false;
++
++		priv = sna_pixmap_from_drawable(mask->pDrawable);
++		if (priv == NULL || !priv->clear)
++			return false;
++
++		return pixel_is_opaque(priv->clear_color, mask->format);
++	}
+ }
+ 
+ fastcall
+@@ -971,6 +1010,7 @@ static void blt_composite_fill__cpu(struct sna *sna,
+ 
+ 	assert(op->dst.pixmap->devPrivate.ptr);
+ 	assert(op->dst.pixmap->devKind);
++	sigtrap_assert_active();
+ 	pixman_fill(op->dst.pixmap->devPrivate.ptr,
+ 		    op->dst.pixmap->devKind / sizeof(uint32_t),
+ 		    op->dst.pixmap->drawable.bitsPerPixel,
+@@ -990,6 +1030,7 @@ blt_composite_fill_box_no_offset__cpu(struct sna *sna,
+ 
+ 	assert(op->dst.pixmap->devPrivate.ptr);
+ 	assert(op->dst.pixmap->devKind);
++	sigtrap_assert_active();
+ 	pixman_fill(op->dst.pixmap->devPrivate.ptr,
+ 		    op->dst.pixmap->devKind / sizeof(uint32_t),
+ 		    op->dst.pixmap->drawable.bitsPerPixel,
+@@ -1010,6 +1051,7 @@ blt_composite_fill_boxes_no_offset__cpu(struct sna *sna,
+ 
+ 		assert(op->dst.pixmap->devPrivate.ptr);
+ 		assert(op->dst.pixmap->devKind);
++		sigtrap_assert_active();
+ 		pixman_fill(op->dst.pixmap->devPrivate.ptr,
+ 			    op->dst.pixmap->devKind / sizeof(uint32_t),
+ 			    op->dst.pixmap->drawable.bitsPerPixel,
+@@ -1031,6 +1073,7 @@ blt_composite_fill_box__cpu(struct sna *sna,
+ 
+ 	assert(op->dst.pixmap->devPrivate.ptr);
+ 	assert(op->dst.pixmap->devKind);
++	sigtrap_assert_active();
+ 	pixman_fill(op->dst.pixmap->devPrivate.ptr,
+ 		    op->dst.pixmap->devKind / sizeof(uint32_t),
+ 		    op->dst.pixmap->drawable.bitsPerPixel,
+@@ -1052,6 +1095,7 @@ blt_composite_fill_boxes__cpu(struct sna *sna,
+ 
+ 		assert(op->dst.pixmap->devPrivate.ptr);
+ 		assert(op->dst.pixmap->devKind);
++		sigtrap_assert_active();
+ 		pixman_fill(op->dst.pixmap->devPrivate.ptr,
+ 			    op->dst.pixmap->devKind / sizeof(uint32_t),
+ 			    op->dst.pixmap->drawable.bitsPerPixel,
+@@ -1159,12 +1203,15 @@ static inline void _sna_blt_maybe_clear(const struct sna_composite_op *op, const
+ 	    box->y2 - box->y1 >= op->dst.height) {
+ 		struct sna_pixmap *priv = sna_pixmap(op->dst.pixmap);
+ 		if (op->dst.bo == priv->gpu_bo) {
++			sna_damage_all(&priv->gpu_damage, op->dst.pixmap);
++			sna_damage_destroy(&priv->cpu_damage);
+ 			priv->clear = true;
+ 			priv->clear_color = op->u.blt.pixel;
+ 			DBG(("%s: pixmap=%ld marking clear [%08x]\n",
+ 			     __FUNCTION__,
+ 			     op->dst.pixmap->drawable.serialNumber,
+ 			     op->u.blt.pixel));
++			((struct sna_composite_op *)op)->damage = NULL;
+ 		}
+ 	}
+ }
+@@ -1404,6 +1451,7 @@ begin_blt(struct sna *sna,
+ 			return false;
+ 
+ 		_kgem_set_mode(&sna->kgem, KGEM_BLT);
++		kgem_bcs_set_tiling(&sna->kgem, NULL, op->dst.bo);
+ 	}
+ 
+ 	return true;
+@@ -1429,6 +1477,7 @@ prepare_blt_clear(struct sna *sna,
+ 	DBG(("%s\n", __FUNCTION__));
+ 
+ 	if (op->dst.bo == NULL) {
++		op->u.blt.pixel = 0;
+ 		op->blt   = blt_composite_fill__cpu;
+ 		if (op->dst.x|op->dst.y) {
+ 			op->box   = blt_composite_fill_box__cpu;
+@@ -1439,9 +1488,8 @@ prepare_blt_clear(struct sna *sna,
+ 			op->boxes = blt_composite_fill_boxes_no_offset__cpu;
+ 			op->thread_boxes = blt_composite_fill_boxes_no_offset__cpu;
+ 		}
+-		op->done  = nop_done;
+-		op->u.blt.pixel = 0;
+-		return true;
++		op->done = sig_done;
++		return sigtrap_get() == 0;
+ 	}
+ 
+ 	op->blt = blt_composite_fill;
+@@ -1484,8 +1532,8 @@ prepare_blt_fill(struct sna *sna,
+ 			op->boxes = blt_composite_fill_boxes_no_offset__cpu;
+ 			op->thread_boxes = blt_composite_fill_boxes_no_offset__cpu;
+ 		}
+-		op->done = nop_done;
+-		return true;
++		op->done = sig_done;
++		return sigtrap_get() == 0;
+ 	}
+ 
+ 	op->blt = blt_composite_fill;
+@@ -1668,6 +1716,7 @@ static void blt_composite_copy_boxes__thread(struct sna *sna,
+ 
+ 			_kgem_submit(kgem);
+ 			_kgem_set_mode(kgem, KGEM_BLT);
++			kgem_bcs_set_tiling(&sna->kgem, src_bo, dst_bo);
+ 		} while (1);
+ 	} else {
+ 		do {
+@@ -1724,6 +1773,7 @@ static void blt_composite_copy_boxes__thread(struct sna *sna,
+ 
+ 			_kgem_submit(kgem);
+ 			_kgem_set_mode(kgem, KGEM_BLT);
++			kgem_bcs_set_tiling(&sna->kgem, src_bo, dst_bo);
+ 		} while (1);
+ 	}
+ 	sna_vertex_unlock(&sna->render);
+@@ -1806,6 +1856,7 @@ static void blt_composite_copy_boxes__thread64(struct sna *sna,
+ 
+ 			_kgem_submit(kgem);
+ 			_kgem_set_mode(kgem, KGEM_BLT);
++			kgem_bcs_set_tiling(&sna->kgem, src_bo, dst_bo);
+ 		} while (1);
+ 	} else {
+ 		do {
+@@ -1864,6 +1915,7 @@ static void blt_composite_copy_boxes__thread64(struct sna *sna,
+ 
+ 			_kgem_submit(kgem);
+ 			_kgem_set_mode(kgem, KGEM_BLT);
++			kgem_bcs_set_tiling(&sna->kgem, src_bo, dst_bo);
+ 		} while (1);
+ 	}
+ 	sna_vertex_unlock(&sna->render);
+@@ -1973,6 +2025,7 @@ prepare_blt_copy(struct sna *sna,
+ 		}
+ 		_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 	}
++	kgem_bcs_set_tiling(&sna->kgem, bo, op->dst.bo);
+ 
+ 	DBG(("%s\n", __FUNCTION__));
+ 
+@@ -2396,6 +2449,9 @@ prepare_blt_put(struct sna *sna,
+ 			op->box   = blt_put_composite_box;
+ 			op->boxes = blt_put_composite_boxes;
+ 		}
++
++		op->done = nop_done;
++		return true;
+ 	} else {
+ 		if (alpha_fixup) {
+ 			op->u.blt.pixel = alpha_fixup;
+@@ -2407,10 +2463,10 @@ prepare_blt_put(struct sna *sna,
+ 			op->box   = blt_put_composite_box__cpu;
+ 			op->boxes = blt_put_composite_boxes__cpu;
+ 		}
+-	}
+-	op->done = nop_done;
+ 
+-	return true;
++		op->done = sig_done;
++		return sigtrap_get() == 0;
++	}
+ }
+ 
+ static bool
+@@ -2544,6 +2600,7 @@ sna_blt_composite(struct sna *sna,
+ clear:
+ 		if (was_clear && sna_pixmap(tmp->dst.pixmap)->clear_color == 0) {
+ 			sna_pixmap(tmp->dst.pixmap)->clear = true;
++nop:
+ 			return prepare_blt_nop(sna, tmp);
+ 		}
+ 
+@@ -2559,6 +2616,7 @@ clear:
+ 		}
+ 		tmp->dst.bo = sna_drawable_use_bo(dst->pDrawable, hint,
+ 						  &dst_box, &tmp->damage);
++		assert(!tmp->damage || !DAMAGE_IS_ALL(*tmp->damage));
+ 		if (tmp->dst.bo) {
+ 			if (!kgem_bo_can_blt(&sna->kgem, tmp->dst.bo)) {
+ 				DBG(("%s: can not blit to dst, tiling? %d, pitch? %d\n",
+@@ -2567,6 +2625,8 @@ clear:
+ 			}
+ 			if (hint & REPLACES)
+ 				kgem_bo_undo(&sna->kgem, tmp->dst.bo);
++			if (flags & COMPOSITE_UPLOAD)
++				return false;
+ 		} else {
+ 			RegionRec region;
+ 
+@@ -2590,32 +2650,40 @@ clear:
+ 		}
+ 		if (op == PictOpOver && is_opaque_solid(src))
+ 			op = PictOpSrc;
+-		if (op == PictOpAdd && is_white(src))
++		if (op == PictOpAdd &&
++		    PICT_FORMAT_RGB(src->format) == PICT_FORMAT_RGB(dst->format) &&
++		    is_white(src))
+ 			op = PictOpSrc;
+ 		if (was_clear && (op == PictOpAdd || op == PictOpOver)) {
+ 			if (sna_pixmap(tmp->dst.pixmap)->clear_color == 0)
+ 				op = PictOpSrc;
+ 			if (op == PictOpOver) {
++				unsigned dst_color = solid_color(dst->format, sna_pixmap(tmp->dst.pixmap)->clear_color);
+ 				color = over(get_solid_color(src, PICT_a8r8g8b8),
+-					     color_convert(sna_pixmap(tmp->dst.pixmap)->clear_color,
+-							   dst->format, PICT_a8r8g8b8));
++					     dst_color);
+ 				op = PictOpSrc;
+ 				DBG(("%s: precomputing solid OVER (%08x, %08x) -> %08x\n",
+ 				     __FUNCTION__, get_solid_color(src, PICT_a8r8g8b8),
+-				     color_convert(sna_pixmap(tmp->dst.pixmap)->clear_color,
+-						   dst->format, PICT_a8r8g8b8),
++				     solid_color(dst->format, sna_pixmap(tmp->dst.pixmap)->clear_color),
+ 				     color));
++				if (color == dst_color)
++					goto nop;
++				else
++					goto fill;
+ 			}
+ 			if (op == PictOpAdd) {
++				unsigned dst_color = solid_color(dst->format, sna_pixmap(tmp->dst.pixmap)->clear_color);
+ 				color = add(get_solid_color(src, PICT_a8r8g8b8),
+-					    color_convert(sna_pixmap(tmp->dst.pixmap)->clear_color,
+-							  dst->format, PICT_a8r8g8b8));
++					    dst_color);
+ 				op = PictOpSrc;
+ 				DBG(("%s: precomputing solid ADD (%08x, %08x) -> %08x\n",
+ 				     __FUNCTION__, get_solid_color(src, PICT_a8r8g8b8),
+-				     color_convert(sna_pixmap(tmp->dst.pixmap)->clear_color,
+-						   dst->format, PICT_a8r8g8b8),
++				     solid_color(dst->format, sna_pixmap(tmp->dst.pixmap)->clear_color),
+ 				     color));
++				if (color == dst_color)
++					goto nop;
++				else
++					goto fill;
+ 			}
+ 		}
+ 		if (op == PictOpOutReverse && is_opaque_solid(src))
+@@ -2649,6 +2717,7 @@ fill:
+ 		}
+ 		tmp->dst.bo = sna_drawable_use_bo(dst->pDrawable, hint,
+ 						  &dst_box, &tmp->damage);
++		assert(!tmp->damage || !DAMAGE_IS_ALL(*tmp->damage));
+ 		if (tmp->dst.bo) {
+ 			if (!kgem_bo_can_blt(&sna->kgem, tmp->dst.bo)) {
+ 				DBG(("%s: can not blit to dst, tiling? %d, pitch? %d\n",
+@@ -2657,6 +2726,8 @@ fill:
+ 			}
+ 			if (hint & REPLACES)
+ 				kgem_bo_undo(&sna->kgem, tmp->dst.bo);
++			if (flags & COMPOSITE_UPLOAD)
++				return false;
+ 		} else {
+ 			RegionRec region;
+ 
+@@ -2720,8 +2791,8 @@ fill:
+ 	if (is_clear(src_pixmap)) {
+ 		if (src->repeat ||
+ 		    (x >= 0 && y >= 0 &&
+-		     x + width  < src_pixmap->drawable.width &&
+-		     y + height < src_pixmap->drawable.height)) {
++		     x + width  <= src_pixmap->drawable.width &&
++		     y + height <= src_pixmap->drawable.height)) {
+ 			color = color_convert(sna_pixmap(src_pixmap)->clear_color,
+ 					      src->format, tmp->dst.format);
+ 			goto fill;
+@@ -2795,7 +2866,7 @@ fill:
+ 		if (src_pixmap->drawable.width  <= sna->render.max_3d_size &&
+ 		    src_pixmap->drawable.height <= sna->render.max_3d_size &&
+ 		    bo->pitch <= sna->render.max_3d_pitch &&
+-		    (flags & COMPOSITE_FALLBACK) == 0)
++		    (flags & (COMPOSITE_UPLOAD | COMPOSITE_FALLBACK)) == 0)
+ 		{
+ 			return false;
+ 		}
+@@ -2817,6 +2888,7 @@ fill:
+ 	}
+ 	tmp->dst.bo = sna_drawable_use_bo(dst->pDrawable, hint,
+ 					  &dst_box, &tmp->damage);
++	assert(!tmp->damage || !DAMAGE_IS_ALL(*tmp->damage));
+ 
+ 	if (tmp->dst.bo && hint & REPLACES) {
+ 		struct sna_pixmap *priv = sna_pixmap(tmp->dst.pixmap);
+@@ -2846,7 +2918,7 @@ fallback:
+ 			DBG(("%s: fallback -- unaccelerated upload\n",
+ 			     __FUNCTION__));
+ 			goto fallback;
+-		} else {
++		} else if ((flags & COMPOSITE_UPLOAD) == 0) {
+ 			ret = prepare_blt_copy(sna, tmp, bo, alpha_fixup);
+ 			if (!ret)
+ 				goto fallback;
+@@ -3023,6 +3095,7 @@ sna_blt_composite__convert(struct sna *sna,
+ 		}
+ 		_kgem_set_mode(&sna->kgem, KGEM_BLT);
+ 	}
++	kgem_bcs_set_tiling(&sna->kgem, tmp->src.bo, tmp->dst.bo);
+ 
+ 	if (alpha_fixup) {
+ 		tmp->blt   = blt_composite_copy_with_alpha;
+@@ -3062,7 +3135,7 @@ static void sna_blt_fill_op_blt(struct sna *sna,
+ 	if (sna->blt_state.fill_bo != op->base.u.blt.bo[0]->unique_id) {
+ 		const struct sna_blt_state *blt = &op->base.u.blt;
+ 
+-		sna_blt_fill_begin(sna, blt);
++		__sna_blt_fill_begin(sna, blt);
+ 
+ 		sna->blt_state.fill_bo = blt->bo[0]->unique_id;
+ 		sna->blt_state.fill_pixel = blt->pixel;
+@@ -3079,7 +3152,7 @@ fastcall static void sna_blt_fill_op_box(struct sna *sna,
+ 	if (sna->blt_state.fill_bo != op->base.u.blt.bo[0]->unique_id) {
+ 		const struct sna_blt_state *blt = &op->base.u.blt;
+ 
+-		sna_blt_fill_begin(sna, blt);
++		__sna_blt_fill_begin(sna, blt);
+ 
+ 		sna->blt_state.fill_bo = blt->bo[0]->unique_id;
+ 		sna->blt_state.fill_pixel = blt->pixel;
+@@ -3097,7 +3170,7 @@ fastcall static void sna_blt_fill_op_boxes(struct sna *sna,
+ 	if (sna->blt_state.fill_bo != op->base.u.blt.bo[0]->unique_id) {
+ 		const struct sna_blt_state *blt = &op->base.u.blt;
+ 
+-		sna_blt_fill_begin(sna, blt);
++		__sna_blt_fill_begin(sna, blt);
+ 
+ 		sna->blt_state.fill_bo = blt->bo[0]->unique_id;
+ 		sna->blt_state.fill_pixel = blt->pixel;
+@@ -3132,7 +3205,7 @@ fastcall static void sna_blt_fill_op_points(struct sna *sna,
+ 	DBG(("%s: %08x x %d\n", __FUNCTION__, blt->pixel, n));
+ 
+ 	if (sna->blt_state.fill_bo != op->base.u.blt.bo[0]->unique_id) {
+-		sna_blt_fill_begin(sna, blt);
++		__sna_blt_fill_begin(sna, blt);
+ 
+ 		sna->blt_state.fill_bo = blt->bo[0]->unique_id;
+ 		sna->blt_state.fill_pixel = blt->pixel;
+@@ -3162,65 +3235,15 @@ fastcall static void sna_blt_fill_op_points(struct sna *sna,
+ 		assert(kgem->nbatch < kgem->surface);
+ 
+ 		if ((dx|dy) == 0) {
+-			while (n_this_time >= 8) {
+-				*((uint64_t *)b + 0) = pt_add(cmd, p+0, 0, 0);
+-				*((uint64_t *)b + 1) = pt_add(cmd, p+1, 0, 0);
+-				*((uint64_t *)b + 2) = pt_add(cmd, p+2, 0, 0);
+-				*((uint64_t *)b + 3) = pt_add(cmd, p+3, 0, 0);
+-				*((uint64_t *)b + 4) = pt_add(cmd, p+4, 0, 0);
+-				*((uint64_t *)b + 5) = pt_add(cmd, p+5, 0, 0);
+-				*((uint64_t *)b + 6) = pt_add(cmd, p+6, 0, 0);
+-				*((uint64_t *)b + 7) = pt_add(cmd, p+7, 0, 0);
+-				b += 16;
+-				n_this_time -= 8;
+-				p += 8;
+-			}
+-			if (n_this_time & 4) {
+-				*((uint64_t *)b + 0) = pt_add(cmd, p+0, 0, 0);
+-				*((uint64_t *)b + 1) = pt_add(cmd, p+1, 0, 0);
+-				*((uint64_t *)b + 2) = pt_add(cmd, p+2, 0, 0);
+-				*((uint64_t *)b + 3) = pt_add(cmd, p+3, 0, 0);
+-				b += 8;
+-				p += 4;
+-			}
+-			if (n_this_time & 2) {
+-				*((uint64_t *)b + 0) = pt_add(cmd, p+0, 0, 0);
+-				*((uint64_t *)b + 1) = pt_add(cmd, p+1, 0, 0);
+-				b += 4;
+-				p += 2;
+-			}
+-			if (n_this_time & 1)
+-				*((uint64_t *)b + 0) = pt_add(cmd, p++, 0, 0);
++			do {
++				*(uint64_t *)b = pt_add(cmd, p++, 0, 0);
++				b += 2;
++			} while (--n_this_time);
+ 		} else {
+-			while (n_this_time >= 8) {
+-				*((uint64_t *)b + 0) = pt_add(cmd, p+0, dx, dy);
+-				*((uint64_t *)b + 1) = pt_add(cmd, p+1, dx, dy);
+-				*((uint64_t *)b + 2) = pt_add(cmd, p+2, dx, dy);
+-				*((uint64_t *)b + 3) = pt_add(cmd, p+3, dx, dy);
+-				*((uint64_t *)b + 4) = pt_add(cmd, p+4, dx, dy);
+-				*((uint64_t *)b + 5) = pt_add(cmd, p+5, dx, dy);
+-				*((uint64_t *)b + 6) = pt_add(cmd, p+6, dx, dy);
+-				*((uint64_t *)b + 7) = pt_add(cmd, p+7, dx, dy);
+-				b += 16;
+-				n_this_time -= 8;
+-				p += 8;
+-			}
+-			if (n_this_time & 4) {
+-				*((uint64_t *)b + 0) = pt_add(cmd, p+0, dx, dy);
+-				*((uint64_t *)b + 1) = pt_add(cmd, p+1, dx, dy);
+-				*((uint64_t *)b + 2) = pt_add(cmd, p+2, dx, dy);
+-				*((uint64_t *)b + 3) = pt_add(cmd, p+3, dx, dy);
+-				b += 8;
+-				p += 8;
+-			}
+-			if (n_this_time & 2) {
+-				*((uint64_t *)b + 0) = pt_add(cmd, p+0, dx, dy);
+-				*((uint64_t *)b + 1) = pt_add(cmd, p+1, dx, dy);
+-				b += 4;
+-				p += 2;
+-			}
+-			if (n_this_time & 1)
+-				*((uint64_t *)b + 0) = pt_add(cmd, p++, dx, dy);
++			do {
++				*(uint64_t *)b = pt_add(cmd, p++, dx, dy);
++				b += 2;
++			} while (--n_this_time);
+ 		}
+ 
+ 		if (!n)
+@@ -3414,6 +3437,7 @@ static bool sna_blt_fill_box(struct sna *sna, uint8_t alu,
+ 
+ 		_kgem_set_mode(kgem, KGEM_BLT);
+ 	}
++	kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 	assert(kgem_check_batch(kgem, 6));
+ 	assert(kgem_check_reloc(kgem, 1));
+@@ -3520,6 +3544,8 @@ bool sna_blt_fill_boxes(struct sna *sna, uint8_t alu,
+ 			_kgem_set_mode(kgem, KGEM_BLT);
+ 		}
+ 
++		kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
++
+ 		assert(sna->kgem.mode == KGEM_BLT);
+ 		b = kgem->batch + kgem->nbatch;
+ 		if (kgem->gen >= 0100) {
+@@ -3608,6 +3634,7 @@ bool sna_blt_fill_boxes(struct sna *sna, uint8_t alu,
+ 
+ 			_kgem_submit(kgem);
+ 			_kgem_set_mode(kgem, KGEM_BLT);
++			kgem_bcs_set_tiling(&sna->kgem, NULL, bo);
+ 
+ 			assert(sna->kgem.mode == KGEM_BLT);
+ 			b = kgem->batch + kgem->nbatch;
+@@ -3754,6 +3781,7 @@ bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
+ 		}
+ 		_kgem_set_mode(kgem, KGEM_BLT);
+ 	}
++	kgem_bcs_set_tiling(&sna->kgem, src_bo, dst_bo);
+ 
+ 	if ((dst_dx | dst_dy) == 0) {
+ 		if (kgem->gen >= 0100) {
+@@ -3814,6 +3842,7 @@ bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
+ 
+ 				_kgem_submit(kgem);
+ 				_kgem_set_mode(kgem, KGEM_BLT);
++				kgem_bcs_set_tiling(&sna->kgem, src_bo, dst_bo);
+ 			} while (1);
+ 		} else {
+ 			uint64_t hdr = (uint64_t)br13 << 32 | cmd | 6;
+@@ -3871,6 +3900,7 @@ bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
+ 
+ 				_kgem_submit(kgem);
+ 				_kgem_set_mode(kgem, KGEM_BLT);
++				kgem_bcs_set_tiling(&sna->kgem, src_bo, dst_bo);
+ 			} while (1);
+ 		}
+ 	} else {
+@@ -3932,6 +3962,7 @@ bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
+ 
+ 				_kgem_submit(kgem);
+ 				_kgem_set_mode(kgem, KGEM_BLT);
++				kgem_bcs_set_tiling(&sna->kgem, src_bo, dst_bo);
+ 			} while (1);
+ 		} else {
+ 			cmd |= 6;
+@@ -3989,6 +4020,7 @@ bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
+ 
+ 				_kgem_submit(kgem);
+ 				_kgem_set_mode(kgem, KGEM_BLT);
++				kgem_bcs_set_tiling(&sna->kgem, src_bo, dst_bo);
+ 			} while (1);
+ 		}
+ 	}
+@@ -4095,6 +4127,7 @@ bool sna_blt_copy_boxes__with_alpha(struct sna *sna, uint8_t alu,
+ 		    !kgem_check_reloc(kgem, 2)) {
+ 			_kgem_submit(kgem);
+ 			_kgem_set_mode(kgem, KGEM_BLT);
++			kgem_bcs_set_tiling(&sna->kgem, src_bo, dst_bo);
+ 		}
+ 
+ 		assert(sna->kgem.mode == KGEM_BLT);
+@@ -4190,6 +4223,7 @@ bool sna_blt_copy_boxes_fallback(struct sna *sna, uint8_t alu,
+ 		DBG(("%s: dst == src\n", __FUNCTION__));
+ 
+ 		if (src_bo->tiling == I915_TILING_Y &&
++		    !sna->kgem.can_blt_y &&
+ 		    kgem_bo_blt_pitch_is_ok(&sna->kgem, src_bo)) {
+ 			struct kgem_bo *bo;
+ 
+@@ -4237,6 +4271,7 @@ bool sna_blt_copy_boxes_fallback(struct sna *sna, uint8_t alu,
+ 		}
+ 	} else {
+ 		if (src_bo->tiling == I915_TILING_Y &&
++		    !sna->kgem.can_blt_y &&
+ 		    kgem_bo_blt_pitch_is_ok(&sna->kgem, src_bo)) {
+ 			DBG(("%s: src is y-tiled\n", __FUNCTION__));
+ 			if (src->type != DRAWABLE_PIXMAP)
+@@ -4251,6 +4286,7 @@ bool sna_blt_copy_boxes_fallback(struct sna *sna, uint8_t alu,
+ 		}
+ 
+ 		if (dst_bo->tiling == I915_TILING_Y &&
++		    !sna->kgem.can_blt_y &&
+ 		    kgem_bo_blt_pitch_is_ok(&sna->kgem, dst_bo)) {
+ 			DBG(("%s: dst is y-tiled\n", __FUNCTION__));
+ 			if (dst->type != DRAWABLE_PIXMAP)
+diff --git a/src/sna/sna_composite.c b/src/sna/sna_composite.c
+index f01f020e..1da8c291 100644
+--- a/src/sna/sna_composite.c
++++ b/src/sna/sna_composite.c
+@@ -452,6 +452,8 @@ static void apply_damage(struct sna_composite_op *op, RegionPtr region)
+ 		op->damage = NULL;
+ 	} else
+ 		sna_damage_add(op->damage, region);
++
++	assert(!op->damage || !DAMAGE_IS_ALL(*op->damage));
+ }
+ 
+ static inline bool use_cpu(PixmapPtr pixmap, struct sna_pixmap *priv,
+@@ -653,8 +655,9 @@ sna_composite(CARD8 op,
+ 	RegionRec region;
+ 	int dx, dy;
+ 
+-	DBG(("%s(%d src=%ld+(%d, %d), mask=%ld+(%d, %d), dst=%ld+(%d, %d)+(%d, %d), size=(%d, %d)\n",
+-	     __FUNCTION__, op,
++	DBG(("%s(pixmap=%ld, op=%d, src=%ld+(%d, %d), mask=%ld+(%d, %d), dst=%ld+(%d, %d)+(%d, %d), size=(%d, %d)\n",
++	     __FUNCTION__,
++	     pixmap->drawable.serialNumber, op,
+ 	     get_picture_id(src), src_x, src_y,
+ 	     get_picture_id(mask), mask_x, mask_y,
+ 	     get_picture_id(dst), dst_x, dst_y,
+@@ -673,13 +676,6 @@ sna_composite(CARD8 op,
+ 			src = sna->clear;
+ 	}
+ 
+-	if (mask && sna_composite_mask_is_opaque(mask)) {
+-		DBG(("%s: removing opaque %smask\n",
+-		     __FUNCTION__,
+-		     mask->componentAlpha && PICT_FORMAT_RGB(mask->format) ? "CA " : ""));
+-		mask = NULL;
+-	}
+-
+ 	if (!sna_compute_composite_region(&region,
+ 					  src, mask, dst,
+ 					  src_x,  src_y,
+@@ -688,6 +684,13 @@ sna_composite(CARD8 op,
+ 					  width,  height))
+ 		return;
+ 
++	if (mask && sna_composite_mask_is_opaque(mask)) {
++		DBG(("%s: removing opaque %smask\n",
++		     __FUNCTION__,
++		     mask->componentAlpha && PICT_FORMAT_RGB(mask->format) ? "CA " : ""));
++		mask = NULL;
++	}
++
+ 	if (NO_COMPOSITE)
+ 		goto fallback;
+ 
+@@ -756,6 +759,7 @@ sna_composite(CARD8 op,
+ 		DBG(("%s: fallback due unhandled composite op\n", __FUNCTION__));
+ 		goto fallback;
+ 	}
++	assert(!tmp.damage || !DAMAGE_IS_ALL(*tmp.damage));
+ 
+ 	if (region.data == NULL)
+ 		tmp.box(sna, &tmp, &region.extents);
+@@ -797,8 +801,10 @@ sna_composite_rectangles(CARD8		 op,
+ 	int i, num_boxes;
+ 	unsigned hint;
+ 
+-	DBG(("%s(op=%d, %08x x %d [(%d, %d)x(%d, %d) ...])\n",
+-	     __FUNCTION__, op,
++	DBG(("%s(pixmap=%ld, op=%d, %08x x %d [(%d, %d)x(%d, %d) ...])\n",
++	     __FUNCTION__,
++	     get_drawable_pixmap(dst->pDrawable)->drawable.serialNumber,
++	     op,
+ 	     (color->alpha >> 8 << 24) |
+ 	     (color->red   >> 8 << 16) |
+ 	     (color->green >> 8 << 8) |
+@@ -814,38 +820,40 @@ sna_composite_rectangles(CARD8		 op,
+ 		return;
+ 	}
+ 
+-	if ((color->red|color->green|color->blue|color->alpha) <= 0x00ff) {
+-		switch (op) {
+-		case PictOpOver:
+-		case PictOpOutReverse:
+-		case PictOpAdd:
+-			return;
+-		case  PictOpInReverse:
+-		case  PictOpSrc:
+-			op = PictOpClear;
+-			break;
+-		case  PictOpAtopReverse:
+-			op = PictOpOut;
+-			break;
+-		case  PictOpXor:
+-			op = PictOpOverReverse;
+-			break;
+-		}
+-	}
+ 	if (color->alpha <= 0x00ff) {
+-		switch (op) {
+-		case PictOpOver:
+-		case PictOpOutReverse:
+-			return;
+-		case  PictOpInReverse:
+-			op = PictOpClear;
+-			break;
+-		case  PictOpAtopReverse:
+-			op = PictOpOut;
+-			break;
+-		case  PictOpXor:
+-			op = PictOpOverReverse;
+-			break;
++		if (PICT_FORMAT_TYPE(dst->format) == PICT_TYPE_A ||
++		    (color->red|color->green|color->blue) <= 0x00ff) {
++			switch (op) {
++			case PictOpOver:
++			case PictOpOutReverse:
++			case PictOpAdd:
++				return;
++			case  PictOpInReverse:
++			case  PictOpSrc:
++				op = PictOpClear;
++				break;
++			case  PictOpAtopReverse:
++				op = PictOpOut;
++				break;
++			case  PictOpXor:
++				op = PictOpOverReverse;
++				break;
++			}
++		} else {
++			switch (op) {
++			case PictOpOver:
++			case PictOpOutReverse:
++				return;
++			case  PictOpInReverse:
++				op = PictOpClear;
++				break;
++			case  PictOpAtopReverse:
++				op = PictOpOut;
++				break;
++			case  PictOpXor:
++				op = PictOpOverReverse;
++				break;
++			}
+ 		}
+ 	} else if (color->alpha >= 0xff00) {
+ 		switch (op) {
+@@ -863,11 +871,16 @@ sna_composite_rectangles(CARD8		 op,
+ 		case  PictOpXor:
+ 			op = PictOpOut;
+ 			break;
++		case PictOpAdd:
++			if (PICT_FORMAT_TYPE(dst->format) == PICT_TYPE_A ||
++			    (color->red&color->green&color->blue) >= 0xff00)
++				op = PictOpSrc;
++			break;
+ 		}
+ 	}
+ 
+ 	/* Avoid reducing overlapping translucent rectangles */
+-	if (op == PictOpOver &&
++	if ((op == PictOpOver || op == PictOpAdd) &&
+ 	    num_rects == 1 &&
+ 	    sna_drawable_is_clear(dst->pDrawable))
+ 		op = PictOpSrc;
+@@ -979,6 +992,9 @@ sna_composite_rectangles(CARD8		 op,
+ 			bool ok;
+ 
+ 			if (op == PictOpClear) {
++				if (priv->clear_color == 0)
++					goto done;
++
+ 				ok = sna_get_pixel_from_rgba(&pixel,
+ 							     0, 0, 0, 0,
+ 							     dst->format);
+@@ -990,8 +1006,11 @@ sna_composite_rectangles(CARD8		 op,
+ 							     color->alpha,
+ 							     dst->format);
+ 			}
+-			if (ok && priv->clear_color == pixel)
++			if (ok && priv->clear_color == pixel) {
++				DBG(("%s: matches current clear, skipping\n",
++				     __FUNCTION__));
+ 				goto done;
++			}
+ 		}
+ 
+ 		if (region.data == NULL) {
+diff --git a/src/sna/sna_damage.h b/src/sna/sna_damage.h
+index 272e83bc..d5c727ee 100644
+--- a/src/sna/sna_damage.h
++++ b/src/sna/sna_damage.h
+@@ -267,7 +267,7 @@ int _sna_damage_get_boxes(struct sna_damage *damage, const BoxRec **boxes);
+ static inline int
+ sna_damage_get_boxes(struct sna_damage *damage, const BoxRec **boxes)
+ {
+-	assert(damage);
++	assert(DAMAGE_PTR(damage));
+ 
+ 	if (DAMAGE_IS_ALL(damage)) {
+ 		*boxes = &DAMAGE_PTR(damage)->extents;
+@@ -322,7 +322,8 @@ static inline void sna_damage_destroy(struct sna_damage **damage)
+ 	if (*damage == NULL)
+ 		return;
+ 
+-	__sna_damage_destroy(DAMAGE_PTR(*damage));
++	if (DAMAGE_PTR(*damage))
++		__sna_damage_destroy(DAMAGE_PTR(*damage));
+ 	*damage = NULL;
+ }
+ 
+diff --git a/src/sna/sna_display.c b/src/sna/sna_display.c
+index 4b218b70..9b77550e 100644
+--- a/src/sna/sna_display.c
++++ b/src/sna/sna_display.c
+@@ -39,6 +39,25 @@
+ #include <errno.h>
+ #include <poll.h>
+ #include <ctype.h>
++#include <dirent.h>
++
++#if HAVE_ALLOCA_H
++#include <alloca.h>
++#elif defined __GNUC__
++#define alloca __builtin_alloca
++#elif defined _AIX
++#define alloca __alloca
++#elif defined _MSC_VER
++#include <malloc.h>
++#define alloca _alloca
++#else
++void *alloca(size_t);
++#endif
++
++#define _PARSE_EDID_
++/* Jump through a few hoops in order to fixup EDIDs */
++#undef VERSION
++#undef REVISION
+ 
+ #include "sna.h"
+ #include "sna_reg.h"
+@@ -72,6 +91,10 @@
+ #include <memcheck.h>
+ #endif
+ 
++#define FAIL_CURSOR_IOCTL 0
++
++#define COLDPLUG_DELAY_MS 2000
++
+ /* Minor discrepancy between 32-bit/64-bit ABI in old kernels */
+ union compat_mode_get_connector{
+ 	struct drm_mode_get_connector conn;
+@@ -88,6 +111,8 @@ union compat_mode_get_connector{
+ #define DEFAULT_DPI 96
+ #endif
+ 
++#define OUTPUT_STATUS_CACHE_MS 15000
++
+ #define DRM_MODE_PAGE_FLIP_ASYNC 0x02
+ 
+ #define DRM_CLIENT_CAP_UNIVERSAL_PLANES 2
+@@ -106,33 +131,87 @@ struct local_mode_obj_get_properties {
+ };
+ #define LOCAL_MODE_OBJECT_PLANE 0xeeeeeeee
+ 
+-#if 0
++struct local_mode_set_plane {
++	uint32_t plane_id;
++	uint32_t crtc_id;
++	uint32_t fb_id; /* fb object contains surface format type */
++	uint32_t flags;
++
++	/* Signed dest location allows it to be partially off screen */
++	int32_t crtc_x, crtc_y;
++	uint32_t crtc_w, crtc_h;
++
++	/* Source values are 16.16 fixed point */
++	uint32_t src_x, src_y;
++	uint32_t src_h, src_w;
++};
++#define LOCAL_IOCTL_MODE_SETPLANE DRM_IOWR(0xB7, struct local_mode_set_plane)
++
++struct local_mode_get_plane {
++	uint32_t plane_id;
++
++	uint32_t crtc_id;
++	uint32_t fb_id;
++
++	uint32_t possible_crtcs;
++	uint32_t gamma_size;
++
++	uint32_t count_format_types;
++	uint64_t format_type_ptr;
++};
++#define LOCAL_IOCTL_MODE_GETPLANE DRM_IOWR(0xb6, struct local_mode_get_plane)
++
++struct local_mode_get_plane_res {
++	uint64_t plane_id_ptr;
++	uint64_t count_planes;
++};
++#define LOCAL_IOCTL_MODE_GETPLANERESOURCES DRM_IOWR(0xb5, struct local_mode_get_plane_res)
++
++#if 1
+ #define __DBG DBG
+ #else
+ #define __DBG(x)
+ #endif
+ 
++#define DBG_NATIVE_ROTATION ~0 /* minimum RR_Rotate_0 */
++
+ extern XF86ConfigPtr xf86configptr;
+ 
++struct sna_cursor {
++	struct sna_cursor *next;
++	uint32_t *image;
++	bool transformed;
++	Rotation rotation;
++	int ref;
++	int size;
++	int last_width;
++	int last_height;
++	unsigned handle;
++	unsigned serial;
++	unsigned alloc;
++};
++
+ struct sna_crtc {
++	unsigned long flags;
++	uint32_t id;
+ 	xf86CrtcPtr base;
+ 	struct drm_mode_modeinfo kmode;
+-	int dpms_mode;
+ 	PixmapPtr slave_pixmap;
+ 	DamagePtr slave_damage;
+-	struct kgem_bo *bo, *shadow_bo, *client_bo;
++	struct kgem_bo *bo, *shadow_bo, *client_bo, *cache_bo;
+ 	struct sna_cursor *cursor;
+ 	unsigned int last_cursor_size;
+ 	uint32_t offset;
+ 	bool shadow;
+ 	bool fallback_shadow;
+ 	bool transform;
++	bool cursor_transform;
++	bool hwcursor;
+ 	bool flip_pending;
+-	uint8_t id;
+-	uint8_t pipe;
+ 
+-	RegionRec client_damage; /* XXX overlap with shadow damage? */
++	struct pict_f_transform cursor_to_fb, fb_to_cursor;
+ 
++	RegionRec crtc_damage;
+ 	uint16_t shadow_bo_width, shadow_bo_height;
+ 
+ 	uint32_t rotation;
+@@ -143,7 +222,9 @@ struct sna_crtc {
+ 			uint32_t supported;
+ 			uint32_t current;
+ 		} rotation;
+-	} primary, sprite;
++		struct list link;
++	} primary;
++	struct list sprites;
+ 
+ 	uint32_t mode_serial, flip_serial;
+ 
+@@ -173,21 +254,33 @@ struct sna_output {
+ 
+ 	unsigned int is_panel : 1;
+ 	unsigned int add_default_modes : 1;
++	int connector_type;
++	int connector_type_id;
++
++	uint32_t link_status_idx;
+ 
+ 	uint32_t edid_idx;
+ 	uint32_t edid_blob_id;
+ 	uint32_t edid_len;
+ 	void *edid_raw;
++	xf86MonPtr fake_edid_mon;
++	void *fake_edid_raw;
+ 
+ 	bool has_panel_limits;
+ 	int panel_hdisplay;
+ 	int panel_vdisplay;
+ 
+ 	uint32_t dpms_id;
+-	int dpms_mode;
++	uint8_t dpms_mode;
+ 	struct backlight backlight;
+ 	int backlight_active_level;
+ 
++	uint32_t last_detect;
++	uint32_t status;
++	unsigned int hotplug_count;
++	bool update_properties;
++	bool reprobe;
++
+ 	int num_modes;
+ 	struct drm_mode_modeinfo *modes;
+ 
+@@ -218,13 +311,91 @@ enum { /* XXX copied from hw/xfree86/modes/xf86Crtc.c */
+ 	OPTION_DEFAULT_MODES,
+ };
+ 
++static void __sna_output_dpms(xf86OutputPtr output, int dpms, int fixup);
+ static void sna_crtc_disable_cursor(struct sna *sna, struct sna_crtc *crtc);
++static bool sna_crtc_flip(struct sna *sna, struct sna_crtc *crtc,
++			  struct kgem_bo *bo, int x, int y);
+ 
+ static bool is_zaphod(ScrnInfoPtr scrn)
+ {
+ 	return xf86IsEntityShared(scrn->entityList[0]);
+ }
+ 
++static bool
++sna_zaphod_match(struct sna *sna, const char *output)
++{
++	const char *s, *colon;
++	char t[20];
++	unsigned int i = 0;
++
++	s = xf86GetOptValString(sna->Options, OPTION_ZAPHOD);
++	if (s == NULL)
++		return false;
++
++	colon = strchr(s, ':');
++	if (colon) /* Skip over the ZaphodPipes */
++		s = colon + 1;
++
++	do {
++		/* match any outputs in a comma list, stopping at whitespace */
++		switch (*s) {
++		case '\0':
++			t[i] = '\0';
++			return strcmp(t, output) == 0;
++
++		case ',':
++			t[i] ='\0';
++			if (strcmp(t, output) == 0)
++				return TRUE;
++			i = 0;
++			break;
++
++		case ' ':
++		case '\t':
++		case '\n':
++		case '\r':
++			break;
++
++		default:
++			t[i++] = *s;
++			break;
++		}
++
++		s++;
++	} while (i < sizeof(t));
++
++	return false;
++}
++
++static unsigned
++get_zaphod_crtcs(struct sna *sna)
++{
++	const char *str, *colon;
++	unsigned crtcs = 0;
++	/* Build a bitmask from the pipe numbers listed before ':' in the Zaphod option. */
++	str = xf86GetOptValString(sna->Options, OPTION_ZAPHOD);
++	if (str == NULL || (colon = strchr(str, ':')) == NULL) {
++		DBG(("%s: no zaphod pipes, using screen number: %x\n",
++		     __FUNCTION__,
++		     sna->scrn->confScreen->device->screen));
++		return 1 << sna->scrn->confScreen->device->screen;
++	}
++
++	DBG(("%s: ZaphodHeads='%s'\n", __FUNCTION__, str));
++	while (str < colon) {
++		char *end;
++		unsigned crtc = strtoul(str, &end, 0);
++		if (end == str)
++			break;
++		DBG(("%s: adding CRTC %d to zaphod pipes\n",
++		     __FUNCTION__, crtc));
++		crtcs |= crtc < 8 * sizeof(crtcs) ? 1u << crtc : 0; /* ignore out-of-range pipes: oversized shift is undefined */
++		str = end + 1;
++	}
++	DBG(("%s: ZaphodPipes=%x\n", __FUNCTION__, crtcs));
++	return crtcs;
++}
++
+ inline static unsigned count_to_mask(int x)
+ {
+ 	return (1 << x) - 1;
+@@ -247,6 +418,21 @@ static inline struct sna_crtc *to_sna_crtc(xf86CrtcPtr crtc)
+ 	return crtc->driver_private;
+ }
+ 
++static inline unsigned __sna_crtc_pipe(struct sna_crtc *crtc)
++{
++	return crtc->flags >> 8 & 0xff;
++}
++
++static inline unsigned __sna_crtc_id(struct sna_crtc *crtc)
++{
++	return crtc->id;
++}
++
++uint32_t sna_crtc_id(xf86CrtcPtr crtc)
++{
++	return __sna_crtc_id(to_sna_crtc(crtc));
++}
++
+ static inline bool event_pending(int fd)
+ {
+ 	struct pollfd pfd;
+@@ -268,29 +454,37 @@ static inline uint32_t fb_id(struct kgem_bo *bo)
+ 	return bo->delta;
+ }
+ 
+-uint32_t sna_crtc_id(xf86CrtcPtr crtc)
++unsigned sna_crtc_count_sprites(xf86CrtcPtr crtc)
+ {
+-	if (to_sna_crtc(crtc) == NULL)
+-		return 0;
+-	return to_sna_crtc(crtc)->id;
+-}
++	struct plane *sprite;
++	unsigned count;
+ 
+-int sna_crtc_to_pipe(xf86CrtcPtr crtc)
+-{
+-	assert(to_sna_crtc(crtc));
+-	return to_sna_crtc(crtc)->pipe;
++	count = 0;
++	list_for_each_entry(sprite, &to_sna_crtc(crtc)->sprites, link)
++		count++;
++
++	return count;
+ }
+ 
+-uint32_t sna_crtc_to_sprite(xf86CrtcPtr crtc)
++static struct plane *lookup_sprite(struct sna_crtc *crtc, unsigned idx)
+ {
+-	assert(to_sna_crtc(crtc));
+-	return to_sna_crtc(crtc)->sprite.id;
++	struct plane *sprite;
++
++	list_for_each_entry(sprite, &crtc->sprites, link)
++		if (idx-- == 0)
++			return sprite;
++
++	return NULL;
+ }
+ 
+-bool sna_crtc_is_on(xf86CrtcPtr crtc)
++uint32_t sna_crtc_to_sprite(xf86CrtcPtr crtc, unsigned idx)
+ {
++	struct plane *sprite;
++
+ 	assert(to_sna_crtc(crtc));
+-	return to_sna_crtc(crtc)->bo != NULL;
++
++	sprite = lookup_sprite(to_sna_crtc(crtc), idx);
++	return sprite ? sprite->id : 0;
+ }
+ 
+ bool sna_crtc_is_transformed(xf86CrtcPtr crtc)
+@@ -299,34 +493,48 @@ bool sna_crtc_is_transformed(xf86CrtcPtr crtc)
+ 	return to_sna_crtc(crtc)->transform;
+ }
+ 
+-static inline uint64_t msc64(struct sna_crtc *sna_crtc, uint32_t seq)
++static inline bool msc64(struct sna_crtc *sna_crtc, uint32_t seq, uint64_t *msc)
+ {
++	bool record = true;
+ 	if (seq < sna_crtc->last_seq) {
+ 		if (sna_crtc->last_seq - seq > 0x40000000) {
+ 			sna_crtc->wrap_seq++;
+ 			DBG(("%s: pipe=%d wrapped; was %u, now %u, wraps=%u\n",
+-			     __FUNCTION__, sna_crtc->pipe,
++			     __FUNCTION__, __sna_crtc_pipe(sna_crtc),
+ 			     sna_crtc->last_seq, seq, sna_crtc->wrap_seq));
+-		} else  {
+-			ERR(("%s: pipe=%d msc went backwards; was %u, now %u\n",
+-			     __FUNCTION__, sna_crtc->pipe, sna_crtc->last_seq, seq));
+-			seq = sna_crtc->last_seq;
++		} else {
++			DBG(("%s: pipe=%d msc went backwards; was %u, now %u; ignoring for last_swap\n",
++			     __FUNCTION__, __sna_crtc_pipe(sna_crtc), sna_crtc->last_seq, seq));
++
++			record = false;
+ 		}
+ 	}
+-	sna_crtc->last_seq = seq;
+-	return (uint64_t)sna_crtc->wrap_seq << 32 | seq;
++	*msc = (uint64_t)sna_crtc->wrap_seq << 32 | seq;
++	return record;
+ }
+ 
+ uint64_t sna_crtc_record_swap(xf86CrtcPtr crtc,
+ 			      int tv_sec, int tv_usec, unsigned seq)
+ {
+ 	struct sna_crtc *sna_crtc = to_sna_crtc(crtc);
++	uint64_t msc;
++
+ 	assert(sna_crtc);
+-	DBG(("%s: recording last swap on pipe=%d, frame %d, time %d.%06d\n",
+-	     __FUNCTION__, sna_crtc->pipe, seq, tv_sec, tv_usec));
+-	sna_crtc->swap.tv_sec = tv_sec;
+-	sna_crtc->swap.tv_usec = tv_usec;
+-	return sna_crtc->swap.msc = msc64(sna_crtc, seq);
++
++	if (msc64(sna_crtc, seq, &msc)) {
++		DBG(("%s: recording last swap on pipe=%d, frame %d [msc=%08lld], time %d.%06d\n",
++		     __FUNCTION__, __sna_crtc_pipe(sna_crtc), seq, (long long)msc,
++		     tv_sec, tv_usec));
++		sna_crtc->swap.tv_sec = tv_sec;
++		sna_crtc->swap.tv_usec = tv_usec;
++		sna_crtc->swap.msc = msc;
++	} else {
++		DBG(("%s: swap event on pipe=%d, frame %d [msc=%08lld], time %d.%06d\n",
++		     __FUNCTION__, __sna_crtc_pipe(sna_crtc), seq, (long long)msc,
++		     tv_sec, tv_usec));
++	}
++
++	return msc;
+ }
+ 
+ const struct ust_msc *sna_crtc_last_swap(xf86CrtcPtr crtc)
+@@ -342,15 +550,6 @@ const struct ust_msc *sna_crtc_last_swap(xf86CrtcPtr crtc)
+ 	}
+ }
+ 
+-xf86CrtcPtr sna_mode_first_crtc(struct sna *sna)
+-{
+-	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
+-	if (sna->mode.num_real_crtc)
+-		return config->crtc[0];
+-	else
+-		return NULL;
+-}
+-
+ #ifndef NDEBUG
+ static void gem_close(int fd, uint32_t handle);
+ static void assert_scanout(struct kgem *kgem, struct kgem_bo *bo,
+@@ -372,12 +571,24 @@ static void assert_scanout(struct kgem *kgem, struct kgem_bo *bo,
+ #define assert_scanout(k, b, w, h)
+ #endif
+ 
++static void assert_crtc_fb(struct sna *sna, struct sna_crtc *crtc)
++{
++#ifndef NDEBUG
++	struct drm_mode_crtc mode = { .crtc_id = __sna_crtc_id(crtc) };
++	drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETCRTC, &mode);
++	assert(mode.fb_id == fb_id(crtc->bo));
++#endif
++}
++
+ static unsigned get_fb(struct sna *sna, struct kgem_bo *bo,
+ 		       int width, int height)
+ {
+ 	ScrnInfoPtr scrn = sna->scrn;
+ 	struct drm_mode_fb_cmd arg;
+ 
++	if (!kgem_bo_is_fenced(&sna->kgem, bo))
++		return 0;
++
+ 	assert(bo->refcnt);
+ 	assert(bo->proxy == NULL);
+ 	assert(!bo->snoop);
+@@ -393,8 +604,9 @@ static unsigned get_fb(struct sna *sna, struct kgem_bo *bo,
+ 	DBG(("%s: create fb %dx%d@%d/%d\n",
+ 	     __FUNCTION__, width, height, scrn->depth, scrn->bitsPerPixel));
+ 
+-	assert(bo->tiling != I915_TILING_Y);
++	assert(bo->tiling != I915_TILING_Y || sna->kgem.can_scanout_y);
+ 	assert((bo->pitch & 63) == 0);
++	assert(scrn->vtSema); /* must be master */
+ 
+ 	VG_CLEAR(arg);
+ 	arg.width = width;
+@@ -404,21 +616,83 @@ static unsigned get_fb(struct sna *sna, struct kgem_bo *bo,
+ 	arg.depth = scrn->depth;
+ 	arg.handle = bo->handle;
+ 
+-	assert(sna->scrn->vtSema); /* must be master */
+ 	if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_ADDFB, &arg)) {
+-		xf86DrvMsg(scrn->scrnIndex, X_ERROR,
+-			   "%s: failed to add fb: %dx%d depth=%d, bpp=%d, pitch=%d: %d\n",
+-			   __FUNCTION__, width, height,
+-			   scrn->depth, scrn->bitsPerPixel, bo->pitch, errno);
+-		return 0;
++		/* Try again with the fancy version */
++		struct local_mode_fb_cmd2 {
++			uint32_t fb_id;
++			uint32_t width, height;
++			uint32_t pixel_format;
++			uint32_t flags;
++
++			uint32_t handles[4];
++			uint32_t pitches[4]; /* pitch for each plane */
++			uint32_t offsets[4]; /* offset of each plane */
++			uint64_t modifiers[4];
++		} f;
++#define LOCAL_IOCTL_MODE_ADDFB2 DRM_IOWR(0xb8, struct local_mode_fb_cmd2)
++		memset(&f, 0, sizeof(f));
++		f.width = width;
++		f.height = height;
++		/* XXX interlaced */
++		f.flags = 1 << 1; /* +modifiers */
++		f.handles[0] = bo->handle;
++		f.pitches[0] = bo->pitch;
++
++		switch (bo->tiling) {
++		case I915_TILING_NONE:
++			break;
++		case I915_TILING_X:
++			/* I915_FORMAT_MOD_X_TILED */
++			f.modifiers[0] = (uint64_t)1 << 56 | 1;
++			break;
++		case I915_TILING_Y:
++			/* I915_FORMAT_MOD_Y_TILED */
++			f.modifiers[0] = (uint64_t)1 << 56 | 2;
++			break;
++		}
++
++#define fourcc(a,b,c,d) ((a) | (b) << 8 | (c) << 16 | (d) << 24)
++		switch (scrn->depth) {
++		default:
++			ERR(("%s: unhandled screen format, depth=%d\n",
++			     __FUNCTION__, scrn->depth));
++			goto fail;
++		case 8:
++			f.pixel_format = fourcc('C', '8', ' ', ' ');
++			break;
++		case 15:
++			f.pixel_format = fourcc('X', 'R', '1', '5');
++			break;
++		case 16:
++			f.pixel_format = fourcc('R', 'G', '1', '6');
++			break;
++		case 24:
++			f.pixel_format = fourcc('X', 'R', '2', '4');
++			break;
++		case 30:
++			f.pixel_format = fourcc('X', 'R', '3', '0');
++			break;
++		}
++#undef fourcc
++
++		if (drmIoctl(sna->kgem.fd, LOCAL_IOCTL_MODE_ADDFB2, &f)) {
++fail:
++			xf86DrvMsg(scrn->scrnIndex, X_ERROR,
++				   "%s: failed to add fb: %dx%d depth=%d, bpp=%d, pitch=%d: %d\n",
++				   __FUNCTION__, width, height,
++				   scrn->depth, scrn->bitsPerPixel, bo->pitch, errno);
++			return 0;
++		}
++
++		arg.fb_id = f.fb_id;
+ 	}
+ 	assert(arg.fb_id != 0);
+-
++	bo->delta = arg.fb_id;
+ 	DBG(("%s: attached fb=%d to handle=%d\n",
+-	     __FUNCTION__, arg.fb_id, arg.handle));
++	     __FUNCTION__, bo->delta, arg.handle));
+ 
+ 	bo->scanout = true;
+-	return bo->delta = arg.fb_id;
++	return bo->delta;
+ }
+ 
+ static uint32_t gem_create(int fd, int size)
+@@ -438,6 +712,7 @@ static uint32_t gem_create(int fd, int size)
+ static void *gem_mmap(int fd, int handle, int size)
+ {
+ 	struct drm_i915_gem_mmap_gtt mmap_arg;
++	struct drm_i915_gem_set_domain set_domain;
+ 	void *ptr;
+ 
+ 	VG_CLEAR(mmap_arg);
+@@ -449,6 +724,15 @@ static void *gem_mmap(int fd, int handle, int size)
+ 	if (ptr == MAP_FAILED)
+ 		return NULL;
+ 
++	VG_CLEAR(set_domain);
++	set_domain.handle = handle;
++	set_domain.read_domains = I915_GEM_DOMAIN_GTT;
++	set_domain.write_domain = I915_GEM_DOMAIN_GTT;
++	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain)) {
++		munmap(ptr, size);
++		return NULL;
++	}
++
+ 	return ptr;
+ }
+ 
+@@ -497,8 +781,6 @@ sna_backlight_uevent(int fd, void *closure)
+ 		if (sna_output->dpms_mode != DPMSModeOn)
+ 			continue;
+ 
+-		assert(output->randr_output);
+-
+ 		val = backlight_get(&sna_output->backlight);
+ 		if (val < 0)
+ 			continue;
+@@ -523,6 +805,7 @@ sna_backlight_uevent(int fd, void *closure)
+ 					       TRUE, FALSE);
+ 		}
+ 	}
++	DBG(("%s: complete\n", __FUNCTION__));
+ }
+ 
+ static void sna_backlight_pre_init(struct sna *sna)
+@@ -570,6 +853,7 @@ static void sna_backlight_drain_uevents(struct sna *sna)
+ 	if (sna->mode.backlight_monitor == NULL)
+ 		return;
+ 
++	DBG(("%s()\n", __FUNCTION__));
+ 	sna_backlight_uevent(udev_monitor_get_fd(sna->mode.backlight_monitor),
+ 			     sna);
+ }
+@@ -632,9 +916,22 @@ sna_output_backlight_set(struct sna_output *sna_output, int level)
+ 	return ret;
+ }
+ 
++static bool
++has_native_backlight(struct sna_output *sna_output)
++{
++	return sna_output->backlight.type == BL_RAW;
++}
++
+ static void
+ sna_output_backlight_off(struct sna_output *sna_output)
+ {
++	/* Trust the kernel to turn the native backlight off. However, we
++	 * do explicitly turn the backlight back on (when we wake the output)
++	 * just in case a third party turns it off!
++	 */
++	if (has_native_backlight(sna_output))
++		return;
++
+ 	DBG(("%s(%s)\n", __FUNCTION__, sna_output->base->name));
+ 	backlight_off(&sna_output->backlight);
+ 	sna_output_backlight_set(sna_output, 0);
+@@ -674,7 +971,7 @@ has_user_backlight_override(xf86OutputPtr output)
+ 	if (*str == '\0')
+ 		return (char *)str;
+ 
+-	if (backlight_exists(str) == BL_NONE) {
++	if (!backlight_exists(str)) {
+ 		xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
+ 			   "Unrecognised backlight control interface '%s'\n",
+ 			   str);
+@@ -684,6 +981,93 @@ has_user_backlight_override(xf86OutputPtr output)
+ 	return strdup(str);
+ }
+ 
++static int get_device_minor(int fd)
++{
++	struct stat st;
++
++	if (fstat(fd, &st) || !S_ISCHR(st.st_mode))
++		return -1;
++	/* Low six bits of the device number (0-63) give the N in /sys/class/drm/cardN. */
++	return st.st_rdev & 0x3f;
++}
++
++static const char * const sysfs_connector_types[] = {
++	/* DRM_MODE_CONNECTOR_Unknown */	"Unknown",
++	/* DRM_MODE_CONNECTOR_VGA */		"VGA",
++	/* DRM_MODE_CONNECTOR_DVII */		"DVI-I",
++	/* DRM_MODE_CONNECTOR_DVID */		"DVI-D",
++	/* DRM_MODE_CONNECTOR_DVIA */		"DVI-A",
++	/* DRM_MODE_CONNECTOR_Composite */	"Composite",
++	/* DRM_MODE_CONNECTOR_SVIDEO */		"SVIDEO",
++	/* DRM_MODE_CONNECTOR_LVDS */		"LVDS",
++	/* DRM_MODE_CONNECTOR_Component */	"Component",
++	/* DRM_MODE_CONNECTOR_9PinDIN */	"DIN",
++	/* DRM_MODE_CONNECTOR_DisplayPort */	"DP",
++	/* DRM_MODE_CONNECTOR_HDMIA */		"HDMI-A",
++	/* DRM_MODE_CONNECTOR_HDMIB */		"HDMI-B",
++	/* DRM_MODE_CONNECTOR_TV */		"TV",
++	/* DRM_MODE_CONNECTOR_eDP */		"eDP",
++	/* DRM_MODE_CONNECTOR_VIRTUAL */	"Virtual",
++	/* DRM_MODE_CONNECTOR_DSI */		"DSI",
++	/* DRM_MODE_CONNECTOR_DPI */		"DPI"
++};
++
++static char *has_connector_backlight(xf86OutputPtr output)
++{
++	struct sna_output *sna_output = output->driver_private;
++	struct sna *sna = to_sna(output->scrn);
++	char path[1024];
++	DIR *dir;
++	struct dirent *de;
++	int minor, len;
++	char *str = NULL;
++
++	if (sna_output->connector_type >= ARRAY_SIZE(sysfs_connector_types))
++		return NULL;
++
++	minor = get_device_minor(sna->kgem.fd);
++	if (minor < 0)
++		return NULL;
++
++	len = snprintf(path, sizeof(path),
++		       "/sys/class/drm/card%d-%s-%d",
++		       minor,
++		       sysfs_connector_types[sna_output->connector_type],
++		       sna_output->connector_type_id);
++	DBG(("%s: lookup %s\n", __FUNCTION__, path));
++
++	dir = opendir(path);
++	if (dir == NULL)
++		return NULL;
++
++	while ((de = readdir(dir))) {
++		struct stat st;
++
++		if (*de->d_name == '.')
++			continue;
++
++		snprintf(path + len, sizeof(path) - len,
++			 "/%s", de->d_name);
++
++		if (stat(path, &st))
++			continue;
++
++		if (!S_ISDIR(st.st_mode))
++			continue;
++
++		DBG(("%s: testing %s as backlight\n",
++		     __FUNCTION__, de->d_name));
++
++		if (backlight_exists(de->d_name)) {
++			str = strdup(de->d_name); /* leak! */
++			break;
++		}
++	}
++
++	closedir(dir);
++	return str;
++}
++
+ static void
+ sna_output_backlight_init(xf86OutputPtr output)
+ {
+@@ -696,11 +1080,20 @@ sna_output_backlight_init(xf86OutputPtr output)
+ 	return;
+ #endif
+ 
+-	from = X_CONFIG;
+-	best_iface = has_user_backlight_override(output);
++	if (sna_output->is_panel) {
++		from = X_CONFIG;
++		best_iface = has_user_backlight_override(output);
++		if (best_iface)
++			goto done;
++	}
++
++	best_iface = has_connector_backlight(output);
+ 	if (best_iface)
+ 		goto done;
+ 
++	if (!sna_output->is_panel)
++		return;
++
+ 	/* XXX detect right backlight for multi-GPU/panels */
+ 	from = X_PROBED;
+ 	pci = xf86GetPciInfoForEntity(to_sna(output->scrn)->pEnt->index);
+@@ -728,6 +1121,38 @@ done:
+ 		   sna_output->backlight.iface, best_iface, output->name);
+ }
+ 
++#if ABI_VIDEODRV_VERSION >= SET_ABI_VERSION(22, 0)
++static inline int sigio_block(void)
++{
++	return 0;
++}
++static inline void sigio_unblock(int was_blocked)
++{
++	(void)was_blocked;
++}
++#elif XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,12,99,901,0)
++static inline int sigio_block(void)
++{
++	OsBlockSIGIO();
++	return 0;
++}
++static inline void sigio_unblock(int was_blocked)
++{
++	OsReleaseSIGIO();
++	(void)was_blocked;
++}
++#else
++#include <xf86_OSproc.h>
++static inline int sigio_block(void)
++{
++	return xf86BlockSIGIO();
++}
++static inline void sigio_unblock(int was_blocked)
++{
++	xf86UnblockSIGIO(was_blocked);
++}
++#endif
++
+ static char *canonical_kmode_name(const struct drm_mode_modeinfo *kmode)
+ {
+ 	char tmp[32], *buf;
+@@ -781,6 +1206,7 @@ mode_from_kmode(ScrnInfoPtr scrn,
+ 	mode->VTotal = kmode->vtotal;
+ 	mode->VScan = kmode->vscan;
+ 
++	mode->VRefresh = kmode->vrefresh;
+ 	mode->Flags = kmode->flags;
+ 	mode->name = get_kmode_name(kmode);
+ 
+@@ -814,6 +1240,7 @@ mode_to_kmode(struct drm_mode_modeinfo *kmode, DisplayModePtr mode)
+ 	kmode->vtotal = mode->VTotal;
+ 	kmode->vscan = mode->VScan;
+ 
++	kmode->vrefresh = mode->VRefresh;
+ 	kmode->flags = mode->Flags;
+ 	if (mode->name)
+ 		strncpy(kmode->name, mode->name, DRM_DISPLAY_MODE_LEN);
+@@ -824,11 +1251,12 @@ static void
+ sna_crtc_force_outputs_on(xf86CrtcPtr crtc)
+ {
+ 	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(crtc->scrn);
++	/* All attached outputs are valid, so update our timestamps */
++	unsigned now = GetTimeInMillis();
+ 	int i;
+ 
+ 	assert(to_sna_crtc(crtc));
+-	DBG(("%s(pipe=%d), currently? %d\n", __FUNCTION__,
+-	     to_sna_crtc(crtc)->pipe, to_sna_crtc(crtc)->dpms_mode));
++	DBG(("%s(pipe=%d)\n", __FUNCTION__, sna_crtc_pipe(crtc)));
+ 
+ 	/* DPMS handling by the kernel is inconsistent, so after setting a
+ 	 * mode on an output presume that we intend for it to be on, or that
+@@ -843,10 +1271,11 @@ sna_crtc_force_outputs_on(xf86CrtcPtr crtc)
+ 		if (output->crtc != crtc)
+ 			continue;
+ 
+-		output->funcs->dpms(output, DPMSModeOn);
++		__sna_output_dpms(output, DPMSModeOn, false);
++		if (to_sna_output(output)->last_detect)
++			to_sna_output(output)->last_detect = now;
+ 	}
+ 
+-	to_sna_crtc(crtc)->dpms_mode = DPMSModeOn;
+ #if XF86_CRTC_VERSION >= 3
+ 	crtc->active = TRUE;
+ #endif
+@@ -859,8 +1288,7 @@ sna_crtc_force_outputs_off(xf86CrtcPtr crtc)
+ 	int i;
+ 
+ 	assert(to_sna_crtc(crtc));
+-	DBG(("%s(pipe=%d), currently? %d\n", __FUNCTION__,
+-	     to_sna_crtc(crtc)->pipe, to_sna_crtc(crtc)->dpms_mode));
++	DBG(("%s(pipe=%d)\n", __FUNCTION__, sna_crtc_pipe(crtc)));
+ 
+ 	/* DPMS handling by the kernel is inconsistent, so after setting a
+ 	 * mode on an output presume that we intend for it to be on, or that
+@@ -875,35 +1303,47 @@ sna_crtc_force_outputs_off(xf86CrtcPtr crtc)
+ 		if (output->crtc != crtc)
+ 			continue;
+ 
+-		output->funcs->dpms(output, DPMSModeOff);
++		__sna_output_dpms(output, DPMSModeOff, false);
+ 	}
+-
+-	to_sna_crtc(crtc)->dpms_mode = DPMSModeOff;
+ }
+ 
+ static unsigned
+-rotation_reduce(struct plane *p, unsigned rotation)
++rotation_reflect(unsigned rotation)
+ {
+-	unsigned unsupported_rotations = rotation & ~p->rotation.supported;
++	unsigned other_bits;
+ 
+-	if (unsupported_rotations == 0)
+-		return rotation;
++	/* paranoia for future extensions */
++	other_bits = rotation & ~RR_Rotate_All;
+ 
+-#define RR_Reflect_XY (RR_Reflect_X | RR_Reflect_Y)
++	/* flip the reflection to compensate for reflecting the rotation */
++	other_bits ^= RR_Reflect_X | RR_Reflect_Y;
+ 
+-	if ((unsupported_rotations & RR_Reflect_XY) == RR_Reflect_XY &&
+-	    p->rotation.supported& RR_Rotate_180) {
+-		rotation &= ~RR_Reflect_XY;
+-		rotation ^= RR_Rotate_180;
+-	}
++	/* Reflect the screen by rotating the rotation bit,
++	 * which has to have at least RR_Rotate_0 set. This allows
++	 * us to reflect any of the rotation bits, not just 0.
++	 */
++	rotation &= RR_Rotate_All;
++	assert(rotation);
++	rotation <<= 2; /* RR_Rotate_0 -> RR_Rotate_180 etc */
++	rotation |= rotation >> 4; /* RR_Rotate_270' to RR_Rotate_90 */
++	rotation &= RR_Rotate_All;
++	assert(rotation);
+ 
+-	if ((unsupported_rotations & RR_Rotate_180) &&
+-	    (p->rotation.supported& RR_Reflect_XY) == RR_Reflect_XY) {
+-		rotation ^= RR_Reflect_XY;
+-		rotation &= ~RR_Rotate_180;
++	return rotation | other_bits;
++}
++
++static unsigned
++rotation_reduce(struct plane *p, unsigned rotation)
++{
++	/* If unsupported try exchanging rotation for a reflection */
++	if (rotation & ~p->rotation.supported) {
++		unsigned new_rotation = rotation_reflect(rotation);
++		if ((new_rotation & p->rotation.supported) == new_rotation)
++			rotation = new_rotation;
+ 	}
+ 
+-#undef RR_Reflect_XY
++	/* Only one rotation bit should be set */
++	assert(is_power_of_two(rotation & RR_Rotate_All));
+ 
+ 	return rotation;
+ }
+@@ -923,7 +1363,7 @@ rotation_set(struct sna *sna, struct plane *p, uint32_t desired)
+ 	if (desired == p->rotation.current)
+ 		return true;
+ 
+-	if ((desired & p->rotation.supported) == 0) {
++	if ((desired & p->rotation.supported) != desired) {
+ 		errno = EINVAL;
+ 		return false;
+ 	}
+@@ -956,20 +1396,105 @@ rotation_reset(struct plane *p)
+ 	p->rotation.current = 0;
+ }
+ 
+-bool sna_crtc_set_sprite_rotation(xf86CrtcPtr crtc, uint32_t rotation)
++bool sna_crtc_set_sprite_rotation(xf86CrtcPtr crtc,
++				  unsigned idx,
++				  uint32_t rotation)
+ {
++	struct plane *sprite;
+ 	assert(to_sna_crtc(crtc));
++
++	sprite = lookup_sprite(to_sna_crtc(crtc), idx);
++	if (!sprite)
++		return false;
++
+ 	DBG(("%s: CRTC:%d [pipe=%d], sprite=%u set-rotation=%x\n",
+ 	     __FUNCTION__,
+-	     to_sna_crtc(crtc)->id, to_sna_crtc(crtc)->pipe, to_sna_crtc(crtc)->sprite.id,
+-	     rotation));
++	     sna_crtc_id(crtc), sna_crtc_pipe(crtc),
++	     sprite->id, rotation));
+ 
+-	return rotation_set(to_sna(crtc->scrn),
+-			    &to_sna_crtc(crtc)->sprite,
+-			    rotation_reduce(&to_sna_crtc(crtc)->sprite, rotation));
++	return rotation_set(to_sna(crtc->scrn), sprite,
++			    rotation_reduce(sprite, rotation));
+ }
+ 
+-static bool
++#if HAS_DEBUG_FULL
++#if !HAS_DEBUG_FULL
++#define LogF ErrorF
++#endif
++struct kmsg {
++	int fd;
++	int saved_loglevel;
++};
++
++static int kmsg_get_debug(void)
++{
++	FILE *file;
++	int v = -1;
++
++	file = fopen("/sys/module/drm/parameters/debug", "r");
++	if (file) {
++		fscanf(file, "%d", &v);
++		fclose(file);
++	}
++
++	return v;
++}
++
++static void kmsg_set_debug(int v)
++{
++	FILE *file;
++
++	file = fopen("/sys/module/drm/parameters/debug", "w");
++	if (file) {
++		fprintf(file, "%d\n", v);
++		fclose(file);
++	}
++}
++
++static void kmsg_open(struct kmsg *k)
++{
++	k->saved_loglevel = kmsg_get_debug(); /* -1 when the drm debug parameter could not be read */
++	if (k->saved_loglevel != -1)
++		kmsg_set_debug(0xff);
++	/* Seek to the end of the kernel log so only messages emitted from now on are read back. */
++	k->fd = open("/dev/kmsg", O_RDONLY | O_NONBLOCK);
++	if (k->fd != -1)
++		lseek(k->fd, 0, SEEK_END);
++}
++
++/* Log kernel messages emitted since kmsg_open() when dump is set, then restore the drm debug level. */
++static void kmsg_close(struct kmsg *k, int dump)
++{
++	FILE *file = NULL;
++
++	if (k->fd != -1 && dump)
++		file = fdopen(k->fd, "r");
++	if (file) {
++		size_t len = 0;
++		char *line = NULL;
++
++		while (getline(&line, &len, file) != -1) {
++			char *start = strchr(line, ';'); /* only the text after the first ';' is logged */
++			if (start)
++				LogF("KMSG: %s", start + 1);
++		}
++
++		free(line);
++		fclose(file); /* also closes the descriptor adopted by fdopen() */
++		k->fd = -1; /* so it must not be handed to close() again below */
++	}
++	if (k->fd != -1)
++		close(k->fd);
++
++	if (k->saved_loglevel != -1)
++		kmsg_set_debug(k->saved_loglevel);
++}
++#else
++struct kmsg { int unused; };
++static void kmsg_open(struct kmsg *k) {}
++static void kmsg_close(struct kmsg *k, int dump) {}
++#endif
++
++static int
+ sna_crtc_apply(xf86CrtcPtr crtc)
+ {
+ 	struct sna *sna = to_sna(crtc->scrn);
+@@ -978,26 +1503,39 @@ sna_crtc_apply(xf86CrtcPtr crtc)
+ 	struct drm_mode_crtc arg;
+ 	uint32_t output_ids[32];
+ 	int output_count = 0;
+-	int i;
++	int sigio, i;
++	struct kmsg kmsg;
++	int ret = EINVAL;
+ 
+-	DBG(("%s CRTC:%d [pipe=%d], handle=%d\n", __FUNCTION__, sna_crtc->id, sna_crtc->pipe, sna_crtc->bo->handle));
++	DBG(("%s CRTC:%d [pipe=%d], handle=%d\n", __FUNCTION__,
++	     __sna_crtc_id(sna_crtc), __sna_crtc_pipe(sna_crtc),
++	     sna_crtc->bo->handle));
+ 	if (!sna_crtc->kmode.clock) {
+ 		ERR(("%s(CRTC:%d [pipe=%d]): attempted to set an invalid mode\n",
+-		     __FUNCTION__, sna_crtc->id, sna_crtc->pipe));
+-		return false;
++		     __FUNCTION__, __sna_crtc_id(sna_crtc), __sna_crtc_pipe(sna_crtc)));
++		return EINVAL;
+ 	}
+ 
++	kmsg_open(&kmsg);
++	sigio = sigio_block();
++
+ 	assert(sna->mode.num_real_output < ARRAY_SIZE(output_ids));
+ 	sna_crtc_disable_cursor(sna, sna_crtc);
+ 
+ 	if (!rotation_set(sna, &sna_crtc->primary, sna_crtc->rotation)) {
++		memset(&arg, 0, sizeof(arg));
++		arg.crtc_id = __sna_crtc_id(sna_crtc);
++		(void)drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_SETCRTC, &arg);
++	}
++
++	if (!rotation_set(sna, &sna_crtc->primary, sna_crtc->rotation)) {
+ 		ERR(("%s: set-primary-rotation failed (rotation-id=%d, rotation=%d) on CRTC:%d [pipe=%d], errno=%d\n",
+-		     __FUNCTION__, sna_crtc->primary.rotation.prop, sna_crtc->rotation, sna_crtc->id, sna_crtc->pipe, errno));
++		     __FUNCTION__, sna_crtc->primary.rotation.prop, sna_crtc->rotation, __sna_crtc_id(sna_crtc), __sna_crtc_pipe(sna_crtc), errno));
+ 		sna_crtc->primary.rotation.supported &= ~sna_crtc->rotation;
+-		return false;
++		goto unblock;
+ 	}
+ 	DBG(("%s: CRTC:%d [pipe=%d] primary rotation set to %x\n",
+-	     __FUNCTION__, sna_crtc->id, sna_crtc->pipe, sna_crtc->rotation));
++	     __FUNCTION__, __sna_crtc_id(sna_crtc), __sna_crtc_pipe(sna_crtc), sna_crtc->rotation));
+ 
+ 	for (i = 0; i < sna->mode.num_real_output; i++) {
+ 		xf86OutputPtr output = config->output[i];
+@@ -1008,7 +1546,7 @@ sna_crtc_apply(xf86CrtcPtr crtc)
+ 		 * and we lose track of the user settings.
+ 		 */
+ 		if (output->crtc == NULL)
+-			output->funcs->dpms(output, DPMSModeOff);
++			__sna_output_dpms(output, DPMSModeOff, false);
+ 
+ 		if (output->crtc != crtc)
+ 			continue;
+@@ -1022,29 +1560,27 @@ sna_crtc_apply(xf86CrtcPtr crtc)
+ 
+ 		DBG(("%s: attaching output '%s' %d [%d] to crtc:%d (pipe %d) (possible crtc:%x, possible clones:%x)\n",
+ 		     __FUNCTION__, output->name, i, to_connector_id(output),
+-		     sna_crtc->id, sna_crtc->pipe,
++		     __sna_crtc_id(sna_crtc), __sna_crtc_pipe(sna_crtc),
+ 		     (uint32_t)output->possible_crtcs,
+ 		     (uint32_t)output->possible_clones));
+ 
+-		assert(output->possible_crtcs & (1 << sna_crtc->pipe) ||
++		assert(output->possible_crtcs & (1 << __sna_crtc_pipe(sna_crtc)) ||
+ 		       is_zaphod(crtc->scrn));
+ 
+ 		output_ids[output_count] = to_connector_id(output);
+ 		if (++output_count == ARRAY_SIZE(output_ids)) {
+ 			DBG(("%s: too many outputs (%d) for me!\n",
+ 			     __FUNCTION__, output_count));
+-			errno = EINVAL;
+-			return false;
++			goto unblock;
+ 		}
+ 	}
+ 	if (output_count == 0) {
+ 		DBG(("%s: no outputs\n", __FUNCTION__));
+-		errno = EINVAL;
+-		return false;
++		goto unblock;
+ 	}
+ 
+ 	VG_CLEAR(arg);
+-	arg.crtc_id = sna_crtc->id;
++	arg.crtc_id = __sna_crtc_id(sna_crtc);
+ 	arg.fb_id = fb_id(sna_crtc->bo);
+ 	if (sna_crtc->transform || sna_crtc->slave_pixmap) {
+ 		arg.x = 0;
+@@ -1061,7 +1597,7 @@ sna_crtc_apply(xf86CrtcPtr crtc)
+ 	arg.mode_valid = 1;
+ 
+ 	DBG(("%s: applying crtc [%d, pipe=%d] mode=%dx%d+%d+%d@%d, fb=%d%s%s update to %d outputs [%d...]\n",
+-	     __FUNCTION__, sna_crtc->id, sna_crtc->pipe,
++	     __FUNCTION__, __sna_crtc_id(sna_crtc), __sna_crtc_pipe(sna_crtc),
+ 	     arg.mode.hdisplay,
+ 	     arg.mode.vdisplay,
+ 	     arg.x, arg.y,
+@@ -1071,12 +1607,19 @@ sna_crtc_apply(xf86CrtcPtr crtc)
+ 	     sna_crtc->transform ? " [transformed]" : "",
+ 	     output_count, output_count ? output_ids[0] : 0));
+ 
+-	if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_SETCRTC, &arg))
+-		return false;
++	ret = 0;
++	if (unlikely(drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_SETCRTC, &arg))) {
++		ret = errno;
++		goto unblock;
++	}
+ 
+ 	sna_crtc->mode_serial++;
+ 	sna_crtc_force_outputs_on(crtc);
+-	return true;
++
++unblock:
++	sigio_unblock(sigio);
++	kmsg_close(&kmsg, ret);
++	return ret;
+ }
+ 
+ static bool overlap(const BoxRec *a, const BoxRec *b)
+@@ -1094,26 +1637,73 @@ static bool overlap(const BoxRec *a, const BoxRec *b)
+ 	return true;
+ }
+ 
++static void defer_event(struct sna *sna, struct drm_event *base)
++{
++	if (sna->mode.shadow_nevent == sna->mode.shadow_size) {
++		int size = sna->mode.shadow_size * 2; /* NOTE(review): assumes shadow_size starts non-zero — confirm at init */
++		void *ptr;
++		/* The queue is full: double its capacity before appending. */
++		ptr = realloc(sna->mode.shadow_events,
++			      sizeof(struct drm_event_vblank)*size);
++		if (!ptr)
++			return; /* allocation failed: this event is not queued */
++
++		sna->mode.shadow_events = ptr;
++		sna->mode.shadow_size = size;
++	}
++	/* Append a copy of the event (copied as a struct drm_event_vblank) for flush_events(). */
++	memcpy(&sna->mode.shadow_events[sna->mode.shadow_nevent++],
++	       base, sizeof(struct drm_event_vblank));
++	DBG(("%s: deferring event count=%d\n",
++	     __func__, sna->mode.shadow_nevent));
++}
++
++/* Deliver every vblank event queued by defer_event(), then empty the queue. */
++static void flush_events(struct sna *sna)
++{
++	int n;
++
++	if (!sna->mode.shadow_nevent)
++		return;
++
++	DBG(("%s: flushing %d events\n", __func__, sna->mode.shadow_nevent));
++
++	for (n = 0; n < sna->mode.shadow_nevent; n++) {
++		struct drm_event_vblank *vb = &sna->mode.shadow_events[n];
++
++		if ((uintptr_t)(vb->user_data) & 2) /* bit 1 set: dispatch to the Present handler */
++			sna_present_vblank_handler(vb);
++		else
++			sna_dri2_vblank_handler(vb);
++	}
++
++	sna->mode.shadow_nevent = 0;
++}
++
+ static bool wait_for_shadow(struct sna *sna,
+ 			    struct sna_pixmap *priv,
+ 			    unsigned flags)
+ {
+ 	PixmapPtr pixmap = priv->pixmap;
+-	DamagePtr damage;
+ 	struct kgem_bo *bo, *tmp;
+ 	int flip_active;
+ 	bool ret = true;
+ 
+-	DBG(("%s: flags=%x, flips=%d, handle=%d, shadow=%d\n",
+-	     __FUNCTION__, flags, sna->mode.flip_active,
++	DBG(("%s: enabled? %d waiting? %d, flags=%x, flips=%d, pixmap=%ld [front?=%d], handle=%d, shadow=%d\n",
++	     __FUNCTION__, sna->mode.shadow_enabled, sna->mode.shadow_wait,
++	     flags, sna->mode.flip_active,
++	     pixmap->drawable.serialNumber, pixmap == sna->front,
+ 	     priv->gpu_bo->handle, sna->mode.shadow->handle));
+ 
+ 	assert(priv->move_to_gpu_data == sna);
+ 	assert(sna->mode.shadow != priv->gpu_bo);
+ 
+-	if (flags == 0 || pixmap != sna->front || !sna->mode.shadow_damage)
++	if (flags == 0 || pixmap != sna->front || !sna->mode.shadow_enabled)
+ 		goto done;
+ 
++	assert(sna->mode.shadow_damage);
++	assert(!sna->mode.shadow_wait);
++
+ 	if ((flags & MOVE_WRITE) == 0) {
+ 		if ((flags & __MOVE_SCANOUT) == 0) {
+ 			struct sna_crtc *crtc;
+@@ -1154,9 +1744,7 @@ static bool wait_for_shadow(struct sna *sna,
+ 	}
+ 
+ 	assert(sna->mode.shadow_active);
+-
+-	damage = sna->mode.shadow_damage;
+-	sna->mode.shadow_damage = NULL;
++	sna->mode.shadow_wait = true;
+ 
+ 	flip_active = sna->mode.flip_active;
+ 	if (flip_active) {
+@@ -1208,6 +1796,8 @@ static bool wait_for_shadow(struct sna *sna,
+ 			bo = sna->mode.shadow;
+ 		}
+ 	}
++	assert(sna->mode.shadow_wait);
++	sna->mode.shadow_wait = false;
+ 
+ 	if (bo->refcnt > 1) {
+ 		bo = kgem_create_2d(&sna->kgem,
+@@ -1230,8 +1820,6 @@ static bool wait_for_shadow(struct sna *sna,
+ 			bo = sna->mode.shadow;
+ 	}
+ 
+-	sna->mode.shadow_damage = damage;
+-
+ 	RegionSubtract(&sna->mode.shadow_region,
+ 		       &sna->mode.shadow_region,
+ 		       &sna->mode.shadow_cancel);
+@@ -1269,6 +1857,7 @@ static bool wait_for_shadow(struct sna *sna,
+ 			RegionSubtract(&sna->mode.shadow_region, &sna->mode.shadow_region, &region);
+ 		}
+ 
++		crtc->client_bo->active_scanout--;
+ 		kgem_bo_destroy(&sna->kgem, crtc->client_bo);
+ 		crtc->client_bo = NULL;
+ 		list_del(&crtc->shadow_link);
+@@ -1281,12 +1870,13 @@ static bool wait_for_shadow(struct sna *sna,
+ 		     sna->mode.shadow_region.extents.y1,
+ 		     sna->mode.shadow_region.extents.x2,
+ 		     sna->mode.shadow_region.extents.y2));
+-		ret = sna->render.copy_boxes(sna, GXcopy,
+-					     &pixmap->drawable, priv->gpu_bo, 0, 0,
+-					     &pixmap->drawable, bo, 0, 0,
+-					     region_rects(&sna->mode.shadow_region),
+-					     region_num_rects(&sna->mode.shadow_region),
+-					     0);
++		if (!sna->render.copy_boxes(sna, GXcopy,
++					    &pixmap->drawable, priv->gpu_bo, 0, 0,
++					    &pixmap->drawable, bo, 0, 0,
++					    region_rects(&sna->mode.shadow_region),
++					    region_num_rects(&sna->mode.shadow_region),
++					    0))
++			ERR(("%s: copy failed\n", __FUNCTION__));
+ 	}
+ 
+ 	if (priv->cow)
+@@ -1295,11 +1885,13 @@ static bool wait_for_shadow(struct sna *sna,
+ 	sna_pixmap_unmap(pixmap, priv);
+ 
+ 	DBG(("%s: setting front pixmap to handle=%d\n", __FUNCTION__, bo->handle));
++	sna->mode.shadow->active_scanout--;
+ 	tmp = priv->gpu_bo;
+ 	priv->gpu_bo = bo;
+ 	if (bo != sna->mode.shadow)
+ 		kgem_bo_destroy(&sna->kgem, sna->mode.shadow);
+ 	sna->mode.shadow = tmp;
++	sna->mode.shadow->active_scanout++;
+ 
+ 	sna_dri2_pixmap_update_bo(sna, pixmap, bo);
+ 
+@@ -1311,6 +1903,9 @@ done:
+ 	priv->move_to_gpu_data = NULL;
+ 	priv->move_to_gpu = NULL;
+ 
++	assert(!sna->mode.shadow_wait);
++	flush_events(sna);
++
+ 	return ret;
+ }
+ 
+@@ -1358,22 +1953,43 @@ bool sna_pixmap_discard_shadow_damage(struct sna_pixmap *priv,
+ 	return RegionNil(&sna->mode.shadow_region);
+ }
+ 
++static void sna_mode_damage(DamagePtr damage, RegionPtr region, void *closure)
++{
++	struct sna *sna = closure;
++
++	if (sna->mode.rr_active)
++		return;
++
++	/* Throw away the rectangles if the region grows too big */
++	region = DamageRegion(damage); /* the region argument is ignored; work on the damage's own region */
++	if (region->data) {
++		RegionRec dup;
++
++		dup = *region;
++		RegionUninit(&dup); /* releases the rectangle storage through the shallow copy */
++
++		region->data = NULL; /* presumably leaves just the extents box — confirm against the region API */
++	}
++}
++
+ static bool sna_mode_enable_shadow(struct sna *sna)
+ {
+-	ScreenPtr screen = sna->scrn->pScreen;
++	ScreenPtr screen = to_screen_from_sna(sna);
+ 
+ 	DBG(("%s\n", __FUNCTION__));
+ 	assert(sna->mode.shadow == NULL);
+ 	assert(sna->mode.shadow_damage == NULL);
+ 	assert(sna->mode.shadow_active == 0);
++	assert(!sna->mode.shadow_enabled);
+ 
+-	sna->mode.shadow_damage = DamageCreate(NULL, NULL,
+-					       DamageReportNone, TRUE,
+-					       screen, screen);
++	sna->mode.shadow_damage = DamageCreate(sna_mode_damage, NULL,
++					       DamageReportRawRegion,
++					       TRUE, screen, sna);
+ 	if (!sna->mode.shadow_damage)
+ 		return false;
+ 
+ 	DamageRegister(&sna->front->drawable, sna->mode.shadow_damage);
++	sna->mode.shadow_enabled = true;
+ 	return true;
+ }
+ 
+@@ -1381,8 +1997,10 @@ static void sna_mode_disable_shadow(struct sna *sna)
+ {
+ 	struct sna_pixmap *priv;
+ 
+-	if (!sna->mode.shadow_damage)
++	if (!sna->mode.shadow_damage) {
++		assert(!sna->mode.shadow_enabled);
+ 		return;
++	}
+ 
+ 	DBG(("%s\n", __FUNCTION__));
+ 
+@@ -1393,8 +2011,10 @@ static void sna_mode_disable_shadow(struct sna *sna)
+ 	DamageUnregister(&sna->front->drawable, sna->mode.shadow_damage);
+ 	DamageDestroy(sna->mode.shadow_damage);
+ 	sna->mode.shadow_damage = NULL;
++	sna->mode.shadow_enabled = false;
+ 
+ 	if (sna->mode.shadow) {
++		sna->mode.shadow->active_scanout--;
+ 		kgem_bo_destroy(&sna->kgem, sna->mode.shadow);
+ 		sna->mode.shadow = NULL;
+ 	}
+@@ -1413,7 +2033,7 @@ static void sna_crtc_slave_damage(DamagePtr damage, RegionPtr region, void *clos
+ 	     __FUNCTION__,
+ 	     region->extents.x1, region->extents.y1, region->extents.x2, region->extents.y2,
+ 	     region_num_rects(region),
+-	     crtc->pipe, crtc->base->x, crtc->base->y));
++	     __sna_crtc_pipe(crtc), crtc->base->x, crtc->base->y));
+ 
+ 	assert(crtc->slave_damage == damage);
+ 	assert(sna->mode.shadow_damage);
+@@ -1431,7 +2051,7 @@ static bool sna_crtc_enable_shadow(struct sna *sna, struct sna_crtc *crtc)
+ 		return true;
+ 	}
+ 
+-	DBG(("%s: enabling for crtc %d\n", __FUNCTION__, crtc->id));
++	DBG(("%s: enabling for crtc %d\n", __FUNCTION__, __sna_crtc_id(crtc)));
+ 
+ 	if (!sna->mode.shadow_active) {
+ 		if (!sna_mode_enable_shadow(sna))
+@@ -1443,9 +2063,12 @@ static bool sna_crtc_enable_shadow(struct sna *sna, struct sna_crtc *crtc)
+ 	if (crtc->slave_pixmap) {
+ 		assert(crtc->slave_damage == NULL);
+ 
++		DBG(("%s: enabling PRIME slave tracking on CRTC %d [pipe=%d], pixmap=%ld\n",
++		     __FUNCTION__, __sna_crtc_id(crtc), __sna_crtc_pipe(crtc), crtc->slave_pixmap->drawable.serialNumber));
+ 		crtc->slave_damage = DamageCreate(sna_crtc_slave_damage, NULL,
+ 						  DamageReportRawRegion, TRUE,
+-						  sna->scrn->pScreen, crtc);
++						  to_screen_from_sna(sna),
++						  crtc);
+ 		if (crtc->slave_damage == NULL) {
+ 			if (!--sna->mode.shadow_active)
+ 				sna_mode_disable_shadow(sna);
+@@ -1465,6 +2088,9 @@ static void sna_crtc_disable_override(struct sna *sna, struct sna_crtc *crtc)
+ 	if (crtc->client_bo == NULL)
+ 		return;
+ 
++	assert(crtc->client_bo->refcnt >= crtc->client_bo->active_scanout);
++	crtc->client_bo->active_scanout--;
++
+ 	if (!crtc->transform) {
+ 		DrawableRec tmp;
+ 
+@@ -1489,7 +2115,7 @@ static void sna_crtc_disable_shadow(struct sna *sna, struct sna_crtc *crtc)
+ 	if (!crtc->shadow)
+ 		return;
+ 
+-	DBG(("%s: disabling for crtc %d\n", __FUNCTION__, crtc->id));
++	DBG(("%s: disabling for crtc %d\n", __FUNCTION__, __sna_crtc_id(crtc)));
+ 	assert(sna->mode.shadow_active > 0);
+ 
+ 	if (crtc->slave_damage) {
+@@ -1517,14 +2143,24 @@ __sna_crtc_disable(struct sna *sna, struct sna_crtc *sna_crtc)
+ 	sna_crtc_disable_shadow(sna, sna_crtc);
+ 
+ 	if (sna_crtc->bo) {
++		DBG(("%s: releasing handle=%d from scanout, active=%d\n",
++		     __FUNCTION__,sna_crtc->bo->handle, sna_crtc->bo->active_scanout-1));
++		assert(sna_crtc->flags & CRTC_ON);
+ 		assert(sna_crtc->bo->active_scanout);
+ 		assert(sna_crtc->bo->refcnt >= sna_crtc->bo->active_scanout);
+ 		sna_crtc->bo->active_scanout--;
+ 		kgem_bo_destroy(&sna->kgem, sna_crtc->bo);
+ 		sna_crtc->bo = NULL;
++		sna_crtc->flags &= ~CRTC_ON;
+ 
+-		assert(sna->mode.front_active);
+-		sna->mode.front_active--;
++		if (sna->mode.hidden) {
++			sna->mode.hidden--;
++			assert(sna->mode.hidden);
++			assert(sna->mode.front_active == 0);
++		} else {
++			assert(sna->mode.front_active);
++			sna->mode.front_active--;
++		}
+ 		sna->mode.dirty = true;
+ 	}
+ 
+@@ -1532,13 +2168,19 @@ __sna_crtc_disable(struct sna *sna, struct sna_crtc *sna_crtc)
+ 		kgem_bo_destroy(&sna->kgem, sna_crtc->shadow_bo);
+ 		sna_crtc->shadow_bo = NULL;
+ 	}
+-	sna_crtc->transform = false;
++	if (sna_crtc->transform) {
++		assert(sna->mode.rr_active);
++		sna->mode.rr_active--;
++		sna_crtc->transform = false;
++	}
+ 
++	sna_crtc->cursor_transform = false;
++	sna_crtc->hwcursor = true;
+ 	assert(!sna_crtc->shadow);
+ }
+ 
+ static void
+-sna_crtc_disable(xf86CrtcPtr crtc)
++sna_crtc_disable(xf86CrtcPtr crtc, bool force)
+ {
+ 	struct sna *sna = to_sna(crtc->scrn);
+ 	struct sna_crtc *sna_crtc = to_sna_crtc(crtc);
+@@ -1547,14 +2189,16 @@ sna_crtc_disable(xf86CrtcPtr crtc)
+ 	if (sna_crtc == NULL)
+ 		return;
+ 
+-	DBG(("%s: disabling crtc [%d, pipe=%d]\n", __FUNCTION__,
+-	     sna_crtc->id, sna_crtc->pipe));
++	if (!force && sna_crtc->bo == NULL)
++		return;
++
++	DBG(("%s: disabling crtc [%d, pipe=%d], force?=%d\n", __FUNCTION__,
++	     __sna_crtc_id(sna_crtc), __sna_crtc_pipe(sna_crtc), force));
+ 
+ 	sna_crtc_force_outputs_off(crtc);
+-	assert(sna_crtc->dpms_mode == DPMSModeOff);
+ 
+ 	memset(&arg, 0, sizeof(arg));
+-	arg.crtc_id = sna_crtc->id;
++	arg.crtc_id = __sna_crtc_id(sna_crtc);
+ 	(void)drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_SETCRTC, &arg);
+ 
+ 	__sna_crtc_disable(sna, sna_crtc);
+@@ -1574,19 +2218,19 @@ static void update_flush_interval(struct sna *sna)
+ 
+ 		if (!crtc->enabled) {
+ 			DBG(("%s: CRTC:%d (pipe %d) disabled\n",
+-			     __FUNCTION__,i, to_sna_crtc(crtc)->pipe));
++			     __FUNCTION__,i, sna_crtc_pipe(crtc)));
+ 			assert(to_sna_crtc(crtc)->bo == NULL);
+ 			continue;
+ 		}
+ 
+-		if (to_sna_crtc(crtc)->dpms_mode != DPMSModeOn) {
++		if (to_sna_crtc(crtc)->bo == NULL) {
+ 			DBG(("%s: CRTC:%d (pipe %d) turned off\n",
+-			     __FUNCTION__,i, to_sna_crtc(crtc)->pipe));
++			     __FUNCTION__,i, sna_crtc_pipe(crtc)));
+ 			continue;
+ 		}
+ 
+ 		DBG(("%s: CRTC:%d (pipe %d) vrefresh=%f\n",
+-		     __FUNCTION__, i, to_sna_crtc(crtc)->pipe,
++		     __FUNCTION__, i, sna_crtc_pipe(crtc),
+ 		     xf86ModeVRefresh(&crtc->mode)));
+ 		max_vrefresh = max(max_vrefresh, xf86ModeVRefresh(&crtc->mode));
+ 	}
+@@ -1642,7 +2286,7 @@ void sna_copy_fbcon(struct sna *sna)
+ 	int dx, dy;
+ 	int i;
+ 
+-	if (wedged(sna))
++	if (wedged(sna) || isGPU(sna->scrn))
+ 		return;
+ 
+ 	DBG(("%s\n", __FUNCTION__));
+@@ -1662,7 +2306,7 @@ void sna_copy_fbcon(struct sna *sna)
+ 		assert(crtc != NULL);
+ 
+ 		VG_CLEAR(mode);
+-		mode.crtc_id = crtc->id;
++		mode.crtc_id = __sna_crtc_id(crtc);
+ 		if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETCRTC, &mode))
+ 			continue;
+ 		if (!mode.fb_id)
+@@ -1726,7 +2370,7 @@ void sna_copy_fbcon(struct sna *sna)
+ 	kgem_bo_destroy(&sna->kgem, bo);
+ 
+ #if ABI_VIDEODRV_VERSION >= SET_ABI_VERSION(10, 0)
+-	sna->scrn->pScreen->canDoBGNoneRoot = ok;
++	to_screen_from_sna(sna)->canDoBGNoneRoot = ok;
+ #endif
+ }
+ 
+@@ -1736,7 +2380,6 @@ static bool use_shadow(struct sna *sna, xf86CrtcPtr crtc)
+ 	PictTransform crtc_to_fb;
+ 	struct pict_f_transform f_crtc_to_fb, f_fb_to_crtc;
+ 	unsigned pitch_limit;
+-	struct sna_pixmap *priv;
+ 	BoxRec b;
+ 
+ 	assert(sna->scrn->virtualX && sna->scrn->virtualY);
+@@ -1765,27 +2408,31 @@ static bool use_shadow(struct sna *sna, xf86CrtcPtr crtc)
+ 		return true;
+ 	}
+ 
+-	priv = sna_pixmap_force_to_gpu(sna->front, MOVE_READ | __MOVE_SCANOUT);
+-	if (priv == NULL)
+-		return true; /* maybe we can create a bo for the scanout? */
+-
+-	if (sna->kgem.gen == 071)
+-		pitch_limit = priv->gpu_bo->tiling ? 16 * 1024 : 32 * 1024;
+-	else if ((sna->kgem.gen >> 3) > 4)
+-		pitch_limit = 32 * 1024;
+-	else if ((sna->kgem.gen >> 3) == 4)
+-		pitch_limit = priv->gpu_bo->tiling ? 16 * 1024 : 32 * 1024;
+-	else if ((sna->kgem.gen >> 3) == 3)
+-		pitch_limit = priv->gpu_bo->tiling ? 8 * 1024 : 16 * 1024;
+-	else
+-		pitch_limit = 8 * 1024;
+-	DBG(("%s: gpu bo handle=%d tiling=%d pitch=%d, limit=%d\n", __FUNCTION__, priv->gpu_bo->handle, priv->gpu_bo->tiling, priv->gpu_bo->pitch, pitch_limit));
+-	if (priv->gpu_bo->pitch > pitch_limit)
+-		return true;
++	if (!isGPU(sna->scrn)) {
++		struct sna_pixmap *priv;
+ 
+-	if (priv->gpu_bo->tiling && sna->flags & SNA_LINEAR_FB) {
+-		DBG(("%s: gpu bo is tiled, need linear, forcing shadow\n", __FUNCTION__));
+-		return true;
++		priv = sna_pixmap_force_to_gpu(sna->front, MOVE_READ | __MOVE_SCANOUT);
++		if (priv == NULL)
++			return true; /* maybe we can create a bo for the scanout? */
++
++		if (sna->kgem.gen == 071)
++			pitch_limit = priv->gpu_bo->tiling ? 16 * 1024 : 32 * 1024;
++		else if ((sna->kgem.gen >> 3) > 4)
++			pitch_limit = 32 * 1024;
++		else if ((sna->kgem.gen >> 3) == 4)
++			pitch_limit = priv->gpu_bo->tiling ? 16 * 1024 : 32 * 1024;
++		else if ((sna->kgem.gen >> 3) == 3)
++			pitch_limit = priv->gpu_bo->tiling ? 8 * 1024 : 16 * 1024;
++		else
++			pitch_limit = 8 * 1024;
++		DBG(("%s: gpu bo handle=%d tiling=%d pitch=%d, limit=%d\n", __FUNCTION__, priv->gpu_bo->handle, priv->gpu_bo->tiling, priv->gpu_bo->pitch, pitch_limit));
++		if (priv->gpu_bo->pitch > pitch_limit)
++			return true;
++
++		if (priv->gpu_bo->tiling && sna->flags & SNA_LINEAR_FB) {
++			DBG(("%s: gpu bo is tiled, need linear, forcing shadow\n", __FUNCTION__));
++			return true;
++		}
+ 	}
+ 
+ 	transform = NULL;
+@@ -1800,9 +2447,9 @@ static bool use_shadow(struct sna *sna, xf86CrtcPtr crtc)
+ 		bool needs_transform = true;
+ 		unsigned rotation = rotation_reduce(&to_sna_crtc(crtc)->primary, crtc->rotation);
+ 		DBG(("%s: natively supported rotation? rotation=%x & supported=%x == %d\n",
+-		     __FUNCTION__, crtc->rotation, to_sna_crtc(crtc)->primary.rotation.supported,
+-		     !!(crtc->rotation & to_sna_crtc(crtc)->primary.rotation.supported)));
+-		if (to_sna_crtc(crtc)->primary.rotation.supported & rotation)
++		     __FUNCTION__, rotation, to_sna_crtc(crtc)->primary.rotation.supported,
++		     rotation == (rotation & to_sna_crtc(crtc)->primary.rotation.supported)));
++		if ((to_sna_crtc(crtc)->primary.rotation.supported & rotation) == rotation)
+ 			needs_transform = RRTransformCompute(crtc->x, crtc->y,
+ 							     crtc->mode.HDisplay, crtc->mode.VDisplay,
+ 							     RR_Rotate_0, transform,
+@@ -1839,6 +2486,7 @@ static void set_shadow(struct sna *sna, RegionPtr region)
+ 
+ 	assert(priv->gpu_bo);
+ 	assert(sna->mode.shadow);
++	assert(sna->mode.shadow->active_scanout);
+ 
+ 	DBG(("%s: waiting for region %dx[(%d, %d), (%d, %d)], front handle=%d, shadow handle=%d\n",
+ 	     __FUNCTION__,
+@@ -1912,6 +2560,28 @@ get_scanout_bo(struct sna *sna, PixmapPtr pixmap)
+ 	return priv->gpu_bo;
+ }
+ 
++/* Zero the HDisplay x VDisplay area of bo: GPU fill first, CPU clear via a GTT map as fallback. */
++static void shadow_clear(struct sna *sna, PixmapPtr front,
++			 struct kgem_bo *bo, xf86CrtcPtr crtc)
++{
++	bool ok = false;
++	if (!wedged(sna))
++		ok = sna->render.fill_one(sna, front, bo, 0,
++					  0, 0, crtc->mode.HDisplay, crtc->mode.VDisplay,
++					  GXclear);
++	if (!ok) {
++		void *ptr = kgem_bo_map__gtt(&sna->kgem, bo);
++		if (ptr) /* clear VDisplay rows of pitch bytes each, not HDisplay rows */
++			memset(ptr, 0, bo->pitch * crtc->mode.VDisplay);
++	}
++	sna->mode.shadow_dirty = true;
++}
++
++static bool rr_active(xf86CrtcPtr crtc)
++{
++	return crtc->transformPresent || crtc->rotation != RR_Rotate_0;
++}
++
+ static struct kgem_bo *sna_crtc_attach(xf86CrtcPtr crtc)
+ {
+ 	struct sna_crtc *sna_crtc = to_sna_crtc(crtc);
+@@ -1919,10 +2589,15 @@ static struct kgem_bo *sna_crtc_attach(xf86CrtcPtr crtc)
+ 	struct sna *sna = to_sna(scrn);
+ 	struct kgem_bo *bo;
+ 
+-	sna_crtc->transform = false;
++	if (sna_crtc->transform) {
++		assert(sna->mode.rr_active);
++		sna_crtc->transform = false;
++		sna->mode.rr_active--;
++	}
+ 	sna_crtc->rotation = RR_Rotate_0;
+ 
+ 	if (use_shadow(sna, crtc)) {
++		PixmapPtr front;
+ 		unsigned long tiled_limit;
+ 		int tiling;
+ 
+@@ -1949,6 +2624,10 @@ force_shadow:
+ 		}
+ 
+ 		tiling = I915_TILING_X;
++		if (crtc->rotation & (RR_Rotate_90 | RR_Rotate_270) &&
++		    sna->kgem.can_scanout_y)
++			tiling = I915_TILING_Y;
++
+ 		if (sna->kgem.gen == 071)
+ 			tiled_limit = 16 * 1024 * 8;
+ 		else if ((sna->kgem.gen >> 3) > 4)
+@@ -1977,8 +2656,8 @@ force_shadow:
+ 			return NULL;
+ 		}
+ 
+-		if (__sna_pixmap_get_bo(sna->front) && !crtc->transformPresent) {
+-			DrawableRec tmp;
++		front = sna_crtc->slave_pixmap ?: sna->front;
++		if (__sna_pixmap_get_bo(front) && !rr_active(crtc)) {
+ 			BoxRec b;
+ 
+ 			b.x1 = crtc->x;
+@@ -1986,28 +2665,48 @@ force_shadow:
+ 			b.x2 = crtc->x + crtc->mode.HDisplay;
+ 			b.y2 = crtc->y + crtc->mode.VDisplay;
+ 
+-			DBG(("%s: copying onto shadow CRTC: (%d, %d)x(%d, %d), handle=%d\n",
+-			     __FUNCTION__,
+-			     b.x1, b.y1,
+-			     b.x2, b.y2,
+-			     bo->handle));
+-
+-			tmp.width = crtc->mode.HDisplay;
+-			tmp.height = crtc->mode.VDisplay;
+-			tmp.depth = sna->front->drawable.depth;
+-			tmp.bitsPerPixel = sna->front->drawable.bitsPerPixel;
+-
+-			(void)sna->render.copy_boxes(sna, GXcopy,
+-						     &sna->front->drawable, __sna_pixmap_get_bo(sna->front), 0, 0,
+-						     &tmp, bo, -b.x1, -b.y1,
+-						     &b, 1, 0);
+-		}
++			if (b.x1 < 0)
++				b.x1 = 0;
++			if (b.y1 < 0)
++				b.y1 = 0;
++			if (b.x2 > scrn->virtualX)
++				b.x2 = scrn->virtualX;
++			if (b.y2 > scrn->virtualY)
++				b.y2 = scrn->virtualY;
++			if (b.x2 - b.x1 < crtc->mode.HDisplay ||
++			    b.y2 - b.y1 < crtc->mode.VDisplay)
++				shadow_clear(sna, front, bo, crtc);
++
++			if (b.y2 > b.y1 && b.x2 > b.x1) {
++				DrawableRec tmp;
++
++				DBG(("%s: copying onto shadow CRTC: (%d, %d)x(%d, %d) [fb=%dx%d], handle=%d\n",
++				     __FUNCTION__,
++				     b.x1, b.y1,
++				     b.x2-b.x1, b.y2-b.y1,
++				     scrn->virtualX, scrn->virtualY,
++				     bo->handle));
++
++				tmp.width = crtc->mode.HDisplay;
++				tmp.height = crtc->mode.VDisplay;
++				tmp.depth = front->drawable.depth;
++				tmp.bitsPerPixel = front->drawable.bitsPerPixel;
++
++				if (!sna->render.copy_boxes(sna, GXcopy,
++							     &front->drawable, __sna_pixmap_get_bo(front), 0, 0,
++							     &tmp, bo, -crtc->x, -crtc->y,
++							     &b, 1, COPY_LAST))
++					shadow_clear(sna, front, bo, crtc);
++			}
++		} else
++			shadow_clear(sna, front, bo, crtc);
+ 
+ 		sna_crtc->shadow_bo_width = crtc->mode.HDisplay;
+ 		sna_crtc->shadow_bo_height = crtc->mode.VDisplay;
+ 		sna_crtc->shadow_bo = bo;
+ out_shadow:
+ 		sna_crtc->transform = true;
++		sna->mode.rr_active++;
+ 		return kgem_bo_reference(bo);
+ 	} else {
+ 		if (sna_crtc->shadow_bo) {
+@@ -2048,26 +2747,26 @@ out_shadow:
+ 		}
+ 
+ 		if (sna->flags & SNA_TEAR_FREE) {
++			RegionRec region;
++
+ 			assert(sna_crtc->slave_pixmap == NULL);
+ 
+ 			DBG(("%s: enabling TearFree shadow\n", __FUNCTION__));
++			region.extents.x1 = 0;
++			region.extents.y1 = 0;
++			region.extents.x2 = sna->scrn->virtualX;
++			region.extents.y2 = sna->scrn->virtualY;
++			region.data = NULL;
++
+ 			if (!sna_crtc_enable_shadow(sna, sna_crtc)) {
+ 				DBG(("%s: failed to enable crtc shadow\n", __FUNCTION__));
+ 				return NULL;
+ 			}
+ 
+-			if (sna->mode.shadow == NULL && !wedged(sna)) {
+-				RegionRec region;
++			if (sna->mode.shadow == NULL) {
+ 				struct kgem_bo *shadow;
+ 
+ 				DBG(("%s: creating TearFree shadow bo\n", __FUNCTION__));
+-
+-				region.extents.x1 = 0;
+-				region.extents.y1 = 0;
+-				region.extents.x2 = sna->scrn->virtualX;
+-				region.extents.y2 = sna->scrn->virtualY;
+-				region.data = NULL;
+-
+ 				shadow = kgem_create_2d(&sna->kgem,
+ 							region.extents.x2,
+ 							region.extents.y2,
+@@ -2093,9 +2792,12 @@ out_shadow:
+ 					goto force_shadow;
+ 				}
+ 
++				assert(__sna_pixmap_get_bo(sna->front) == NULL ||
++				       __sna_pixmap_get_bo(sna->front)->pitch == shadow->pitch);
+ 				sna->mode.shadow = shadow;
+-				set_shadow(sna, &region);
++				sna->mode.shadow->active_scanout++;
+ 			}
++			set_shadow(sna, &region);
+ 
+ 			sna_crtc_disable_override(sna, sna_crtc);
+ 		} else
+@@ -2107,6 +2809,37 @@ out_shadow:
+ 	}
+ }
+ 
++#define SCALING_EPSILON (1./256) /* tolerance used when comparing transform coefficients below */
++
++static bool
++is_affine(const struct pixman_f_transform *t) /* true when m[2][0] and m[2][1] are both within SCALING_EPSILON of zero */
++{
++	return (fabs(t->m[2][0]) < SCALING_EPSILON &&
++		fabs(t->m[2][1]) < SCALING_EPSILON);
++}
++
++static double determinant(const struct pixman_f_transform *t) /* determinant of the upper-left 2x2 block of t->m */
++{
++	return t->m[0][0]*t->m[1][1] - t->m[1][0]*t->m[0][1];
++}
++
++static bool
++affine_is_pixel_exact(const struct pixman_f_transform *t) /* true for a unit-magnitude determinant with only diagonal or only anti-diagonal 2x2 terms */
++{
++	double det = t->m[2][2] * determinant(t); /* 2x2 determinant scaled by m[2][2] */
++	if (fabs (det * det - 1.0) < SCALING_EPSILON) { /* det^2 ~ 1, i.e. det is close to +1 or -1 */
++		if (fabs(t->m[0][1]) < SCALING_EPSILON &&
++		    fabs(t->m[1][0]) < SCALING_EPSILON) /* off-diagonal terms ~0 */
++			return true;
++
++		if (fabs(t->m[0][0]) < SCALING_EPSILON &&
++		    fabs(t->m[1][1]) < SCALING_EPSILON) /* diagonal terms ~0 */
++			return true;
++	}
++
++	return false;
++}
++
+ static void sna_crtc_randr(xf86CrtcPtr crtc)
+ {
+ 	struct sna_crtc *sna_crtc = to_sna_crtc(crtc);
+@@ -2152,6 +2885,25 @@ static void sna_crtc_randr(xf86CrtcPtr crtc)
+ 	} else
+ 		crtc->transform_in_use = sna_crtc->rotation != RR_Rotate_0;
+ 
++	/* Recompute the cursor after a potential change in transform */
++	if (sna_crtc->cursor) {
++		assert(sna_crtc->cursor->ref > 0);
++		sna_crtc->cursor->ref--;
++		sna_crtc->cursor = NULL;
++	}
++
++	if (needs_transform) {
++		sna_crtc->hwcursor = is_affine(&f_fb_to_crtc);
++		sna_crtc->cursor_transform =
++			sna_crtc->hwcursor &&
++			!affine_is_pixel_exact(&f_fb_to_crtc);
++	} else {
++		sna_crtc->hwcursor = true;
++		sna_crtc->cursor_transform = false;
++	}
++	DBG(("%s: hwcursor?=%d, cursor_transform?=%d\n",
++	     __FUNCTION__, sna_crtc->hwcursor, sna_crtc->cursor_transform));
++
+ 	crtc->crtc_to_framebuffer = crtc_to_fb;
+ 	crtc->f_crtc_to_framebuffer = f_crtc_to_fb;
+ 	crtc->f_framebuffer_to_crtc = f_fb_to_crtc;
+@@ -2184,7 +2936,7 @@ static void sna_crtc_randr(xf86CrtcPtr crtc)
+ static void
+ sna_crtc_damage(xf86CrtcPtr crtc)
+ {
+-	ScreenPtr screen = crtc->scrn->pScreen;
++	ScreenPtr screen = xf86ScrnToScreen(crtc->scrn);
+ 	struct sna *sna = to_sna(crtc->scrn);
+ 	RegionRec region, *damage;
+ 
+@@ -2200,15 +2952,21 @@ sna_crtc_damage(xf86CrtcPtr crtc)
+ 	if (region.extents.y2 > screen->height)
+ 		region.extents.y2 = screen->height;
+ 
++	if (region.extents.x2 <= region.extents.x1 ||
++	    region.extents.y2 <= region.extents.y1) {
++		DBG(("%s: crtc not damaged, all-clipped\n", __FUNCTION__));
++		return;
++	}
++
+ 	DBG(("%s: marking crtc %d as completely damaged (%d, %d), (%d, %d)\n",
+-	     __FUNCTION__, to_sna_crtc(crtc)->id,
++	     __FUNCTION__, sna_crtc_id(crtc),
+ 	     region.extents.x1, region.extents.y1,
+ 	     region.extents.x2, region.extents.y2));
+-	to_sna_crtc(crtc)->client_damage = region;
+ 
+ 	assert(sna->mode.shadow_damage && sna->mode.shadow_active);
+ 	damage = DamageRegion(sna->mode.shadow_damage);
+ 	RegionUnion(damage, damage, &region);
++	to_sna_crtc(crtc)->crtc_damage = region;
+ 
+ 	DBG(("%s: damage now %dx[(%d, %d), (%d, %d)]\n",
+ 	     __FUNCTION__,
+@@ -2260,6 +3018,21 @@ static const char *reflection_to_str(Rotation rotation)
+ 	}
+ }
+ 
++static void reprobe_connectors(xf86CrtcPtr crtc) /* flag every output attached to crtc for reprobe, then rerun mode discovery */
++{
++	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(crtc->scrn);
++	struct sna *sna = to_sna(crtc->scrn);
++	int i;
++
++	for (i = 0; i < sna->mode.num_real_output; i++) { /* only the first num_real_output entries are scanned */
++		xf86OutputPtr output = config->output[i];
++		if (output->crtc == crtc)
++			to_sna_output(output)->reprobe = true;
++	}
++
++	sna_mode_discover(sna, true); /* NOTE(review): meaning of the second argument is not visible here -- confirm */
++}
++
+ static Bool
+ __sna_crtc_set_mode(xf86CrtcPtr crtc)
+ {
+@@ -2268,11 +3041,19 @@ __sna_crtc_set_mode(xf86CrtcPtr crtc)
+ 	struct kgem_bo *saved_bo, *bo;
+ 	uint32_t saved_offset;
+ 	bool saved_transform;
++	bool saved_hwcursor;
++	bool saved_cursor_transform;
++	int ret;
+ 
+-	DBG(("%s\n", __FUNCTION__));
++	DBG(("%s: CRTC=%d, pipe=%d, hidden?=%d\n", __FUNCTION__,
++	     __sna_crtc_id(sna_crtc), __sna_crtc_pipe(sna_crtc), sna->mode.hidden));
++	if (sna->mode.hidden)
++		return TRUE;
+ 
+ 	saved_bo = sna_crtc->bo;
+ 	saved_transform = sna_crtc->transform;
++	saved_cursor_transform = sna_crtc->cursor_transform;
++	saved_hwcursor = sna_crtc->hwcursor;
+ 	saved_offset = sna_crtc->offset;
+ 
+ 	sna_crtc->fallback_shadow = false;
+@@ -2285,26 +3066,31 @@ retry: /* Attach per-crtc pixmap or direct */
+ 	}
+ 
+ 	/* Prevent recursion when enabling outputs during execbuffer */
+-	if (bo->exec && RQ(bo->rq)->bo == NULL)
++	if (bo->exec && RQ(bo->rq)->bo == NULL) {
+ 		_kgem_submit(&sna->kgem);
++		__kgem_bo_clear_dirty(bo);
++	}
+ 
+ 	sna_crtc->bo = bo;
+-	if (!sna_crtc_apply(crtc)) {
+-		int err = errno;
+-
++	ret = sna_crtc_apply(crtc);
++	if (ret) {
+ 		kgem_bo_destroy(&sna->kgem, bo);
+ 
+-		if (!sna_crtc->shadow) {
++		if (!sna_crtc->fallback_shadow) {
+ 			sna_crtc->fallback_shadow = true;
+ 			goto retry;
+ 		}
+ 
+ 		xf86DrvMsg(crtc->scrn->scrnIndex, X_ERROR,
+-			   "failed to set mode: %s [%d]\n", strerror(err), err);
++			   "failed to set mode: %s [%d]\n", strerror(ret), ret);
+ 		goto error;
+ 	}
+ 
++	sna_crtc->flags |= CRTC_ON;
+ 	bo->active_scanout++;
++	DBG(("%s: marking handle=%d as active=%d (removing %d from scanout, active=%d)\n",
++	     __FUNCTION__, bo->handle, bo->active_scanout,
++	     saved_bo ? saved_bo->handle : 0, saved_bo ? saved_bo->active_scanout - 1: -1));
+ 	if (saved_bo) {
+ 		assert(saved_bo->active_scanout);
+ 		assert(saved_bo->refcnt >= saved_bo->active_scanout);
+@@ -2315,17 +3101,34 @@ retry: /* Attach per-crtc pixmap or direct */
+ 	sna_crtc_randr(crtc);
+ 	if (sna_crtc->transform)
+ 		sna_crtc_damage(crtc);
++	if (sna_crtc->cursor &&  /* Reload cursor if RandR maybe changed */
++	    (!sna_crtc->hwcursor ||
++	     saved_cursor_transform || sna_crtc->cursor_transform ||
++	     sna_crtc->cursor->rotation != crtc->rotation))
++		sna_crtc_disable_cursor(sna, sna_crtc);
++
++	assert(!sna->mode.hidden);
+ 	sna->mode.front_active += saved_bo == NULL;
+ 	sna->mode.dirty = true;
+-	DBG(("%s: front_active=%d\n", __FUNCTION__, sna->mode.front_active));
++	DBG(("%s: handle=%d, scanout_active=%d, front_active=%d\n",
++	     __FUNCTION__, bo->handle, bo->active_scanout, sna->mode.front_active));
+ 
+ 	return TRUE;
+ 
+ error:
+ 	sna_crtc->offset = saved_offset;
++	if (sna_crtc->transform) {
++		assert(sna->mode.rr_active);
++		sna->mode.rr_active--;
++	}
++	if (saved_transform)
++		sna->mode.rr_active++;
+ 	sna_crtc->transform = saved_transform;
++	sna_crtc->cursor_transform = saved_cursor_transform;
++	sna_crtc->hwcursor = saved_hwcursor;
+ 	sna_crtc->bo = saved_bo;
+-	sna_mode_discover(sna);
++
++	reprobe_connectors(crtc);
+ 	return FALSE;
+ }
+ 
+@@ -2346,14 +3149,14 @@ sna_crtc_set_mode_major(xf86CrtcPtr crtc, DisplayModePtr mode,
+ 	xf86DrvMsg(crtc->scrn->scrnIndex, X_INFO,
+ 		   "switch to mode %dx%d@%.1f on %s using pipe %d, position (%d, %d), rotation %s, reflection %s\n",
+ 		   mode->HDisplay, mode->VDisplay, xf86ModeVRefresh(mode),
+-		   outputs_for_crtc(crtc, outputs, sizeof(outputs)), sna_crtc->pipe,
++		   outputs_for_crtc(crtc, outputs, sizeof(outputs)), __sna_crtc_pipe(sna_crtc),
+ 		   x, y, rotation_to_str(rotation), reflection_to_str(rotation));
+ 
+ 	assert(mode->HDisplay <= sna->mode.max_crtc_width &&
+ 	       mode->VDisplay <= sna->mode.max_crtc_height);
+ 
+ #if HAS_GAMMA
+-	drmModeCrtcSetGamma(sna->kgem.fd, sna_crtc->id,
++	drmModeCrtcSetGamma(sna->kgem.fd, __sna_crtc_id(sna_crtc),
+ 			    crtc->gamma_size,
+ 			    crtc->gamma_red,
+ 			    crtc->gamma_green,
+@@ -2372,17 +3175,10 @@ sna_crtc_set_mode_major(xf86CrtcPtr crtc, DisplayModePtr mode,
+ static void
+ sna_crtc_dpms(xf86CrtcPtr crtc, int mode)
+ {
+-	struct sna_crtc *priv = to_sna_crtc(crtc);
+-
+ 	DBG(("%s(pipe %d, dpms mode -> %d):= active=%d\n",
+-	     __FUNCTION__, priv->pipe, mode, mode == DPMSModeOn));
+-	if (priv->dpms_mode == mode)
+-		return;
+-
+-	assert(priv);
+-	priv->dpms_mode = mode;
++	     __FUNCTION__, sna_crtc_pipe(crtc), mode, mode == DPMSModeOn));
+ 
+-	if (mode == DPMSModeOn && crtc->enabled && priv->bo == NULL) {
++	if (mode == DPMSModeOn && crtc->enabled) {
+ 		if (__sna_crtc_set_mode(crtc))
+ 			update_flush_interval(to_sna(crtc->scrn));
+ 		else
+@@ -2390,7 +3186,7 @@ sna_crtc_dpms(xf86CrtcPtr crtc, int mode)
+ 	}
+ 
+ 	if (mode != DPMSModeOn)
+-		sna_crtc_disable(crtc);
++		sna_crtc_disable(crtc, false);
+ }
+ 
+ void sna_mode_adjust_frame(struct sna *sna, int x, int y)
+@@ -2426,7 +3222,7 @@ sna_crtc_gamma_set(xf86CrtcPtr crtc,
+ {
+ 	assert(to_sna_crtc(crtc));
+ 	drmModeCrtcSetGamma(to_sna(crtc->scrn)->kgem.fd,
+-			    to_sna_crtc(crtc)->id,
++			    sna_crtc_id(crtc),
+ 			    size, red, green, blue);
+ }
+ 
+@@ -2434,10 +3230,14 @@ static void
+ sna_crtc_destroy(xf86CrtcPtr crtc)
+ {
+ 	struct sna_crtc *sna_crtc = to_sna_crtc(crtc);
++	struct plane *sprite, *sn;
+ 
+ 	if (sna_crtc == NULL)
+ 		return;
+ 
++	list_for_each_entry_safe(sprite, sn, &sna_crtc->sprites, link)
++		free(sprite);
++
+ 	free(sna_crtc);
+ 	crtc->driver_private = NULL;
+ }
+@@ -2455,7 +3255,7 @@ sna_crtc_set_scanout_pixmap(xf86CrtcPtr crtc, PixmapPtr pixmap)
+ 		return TRUE;
+ 
+ 	DBG(("%s: CRTC:%d, pipe=%d setting scanout pixmap=%ld\n",
+-	     __FUNCTION__, sna_crtc->id,  sna_crtc->pipe,
++	     __FUNCTION__, __sna_crtc_id(sna_crtc),  __sna_crtc_pipe(sna_crtc),
+ 	     pixmap ? pixmap->drawable.serialNumber : 0));
+ 
+ 	/* Disable first so that we can unregister the damage tracking */
+@@ -2576,6 +3376,10 @@ static int plane_details(struct sna *sna, struct plane *p)
+ 		}
+ 	}
+ 
++	p->rotation.supported &= DBG_NATIVE_ROTATION;
++	if (!xf86ReturnOptValBool(sna->Options, OPTION_ROTATION, TRUE))
++		p->rotation.supported = RR_Rotate_0;
++
+ 	if (props != (uint32_t *)stack_props)
+ 		free(props);
+ 
+@@ -2583,20 +3387,26 @@ static int plane_details(struct sna *sna, struct plane *p)
+ 	return type;
+ }
+ 
++static void add_sprite_plane(struct sna_crtc *crtc,
++			     struct plane *details) /* heap-copy *details and link the copy onto crtc->sprites */
++{
++	struct plane *sprite = malloc(sizeof(*sprite));
++	if (!sprite)
++		return; /* allocation failure: the plane is silently not recorded */
++
++	memcpy(sprite, details, sizeof(*sprite));
++	list_add(&sprite->link, &crtc->sprites); /* entries are freed by the sprite loop in sna_crtc_destroy() */
++}
++
+ static void
+ sna_crtc_find_planes(struct sna *sna, struct sna_crtc *crtc)
+ {
+ #define LOCAL_IOCTL_SET_CAP	DRM_IOWR(0x0d, struct local_set_cap)
+-#define LOCAL_IOCTL_MODE_GETPLANERESOURCES DRM_IOWR(0xb5, struct local_mode_get_plane_res)
+-#define LOCAL_IOCTL_MODE_GETPLANE DRM_IOWR(0xb6, struct local_mode_get_plane)
+ 	struct local_set_cap {
+ 		uint64_t name;
+ 		uint64_t value;
+ 	} cap;
+-	struct local_mode_get_plane_res {
+-		uint64_t plane_id_ptr;
+-		uint64_t count_planes;
+-	} r;
++	struct local_mode_get_plane_res r;
+ 	uint32_t stack_planes[32];
+ 	uint32_t *planes = stack_planes;
+ 	int i;
+@@ -2629,18 +3439,7 @@ sna_crtc_find_planes(struct sna *sna, struct sna_crtc *crtc)
+ 	VG(VALGRIND_MAKE_MEM_DEFINED(planes, sizeof(uint32_t)*r.count_planes));
+ 
+ 	for (i = 0; i < r.count_planes; i++) {
+-		struct local_mode_get_plane {
+-			uint32_t plane_id;
+-
+-			uint32_t crtc_id;
+-			uint32_t fb_id;
+-
+-			uint32_t possible_crtcs;
+-			uint32_t gamma_size;
+-
+-			uint32_t count_format_types;
+-			uint64_t format_type_ptr;
+-		} p;
++		struct local_mode_get_plane p;
+ 		struct plane details;
+ 
+ 		VG_CLEAR(p);
+@@ -2649,11 +3448,11 @@ sna_crtc_find_planes(struct sna *sna, struct sna_crtc *crtc)
+ 		if (drmIoctl(sna->kgem.fd, LOCAL_IOCTL_MODE_GETPLANE, &p))
+ 			continue;
+ 
+-		if ((p.possible_crtcs & (1 << crtc->pipe)) == 0)
++		if ((p.possible_crtcs & (1 << __sna_crtc_pipe(crtc))) == 0)
+ 			continue;
+ 
+ 		DBG(("%s: plane %d is attached to our pipe=%d\n",
+-		     __FUNCTION__, planes[i], crtc->pipe));
++		     __FUNCTION__, planes[i], __sna_crtc_pipe(crtc)));
+ 
+ 		details.id = p.plane_id;
+ 		details.rotation.prop = 0;
+@@ -2672,8 +3471,7 @@ sna_crtc_find_planes(struct sna *sna, struct sna_crtc *crtc)
+ 			break;
+ 
+ 		case DRM_PLANE_TYPE_OVERLAY:
+-			if (crtc->sprite.id == 0)
+-				crtc->sprite = details;
++			add_sprite_plane(crtc, &details);
+ 			break;
+ 		}
+ 	}
+@@ -2688,7 +3486,6 @@ sna_crtc_init__rotation(struct sna *sna, struct sna_crtc *crtc)
+ 	crtc->rotation = RR_Rotate_0;
+ 	crtc->primary.rotation.supported = RR_Rotate_0;
+ 	crtc->primary.rotation.current = RR_Rotate_0;
+-	crtc->sprite.rotation = crtc->primary.rotation;
+ }
+ 
+ static void
+@@ -2698,55 +3495,55 @@ sna_crtc_init__cursor(struct sna *sna, struct sna_crtc *crtc)
+ 
+ 	VG_CLEAR(arg);
+ 	arg.flags = DRM_MODE_CURSOR_BO;
+-	arg.crtc_id = crtc->id;
++	arg.crtc_id = __sna_crtc_id(crtc);
+ 	arg.width = arg.height = 0;
+ 	arg.handle = 0;
+ 
+ 	(void)drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_CURSOR, &arg);
++	crtc->hwcursor = true;
+ }
+ 
+ static bool
+-sna_crtc_add(ScrnInfoPtr scrn, int id)
++sna_crtc_add(ScrnInfoPtr scrn, unsigned id)
+ {
+ 	struct sna *sna = to_sna(scrn);
+ 	xf86CrtcPtr crtc;
+ 	struct sna_crtc *sna_crtc;
+ 	struct drm_i915_get_pipe_from_crtc_id get_pipe;
+ 
+-	DBG(("%s(%d)\n", __FUNCTION__, id));
++	DBG(("%s(%d): is-zaphod? %d\n", __FUNCTION__, id, is_zaphod(scrn)));
+ 
+ 	sna_crtc = calloc(sizeof(struct sna_crtc), 1);
+ 	if (sna_crtc == NULL)
+ 		return false;
+ 
+ 	sna_crtc->id = id;
+-	sna_crtc->dpms_mode = -1;
+ 
+ 	VG_CLEAR(get_pipe);
+ 	get_pipe.pipe = 0;
+-	get_pipe.crtc_id = sna_crtc->id;
++	get_pipe.crtc_id = id;
+ 	if (drmIoctl(sna->kgem.fd,
+ 		     DRM_IOCTL_I915_GET_PIPE_FROM_CRTC_ID,
+ 		     &get_pipe)) {
+ 		free(sna_crtc);
+ 		return false;
+ 	}
+-	sna_crtc->pipe = get_pipe.pipe;
++	assert((unsigned)get_pipe.pipe < 256);
++	sna_crtc->flags |= get_pipe.pipe << 8;
+ 
+ 	if (is_zaphod(scrn) &&
+-	    scrn->confScreen->device->screen != sna_crtc->pipe) {
++	    (get_zaphod_crtcs(sna) & (1 << get_pipe.pipe)) == 0) {
+ 		free(sna_crtc);
+ 		return true;
+ 	}
+ 
++	list_init(&sna_crtc->sprites);
+ 	sna_crtc_init__rotation(sna, sna_crtc);
+-
+ 	sna_crtc_find_planes(sna, sna_crtc);
+ 
+-	DBG(("%s: CRTC:%d [pipe=%d], primary id=%x: supported-rotations=%x, current-rotation=%x, sprite id=%x: supported-rotations=%x, current-rotation=%x\n",
+-	     __FUNCTION__, sna_crtc->id, sna_crtc->pipe,
+-	     sna_crtc->primary.id, sna_crtc->primary.rotation.supported, sna_crtc->primary.rotation.current,
+-	     sna_crtc->sprite.id, sna_crtc->sprite.rotation.supported, sna_crtc->sprite.rotation.current));
++	DBG(("%s: CRTC:%d [pipe=%d], primary id=%x: supported-rotations=%x, current-rotation=%x\n",
++	     __FUNCTION__, id, get_pipe.pipe,
++	     sna_crtc->primary.id, sna_crtc->primary.rotation.supported, sna_crtc->primary.rotation.current));
+ 
+ 	list_init(&sna_crtc->shadow_link);
+ 
+@@ -2761,7 +3558,7 @@ sna_crtc_add(ScrnInfoPtr scrn, int id)
+ 	crtc->driver_private = sna_crtc;
+ 	sna_crtc->base = crtc;
+ 	DBG(("%s: attached crtc[%d] pipe=%d\n",
+-	     __FUNCTION__, id, sna_crtc->pipe));
++	     __FUNCTION__, id, __sna_crtc_pipe(sna_crtc)));
+ 
+ 	return true;
+ }
+@@ -2798,20 +3595,56 @@ find_property(struct sna *sna, struct sna_output *output, const char *name)
+ 	return -1;
+ }
+ 
++static void update_properties(struct sna *sna, struct sna_output *output) /* re-query the connector, passing the output's existing prop_ids/prop_values arrays */
++{
++	union compat_mode_get_connector compat_conn;
++	struct drm_mode_modeinfo dummy; /* scratch slot for the single mode requested below */
++
++	VG_CLEAR(compat_conn);
++
++	compat_conn.conn.connector_id = output->id;
++	compat_conn.conn.count_props = output->num_props;
++	compat_conn.conn.props_ptr = (uintptr_t)output->prop_ids;
++	compat_conn.conn.prop_values_ptr = (uintptr_t)output->prop_values;
++	compat_conn.conn.count_modes = 1; /* skip detect */
++	compat_conn.conn.modes_ptr = (uintptr_t)&dummy;
++	compat_conn.conn.count_encoders = 0;
++
++	(void)drmIoctl(sna->kgem.fd,
++		       DRM_IOCTL_MODE_GETCONNECTOR,
++		       &compat_conn.conn); /* return value is explicitly discarded */
++
++	assert(compat_conn.conn.count_props == output->num_props); /* the arrays are reused, so the count must not have changed */
++	output->update_properties = false;
++}
++
+ static xf86OutputStatus
+ sna_output_detect(xf86OutputPtr output)
+ {
+ 	struct sna *sna = to_sna(output->scrn);
+ 	struct sna_output *sna_output = output->driver_private;
+ 	union compat_mode_get_connector compat_conn;
++	uint32_t now;
+ 
+ 	DBG(("%s(%s:%d)\n", __FUNCTION__, output->name, sna_output->id));
++	sna_output->update_properties = false;
+ 
+ 	if (!sna_output->id) {
+ 		DBG(("%s(%s) hiding due to lost connection\n", __FUNCTION__, output->name));
+ 		return XF86OutputStatusDisconnected;
+ 	}
+ 
++	/* Cache detections for 15s or hotplug event  */
++	now = GetTimeInMillis();
++	if (sna_output->last_detect != 0 &&
++	    (int32_t)(now - sna_output->last_detect) <= OUTPUT_STATUS_CACHE_MS) {
++		DBG(("%s(%s) reporting cached status (since %dms): %d\n",
++		     __FUNCTION__, output->name, now - sna_output->last_detect,
++		     sna_output->status));
++		sna_output->update_properties = true;
++		return sna_output->status;
++	}
++
+ 	VG_CLEAR(compat_conn);
+ 	compat_conn.conn.connector_id = sna_output->id;
+ 	sna_output->num_modes = compat_conn.conn.count_modes = 0; /* reprobe */
+@@ -2854,15 +3687,23 @@ sna_output_detect(xf86OutputPtr output)
+ 	DBG(("%s(%s): found %d modes, connection status=%d\n",
+ 	     __FUNCTION__, output->name, sna_output->num_modes, compat_conn.conn.connection));
+ 
++	sna_output->reprobe = false;
++	sna_output->last_detect = now;
+ 	switch (compat_conn.conn.connection) {
+ 	case DRM_MODE_CONNECTED:
+-		return XF86OutputStatusConnected;
++		sna_output->status = XF86OutputStatusConnected;
++		output->mm_width = compat_conn.conn.mm_width;
++		output->mm_height = compat_conn.conn.mm_height;
++		break;
+ 	case DRM_MODE_DISCONNECTED:
+-		return XF86OutputStatusDisconnected;
++		sna_output->status = XF86OutputStatusDisconnected;
++		break;
+ 	default:
+ 	case DRM_MODE_UNKNOWNCONNECTION:
+-		return XF86OutputStatusUnknown;
++		sna_output->status = XF86OutputStatusUnknown;
++		break;
+ 	}
++	return sna_output->status;
+ }
+ 
+ static Bool
+@@ -2895,6 +3736,27 @@ sna_output_mode_valid(xf86OutputPtr output, DisplayModePtr mode)
+ 	return MODE_OK;
+ }
+ 
++static void sna_output_set_parsed_edid(xf86OutputPtr output, xf86MonPtr mon) /* attach mon via xf86OutputSetEDID() but keep the mm size already stored on output */
++{
++	unsigned conn_mm_width, conn_mm_height;
++
++	/* We set the output size based on values from the kernel */
++	conn_mm_width = output->mm_width;
++	conn_mm_height = output->mm_height;
++
++	xf86OutputSetEDID(output, mon); /* output->mm_* is compared afterwards, so this call may change it */
++
++	if (output->mm_width != conn_mm_width || output->mm_height != conn_mm_height) {
++		DBG(("%s(%s): kernel and Xorg disagree over physical size: kernel=%dx%dmm, Xorg=%dx%dmm\n",
++		     __FUNCTION__, output->name,
++		     conn_mm_width, conn_mm_height,
++		     output->mm_width, output->mm_height));
++	}
++
++	output->mm_width = conn_mm_width; /* restore the values saved before the EDID was attached */
++	output->mm_height = conn_mm_height;
++}
++
+ static void
+ sna_output_attach_edid(xf86OutputPtr output)
+ {
+@@ -2907,6 +3769,13 @@ sna_output_attach_edid(xf86OutputPtr output)
+ 	if (sna_output->edid_idx == -1)
+ 		return;
+ 
++	/* Always refresh the blob as the kernel may randomly update the
++	 * id even if the contents of the blob doesn't change, and a
++	 * request for the stale id will return nothing.
++	 */
++	if (sna_output->update_properties)
++		update_properties(sna, sna_output);
++
+ 	raw = sna_output->edid_raw;
+ 	blob.length = sna_output->edid_len;
+ 
+@@ -2917,8 +3786,12 @@ sna_output_attach_edid(xf86OutputPtr output)
+ 		old = NULL;
+ 
+ 	blob.blob_id = sna_output->prop_values[sna_output->edid_idx];
+-	DBG(("%s: attaching EDID id=%d, current=%d\n",
+-	     __FUNCTION__, blob.blob_id, sna_output->edid_blob_id));
++	if (!blob.blob_id)
++		goto done;
++
++	DBG(("%s(%s): attaching EDID id=%d, current=%d\n",
++	     __FUNCTION__, output->name,
++	     blob.blob_id, sna_output->edid_blob_id));
+ 	if (blob.blob_id == sna_output->edid_blob_id && 0) { /* sigh */
+ 		if (output->MonInfo) {
+ 			/* XXX the property keeps on disappearing... */
+@@ -2936,26 +3809,45 @@ sna_output_attach_edid(xf86OutputPtr output)
+ 	}
+ 
+ 	blob.data = (uintptr_t)raw;
+-	if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETPROPBLOB, &blob))
+-		goto done;
++	do {
++		while (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETPROPBLOB, &blob)) {
++			update_properties(sna, sna_output);
++			if (blob.blob_id == sna_output->prop_values[sna_output->edid_idx]) {
++				DBG(("%s(%s): failed to read blob, reusing previous\n",
++				     __FUNCTION__, output->name));
++				goto done;
++			}
++			blob.blob_id = sna_output->prop_values[sna_output->edid_idx];
++		}
+ 
+-	DBG(("%s: retrieving blob id=%d, length=%d\n",
+-	     __FUNCTION__, blob.blob_id, blob.length));
++		DBG(("%s(%s): retrieving blob id=%d, length=%d\n",
++		     __FUNCTION__, output->name, blob.blob_id, blob.length));
+ 
+-	if (blob.length > sna_output->edid_len) {
+-		raw = realloc(raw, blob.length);
+-		if (raw == NULL)
++		if (blob.length < 128)
+ 			goto done;
+ 
+-		VG(memset(raw, 0, blob.length));
+-		blob.data = (uintptr_t)raw;
+-		if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETPROPBLOB, &blob))
+-			goto done;
++		if (blob.length > sna_output->edid_len) {
++			raw = realloc(raw, blob.length);
++			if (raw == NULL)
++				goto done;
++
++			VG(memset(raw, 0, blob.length));
++			blob.data = (uintptr_t)raw;
++		}
++	} while (blob.length != sna_output->edid_len &&
++		 drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETPROPBLOB, &blob));
++
++	if (blob.length & 127) {
++		/* Truncated EDID! Make sure no one reads too far */
++		*SECTION(NO_EDID, (uint8_t*)raw) = blob.length/128 - 1;
++		blob.length &= -128;
+ 	}
+ 
+ 	if (old &&
+ 	    blob.length == sna_output->edid_len &&
+ 	    memcmp(old, raw, blob.length) == 0) {
++		DBG(("%s(%s): EDID + MonInfo is unchanged\n",
++		     __FUNCTION__, output->name));
+ 		assert(sna_output->edid_raw == raw);
+ 		sna_output->edid_blob_id = blob.blob_id;
+ 		RRChangeOutputProperty(output->randr_output,
+@@ -2974,31 +3866,186 @@ skip_read:
+ 			mon->flags |= MONITOR_EDID_COMPLETE_RAWDATA;
+ 	}
+ 
+-done:
+-	xf86OutputSetEDID(output, mon);
+-	if (raw) {
+-		sna_output->edid_raw = raw;
+-		sna_output->edid_len = blob.length;
+-		sna_output->edid_blob_id = blob.blob_id;
++done:
++	sna_output_set_parsed_edid(output, mon);
++	if (raw) {
++		sna_output->edid_raw = raw;
++		sna_output->edid_len = blob.length;
++		sna_output->edid_blob_id = blob.blob_id;
++	}
++}
++
++static void
++sna_output_attach_tile(xf86OutputPtr output) /* read the connector's "TILE" blob, parse it and pass the result (or NULL) to xf86OutputSetTile() */
++{
++#if XF86_OUTPUT_VERSION >= 3
++	struct sna *sna = to_sna(output->scrn);
++	struct sna_output *sna_output = output->driver_private;
++	struct drm_mode_get_blob blob;
++	struct xf86CrtcTileInfo tile_info, *set = NULL; /* set stays NULL unless parsing succeeds */
++	char *tile;
++	int id; /* first the property index, later reused as the requested blob size */
++
++	id = find_property(sna, sna_output, "TILE");
++	DBG(("%s: found? TILE=%d\n", __FUNCTION__, id));
++	if (id == -1)
++		goto out;
++
++	if (sna_output->update_properties)
++		update_properties(sna, sna_output);
++
++	VG_CLEAR(blob);
++	blob.blob_id = sna_output->prop_values[id];
++	blob.length = 0; /* first query is made with a zero length; blob.length is read back below */
++	if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETPROPBLOB, &blob))
++		goto out;
++
++	do { /* repeat until the length reported back equals the size that was requested */
++		id = blob.length;
++		tile = alloca(id + 1); /* one extra byte for the terminator written after the loop */
++		blob.data = (uintptr_t)tile;
++		VG(memset(tile, 0, id));
++		DBG(("%s: reading %d bytes for TILE blob\n", __FUNCTION__, id));
++		if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETPROPBLOB, &blob))
++			goto out;
++	} while (id != blob.length);
++
++	tile[blob.length] = '\0'; /* paranoia */
++	DBG(("%s: TILE='%s'\n", __FUNCTION__, tile));
++	if (xf86OutputParseKMSTile(tile, blob.length, &tile_info))
++		set = &tile_info;
++out:
++	xf86OutputSetTile(output, set); /* reached on every path, including the failure gotos */
++#endif
++}
++
++static bool duplicate_mode(DisplayModePtr modes, DisplayModePtr m) /* true if some entry of the modes list compares equal to m via xf86ModesEqual() */
++{
++	if (m == NULL)
++		return false; /* a NULL candidate is never reported as a duplicate */
++
++	while (modes) { /* linear walk along ->next */
++		if (xf86ModesEqual(modes, m))
++			return true;
++
++		modes = modes->next;
++	}
++
++	return false;
++}
++
++static struct pixel_count { /* width/height pairs consulted by default_modes() */
++	int16_t width, height;
++} common_16_9[] = { /* listed in ascending size; default_modes() stops at the first entry not smaller than the preferred mode */
++	{ 640, 360 },
++	{ 720, 405 },
++	{ 864, 486 },
++	{ 960, 540 },
++	{ 1024, 576 },
++	{ 1280, 720 },
++	{ 1366, 768 },
++	{ 1600, 900 },
++	{ 1920, 1080 },
++	{ 2048, 1152 },
++	{ 2560, 1440 },
++	{ 2880, 1620 },
++	{ 3200, 1800 },
++	{ 3840, 2160 },
++	{ 4096, 2304 },
++	{ 5120, 2880 },
++	{ 7680, 4320 },
++	{ 15360, 8640 },
++}, common_16_10[] = { /* same layout, used for the 16:10 branch of default_modes() */
++	{ 1280, 800 },
++	{ 1400, 900 },
++	{ 1680, 1050 },
++	{ 1920, 1200 },
++	{ 2560, 1600 },
++};
++
++static DisplayModePtr
++default_modes(DisplayModePtr preferred) /* server default mode list, plus extra GTF modes derived from preferred when it is non-NULL */
++{
++	DisplayModePtr modes;
++	int n;
++
++#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,6,99,900,0)
++	modes = xf86GetDefaultModes();
++#else
++	modes = xf86GetDefaultModes(0, 0);
++#endif
++
++	/* XXX O(n^2) mode list generation :( */
++
++#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,4,99,901,0)
++	if (preferred) {
++		DisplayModePtr m;
++
++		/* Add a half-resolution mode useful for large panels */
++		m = xf86GTFMode(preferred->HDisplay/2,
++				preferred->VDisplay/2,
++				xf86ModeVRefresh(preferred),
++				FALSE, FALSE);
++		if (!duplicate_mode(modes, m))
++			modes = xf86ModesAdd(modes, m);
++		else
++			free(m); /* already present in the list: drop the new copy */
++
++		if (preferred->VDisplay * 16 > preferred->HDisplay*9 - preferred->HDisplay/32 &&
++		    preferred->VDisplay * 16 < preferred->HDisplay*9 + preferred->HDisplay/32) { /* 16*V within HDisplay/32 of 9*H */
++			DBG(("Adding 16:9 modes -- %d < %d > %d\n",
++			     preferred->HDisplay*9 - preferred->HDisplay/32,
++			     preferred->VDisplay * 16,
++			     preferred->HDisplay*9 + preferred->HDisplay/32));
++			for (n = 0; n < ARRAY_SIZE(common_16_9); n++) {
++				if (preferred->HDisplay <= common_16_9[n].width ||
++				    preferred->VDisplay <= common_16_9[n].height)
++					break; /* only sizes strictly smaller than preferred are added */
++
++				m = xf86GTFMode(common_16_9[n].width,
++						common_16_9[n].height,
++						xf86ModeVRefresh(preferred),
++						FALSE, FALSE);
++				if (!duplicate_mode(modes, m))
++					modes = xf86ModesAdd(modes, m);
++				else
++					free(m);
++			}
++		}
++
++		if (preferred->VDisplay * 16 > preferred->HDisplay*10 - preferred->HDisplay/32 &&
++		    preferred->VDisplay * 16 < preferred->HDisplay*10 + preferred->HDisplay/32) { /* 16*V within HDisplay/32 of 10*H */
++			DBG(("Adding 16:10 modes -- %d < %d > %d\n",
++			     preferred->HDisplay*10 - preferred->HDisplay/32,
++			     preferred->VDisplay * 16,
++			     preferred->HDisplay*10 + preferred->HDisplay/32));
++			for (n = 0; n < ARRAY_SIZE(common_16_10); n++) {
++				if (preferred->HDisplay <= common_16_10[n].width ||
++				    preferred->VDisplay <= common_16_10[n].height)
++					break; /* only sizes strictly smaller than preferred are added */
++
++				m = xf86GTFMode(common_16_10[n].width,
++						common_16_10[n].height,
++						xf86ModeVRefresh(preferred),
++						FALSE, FALSE);
++				if (!duplicate_mode(modes, m))
++					modes = xf86ModesAdd(modes, m);
++				else
++					free(m);
++			}
++		}
+ 	}
+-}
+-
+-static DisplayModePtr
+-default_modes(void)
+-{
+-#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,6,99,900,0)
+-	return xf86GetDefaultModes();
+-#else
+-	return xf86GetDefaultModes(0, 0);
+ #endif
++
++	return modes; /* head of the (possibly extended) list */
+ }
+ 
+ static DisplayModePtr
+-sna_output_panel_edid(xf86OutputPtr output, DisplayModePtr modes)
++sna_output_add_default_modes(xf86OutputPtr output, DisplayModePtr modes)
+ {
+ 	xf86MonPtr mon = output->MonInfo;
+ 	DisplayModePtr i, m, preferred = NULL;
+-	int max_x = 0, max_y = 0;
++	int max_x = 0, max_y = 0, max_clock = 0;
+ 	float max_vrefresh = 0.0;
+ 
+ 	if (mon && GTF_SUPPORTED(mon->features.msc))
+@@ -3009,16 +4056,17 @@ sna_output_panel_edid(xf86OutputPtr output, DisplayModePtr modes)
+ 			preferred = m;
+ 		max_x = max(max_x, m->HDisplay);
+ 		max_y = max(max_y, m->VDisplay);
++		max_clock = max(max_clock, m->Clock);
+ 		max_vrefresh = max(max_vrefresh, xf86ModeVRefresh(m));
+ 	}
+-
+-	max_vrefresh = max(max_vrefresh, 60.0);
+ 	max_vrefresh *= (1 + SYNC_TOLERANCE);
+ 
+-	m = default_modes();
++	m = default_modes(preferred);
+ 	xf86ValidateModesSize(output->scrn, m, max_x, max_y, 0);
+ 
+ 	for (i = m; i; i = i->next) {
++		if (i->Clock > max_clock)
++			i->status = MODE_CLOCK_HIGH;
+ 		if (xf86ModeVRefresh(i) > max_vrefresh)
+ 			i->status = MODE_VSYNC;
+ 		if (preferred &&
+@@ -3034,28 +4082,47 @@ sna_output_panel_edid(xf86OutputPtr output, DisplayModePtr modes)
+ }
+ 
+ static DisplayModePtr
++sna_output_override_edid(xf86OutputPtr output) /* if fake_edid_mon is set, attach it and return the modes derived from it; otherwise NULL */
++{
++	struct sna_output *sna_output = output->driver_private;
++
++	if (sna_output->fake_edid_mon == NULL)
++		return NULL; /* no override present */
++
++	xf86OutputSetEDID(output, sna_output->fake_edid_mon);
++	return xf86DDCGetModes(output->scrn->scrnIndex,
++			       sna_output->fake_edid_mon);
++}
++
++static DisplayModePtr
+ sna_output_get_modes(xf86OutputPtr output)
+ {
+ 	struct sna_output *sna_output = output->driver_private;
+-	DisplayModePtr Modes = NULL, current = NULL;
++	DisplayModePtr Modes, current;
+ 	int i;
+ 
+ 	DBG(("%s(%s:%d)\n", __FUNCTION__, output->name, sna_output->id));
+ 	assert(sna_output->id);
+ 
++	Modes = sna_output_override_edid(output);
++	if (Modes)
++		return Modes;
++
+ 	sna_output_attach_edid(output);
++	sna_output_attach_tile(output);
+ 
+-	if (output->crtc) {
++	current = NULL;
++	if (output->crtc && !sna_output->hotplug_count) {
+ 		struct drm_mode_crtc mode;
+ 
+ 		VG_CLEAR(mode);
+ 		assert(to_sna_crtc(output->crtc));
+-		mode.crtc_id = to_sna_crtc(output->crtc)->id;
++		mode.crtc_id = sna_crtc_id(output->crtc);
+ 
+ 		if (drmIoctl(to_sna(output->scrn)->kgem.fd, DRM_IOCTL_MODE_GETCRTC, &mode) == 0) {
+ 			DBG(("%s: CRTC:%d, pipe=%d: has mode?=%d\n", __FUNCTION__,
+-			     to_sna_crtc(output->crtc)->id,
+-			     to_sna_crtc(output->crtc)->pipe,
++			     sna_crtc_id(output->crtc),
++			     sna_crtc_pipe(output->crtc),
+ 			     mode.mode_valid && mode.mode.clock));
+ 
+ 			if (mode.mode_valid && mode.mode.clock) {
+@@ -3117,7 +4184,7 @@ sna_output_get_modes(xf86OutputPtr output)
+ 	}
+ 
+ 	if (sna_output->add_default_modes)
+-		Modes = sna_output_panel_edid(output, Modes);
++		Modes = sna_output_add_default_modes(output, Modes);
+ 
+ 	return Modes;
+ }
+@@ -3132,6 +4199,8 @@ sna_output_destroy(xf86OutputPtr output)
+ 		return;
+ 
+ 	free(sna_output->edid_raw);
++	free(sna_output->fake_edid_raw);
++
+ 	for (i = 0; i < sna_output->num_props; i++) {
+ 		if (sna_output->props[i].kprop == NULL)
+ 			continue;
+@@ -3155,7 +4224,7 @@ sna_output_destroy(xf86OutputPtr output)
+ }
+ 
+ static void
+-sna_output_dpms(xf86OutputPtr output, int dpms)
++__sna_output_dpms(xf86OutputPtr output, int dpms, int fixup)
+ {
+ 	struct sna *sna = to_sna(output->scrn);
+ 	struct sna_output *sna_output = output->driver_private;
+@@ -3182,8 +4251,9 @@ sna_output_dpms(xf86OutputPtr output, int dpms)
+ 	if (sna_output->backlight.iface && dpms != DPMSModeOn) {
+ 		if (old_dpms == DPMSModeOn) {
+ 			sna_output->backlight_active_level = sna_output_backlight_get(output);
+-			DBG(("%s: saving current backlight %d\n",
+-			     __FUNCTION__, sna_output->backlight_active_level));
++			DBG(("%s(%s:%d): saving current backlight %d\n",
++			     __FUNCTION__, output->name, sna_output->id,
++			     sna_output->backlight_active_level));
+ 		}
+ 		sna_output->dpms_mode = dpms;
+ 		sna_output_backlight_off(sna_output);
+@@ -3193,18 +4263,31 @@ sna_output_dpms(xf86OutputPtr output, int dpms)
+ 	    drmModeConnectorSetProperty(sna->kgem.fd,
+ 					sna_output->id,
+ 					sna_output->dpms_id,
+-					dpms))
+-		dpms = old_dpms;
++					dpms)) {
++		DBG(("%s(%s:%d): failed to set DPMS to %d (fixup? %d)\n",
++		     __FUNCTION__, output->name, sna_output->id, dpms, fixup));
++		if (fixup && dpms != DPMSModeOn) {
++			sna_crtc_disable(output->crtc, false);
++			return;
++		}
++	}
+ 
+ 	if (sna_output->backlight.iface && dpms == DPMSModeOn) {
+-		DBG(("%s: restoring previous backlight %d\n",
+-		     __FUNCTION__, sna_output->backlight_active_level));
++		DBG(("%s(%s:%d): restoring previous backlight %d\n",
++		     __FUNCTION__, output->name, sna_output->id,
++		     sna_output->backlight_active_level));
+ 		sna_output_backlight_on(sna_output);
+ 	}
+ 
+ 	sna_output->dpms_mode = dpms;
+ }
+ 
++static void
++sna_output_dpms(xf86OutputPtr output, int dpms)
++{
++	__sna_output_dpms(output, dpms, true);
++}
++
+ static bool
+ sna_property_ignore(drmModePropertyPtr prop)
+ {
+@@ -3239,14 +4322,14 @@ sna_output_create_ranged_atom(xf86OutputPtr output, Atom *atom,
+ 	err = RRConfigureOutputProperty(output->randr_output, *atom, FALSE,
+ 					TRUE, immutable, 2, atom_range);
+ 	if (err != 0)
+-		xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
++		xf86DrvMsg(output->scrn->scrnIndex, X_WARNING,
+ 			   "RRConfigureOutputProperty error, %d\n", err);
+ 
+ 	err = RRChangeOutputProperty(output->randr_output, *atom, XA_INTEGER,
+ 				     32, PropModeReplace, 1, &value,
+ 				     FALSE, FALSE);
+ 	if (err != 0)
+-		xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
++		xf86DrvMsg(output->scrn->scrnIndex, X_WARNING,
+ 			   "RRChangeOutputProperty error, %d\n", err);
+ }
+ 
+@@ -3303,7 +4386,7 @@ sna_output_create_resources(xf86OutputPtr output)
+ 							p->kprop->flags & DRM_MODE_PROP_IMMUTABLE ? TRUE : FALSE,
+ 							p->num_atoms - 1, (INT32 *)&p->atoms[1]);
+ 			if (err != 0) {
+-				xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
++				xf86DrvMsg(output->scrn->scrnIndex, X_WARNING,
+ 					   "RRConfigureOutputProperty error, %d\n", err);
+ 			}
+ 
+@@ -3315,7 +4398,7 @@ sna_output_create_resources(xf86OutputPtr output)
+ 						     XA_ATOM, 32, PropModeReplace, 1, &p->atoms[j+1],
+ 						     FALSE, FALSE);
+ 			if (err != 0) {
+-				xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
++				xf86DrvMsg(output->scrn->scrnIndex, X_WARNING,
+ 					   "RRChangeOutputProperty error, %d\n", err);
+ 			}
+ 		}
+@@ -3385,18 +4468,19 @@ sna_output_set_property(xf86OutputPtr output, Atom property,
+ 			if (value->type != XA_INTEGER || value->format != 32 ||
+ 			    value->size != 1)
+ 				return FALSE;
+-			val = *(uint32_t *)value->data;
+ 
++			val = *(uint32_t *)value->data;
+ 			drmModeConnectorSetProperty(sna->kgem.fd, sna_output->id,
+ 						    p->kprop->prop_id, (uint64_t)val);
+ 			return TRUE;
+ 		} else if (p->kprop->flags & DRM_MODE_PROP_ENUM) {
+-			Atom	atom;
+-			const char	*name;
+-			int		j;
++			Atom atom;
++			const char *name;
++			int j;
+ 
+ 			if (value->type != XA_ATOM || value->format != 32 || value->size != 1)
+ 				return FALSE;
++
+ 			memcpy(&atom, value->data, 4);
+ 			name = NameForAtom(atom);
+ 			if (name == NULL)
+@@ -3425,7 +4509,7 @@ static Bool
+ sna_output_get_property(xf86OutputPtr output, Atom property)
+ {
+ 	struct sna_output *sna_output = output->driver_private;
+-	int err;
++	int err, i, j;
+ 
+ 	if (property == backlight_atom || property == backlight_deprecated_atom) {
+ 		INT32 val;
+@@ -3449,7 +4533,7 @@ sna_output_get_property(xf86OutputPtr output, Atom property)
+ 					     XA_INTEGER, 32, PropModeReplace, 1, &val,
+ 					     FALSE, FALSE);
+ 		if (err != 0) {
+-			xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
++			xf86DrvMsg(output->scrn->scrnIndex, X_WARNING,
+ 				   "RRChangeOutputProperty error, %d\n", err);
+ 			return FALSE;
+ 		}
+@@ -3457,6 +4541,40 @@ sna_output_get_property(xf86OutputPtr output, Atom property)
+ 		return TRUE;
+ 	}
+ 
++	for (i = 0; i < sna_output->num_props; i++) {
++		struct sna_property *p = &sna_output->props[i];
++
++		if (p->atoms == NULL || p->atoms[0] != property)
++			continue;
++
++		if (sna_output->update_properties && output->scrn->vtSema)
++			update_properties(to_sna(output->scrn), sna_output);
++
++		err = 0;
++		if (p->kprop->flags & DRM_MODE_PROP_RANGE) {
++			err = RRChangeOutputProperty(output->randr_output,
++						     property, XA_INTEGER, 32,
++						     PropModeReplace, 1,
++						     &sna_output->prop_values[i],
++						     FALSE, FALSE);
++		} else if (p->kprop->flags & DRM_MODE_PROP_ENUM) {
++			for (j = 0; j < p->kprop->count_enums; j++) {
++				if (p->kprop->enums[j].value == sna_output->prop_values[i])
++					break;
++			}
++			err = RRChangeOutputProperty(output->randr_output,
++						     property, XA_ATOM, 32,
++						     PropModeReplace, 1,
++						     &p->atoms[j+1],
++						     FALSE, FALSE);
++		}
++
++		if (err != 0)
++			xf86DrvMsg(output->scrn->scrnIndex, X_WARNING,
++				   "RRChangeOutputProperty error, %d\n", err);
++		return TRUE;
++	}
++
+ 	return FALSE;
+ }
+ 
+@@ -3500,47 +4618,11 @@ static const char * const output_names[] = {
+ 	/* DRM_MODE_CONNECTOR_TV */		"TV",
+ 	/* DRM_MODE_CONNECTOR_eDP */		"eDP",
+ 	/* DRM_MODE_CONNECTOR_VIRTUAL */	"Virtual",
+-	/* DRM_MODE_CONNECTOR_DSI */		"DSI"
++	/* DRM_MODE_CONNECTOR_DSI */		"DSI",
++	/* DRM_MODE_CONNECTOR_DPI */		"DPI"
+ };
+ 
+ static bool
+-sna_zaphod_match(const char *s, const char *output)
+-{
+-	char t[20];
+-	unsigned int i = 0;
+-
+-	do {
+-		/* match any outputs in a comma list, stopping at whitespace */
+-		switch (*s) {
+-		case '\0':
+-			t[i] = '\0';
+-			return strcmp(t, output) == 0;
+-
+-		case ',':
+-			t[i] ='\0';
+-			if (strcmp(t, output) == 0)
+-				return TRUE;
+-			i = 0;
+-			break;
+-
+-		case ' ':
+-		case '\t':
+-		case '\n':
+-		case '\r':
+-			break;
+-
+-		default:
+-			t[i++] = *s;
+-			break;
+-		}
+-
+-		s++;
+-	} while (i < sizeof(t));
+-
+-	return false;
+-}
+-
+-static bool
+ output_ignored(ScrnInfoPtr scrn, const char *name)
+ {
+ 	char monitor_name[64];
+@@ -3572,14 +4654,21 @@ gather_encoders(struct sna *sna, uint32_t id, int count,
+ 	struct drm_mode_get_encoder enc;
+ 	uint32_t *ids = NULL;
+ 
++	DBG(("%s(%d): expected count=%d\n", __FUNCTION__, id, count));
++
+ 	VG_CLEAR(compat_conn);
++	VG_CLEAR(enc);
+ 	memset(out, 0, sizeof(*out));
+ 
+ 	do {
+-		free(ids);
+-		ids = malloc(sizeof(*ids) * count);
+-		if (ids == 0)
++		uint32_t *nids;
++
++		nids = realloc(ids, sizeof(*ids) * count);
++		if (nids == NULL) {
++			free(ids);
+ 			return false;
++		}
++		ids = nids;
+ 
+ 		compat_conn.conn.connector_id = id;
+ 		compat_conn.conn.count_props = 0;
+@@ -3593,12 +4682,14 @@ gather_encoders(struct sna *sna, uint32_t id, int count,
+ 			compat_conn.conn.count_encoders = count = 0;
+ 		}
+ 
++		VG(VALGRIND_MAKE_MEM_DEFINED(ids, sizeof(uint32_t)*compat_conn.conn.count_encoders));
+ 		if (count == compat_conn.conn.count_encoders)
+ 			break;
+ 
+ 		count = compat_conn.conn.count_encoders;
+ 	} while (1);
+ 
++	DBG(("%s(%d): gathering %d encoders\n", __FUNCTION__, id, count));
+ 	for (count = 0; count < compat_conn.conn.count_encoders; count++) {
+ 		enc.encoder_id = ids[count];
+ 		if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETENCODER, &enc)) {
+@@ -3606,6 +4697,8 @@ gather_encoders(struct sna *sna, uint32_t id, int count,
+ 			count = 0;
+ 			break;
+ 		}
++		DBG(("%s(%d): encoder=%d, possible_crtcs=%x, possible_clones=%x\n",
++		     __FUNCTION__, id, enc.encoder_id, enc.possible_crtcs, enc.possible_clones));
+ 		out->possible_crtcs |= enc.possible_crtcs;
+ 		out->possible_clones |= enc.possible_clones;
+ 
+@@ -3731,6 +4824,116 @@ static int name_from_path(struct sna *sna,
+ 	return 0;
+ }
+ 
++static char *fake_edid_name(xf86OutputPtr output)
++{
++	struct sna *sna = to_sna(output->scrn);
++	const char *str, *colon;
++
++#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,7,99,901,0)
++	str = xf86GetOptValString(sna->Options, OPTION_EDID);
++#else
++	str = NULL;
++#endif
++	if (str == NULL)
++		return NULL;
++
++	do {
++		colon = strchr(str, ':');
++		if (colon == NULL)
++			return NULL;
++
++		if (strncmp(str, output->name, colon-str) == 0 &&
++		    output->name[colon-str] == '\0') {
++			char *path;
++			int len;
++
++			str = colon + 1;
++			colon = strchr(str, ',');
++			if (colon)
++				len = colon - str;
++			else
++				len = strlen(str);
++
++			path = malloc(len + 1);
++			if (path == NULL)
++				return NULL;
++
++			memcpy(path, str, len);
++			path[len] = '\0';
++			return path;
++		}
++
++		str = strchr(colon + 1, ',');
++		if (str == NULL)
++			return NULL;
++
++		str++;
++	} while (1);
++}
++
++/* Load the EDID override file that fake_edid_name() selects for this output,
++ * if any. The returned path is heap allocated, so release it on every exit.
++ */
++static void
++sna_output_load_fake_edid(xf86OutputPtr output)
++{
++	struct sna_output *sna_output = output->driver_private;
++	char *filename = fake_edid_name(output);
++	void *raw = NULL;
++	xf86MonPtr mon;
++	FILE *file;
++	long size;
++
++	if (filename == NULL)
++		return;
++
++	file = fopen(filename, "rb");
++	if (file == NULL)
++		goto err;
++
++	/* Only accept a non-empty file made of whole 128-byte blocks */
++	size = -1;
++	if (fseek(file, 0, SEEK_END) == 0)
++		size = ftell(file);
++	if (size <= 0 || size % 128) {
++		fclose(file);
++		goto err;
++	}
++
++	raw = malloc(size);
++	if (raw == NULL || fseek(file, 0, SEEK_SET) ||
++	    fread(raw, size, 1, file) != 1) {
++		fclose(file);
++		goto err;
++	}
++	fclose(file);
++
++	mon = xf86InterpretEDID(output->scrn->scrnIndex, raw);
++	if (mon == NULL)
++		goto err;
++
++	if (size > 128)
++		mon->flags |= MONITOR_EDID_COMPLETE_RAWDATA;
++
++	/* raw now belongs to the output; sna_output_destroy() frees it */
++	sna_output->fake_edid_mon = mon;
++	sna_output->fake_edid_raw = raw;
++
++	xf86DrvMsg(output->scrn->scrnIndex, X_CONFIG,
++		   "Loading EDID from \"%s\" for output %s\n",
++		   filename, output->name);
++	free(filename);
++	return;
++
++err:
++	xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
++		   "Could not read EDID file \"%s\" for output %s\n",
++		   filename, output->name);
++	/* raw is still NULL unless the read or the EDID parse failed */
++	free(raw);
++	free(filename);
++}
++
+ static int
+ sna_output_add(struct sna *sna, unsigned id, unsigned serial)
+ {
+@@ -3765,6 +4968,7 @@ sna_output_add(struct sna *sna, unsigned id, unsigned serial)
+ 		return -1;
+ 	}
+ 	assert(compat_conn.conn.connector_id == id);
++	DBG(("%s(%d): has %d associated encoders\n", __FUNCTION__, id, compat_conn.conn.count_encoders));
+ 
+ 	if (compat_conn.conn.connector_type < ARRAY_SIZE(output_names))
+ 		output_name = output_names[compat_conn.conn.connector_type];
+@@ -3813,34 +5017,43 @@ sna_output_add(struct sna *sna, unsigned id, unsigned serial)
+ 	}
+ 
+ 	if (is_zaphod(scrn)) {
+-		const char *str;
++		unsigned zaphod_crtcs;
+ 
+-		str = xf86GetOptValString(sna->Options, OPTION_ZAPHOD);
+-		if (str && !sna_zaphod_match(str, name)) {
+-			DBG(("%s: zaphod mismatch, want %s, have %s\n", __FUNCTION__, str, name));
++		if (!sna_zaphod_match(sna, name)) {
++			DBG(("%s: zaphod mismatch, want %s, have %s\n",
++			     __FUNCTION__,
++			     xf86GetOptValString(sna->Options, OPTION_ZAPHOD) ?: "???",
++			     name));
+ 			return 0;
+ 		}
+ 
+-		if ((possible_crtcs & (1 << scrn->confScreen->device->screen)) == 0) {
+-			if (str) {
+-				xf86DrvMsg(scrn->scrnIndex, X_ERROR,
+-					   "%s is an invalid output for screen (pipe) %d\n",
+-					   name, scrn->confScreen->device->screen);
+-				return -1;
+-			} else
+-				return 0;
++		zaphod_crtcs = get_zaphod_crtcs(sna);
++		possible_crtcs &= zaphod_crtcs;
++		if (possible_crtcs == 0) {
++			xf86DrvMsg(scrn->scrnIndex, X_ERROR,
++				   "%s is an invalid output for screen %d\n",
++				   name, scrn->confScreen->device->screen);
++			return -1;
+ 		}
+ 
+-		possible_crtcs = 1;
++		possible_crtcs >>= ffs(zaphod_crtcs) - 1;
+ 	}
+ 
+ 	sna_output = calloc(sizeof(struct sna_output), 1);
+ 	if (!sna_output)
+ 		return -1;
+ 
++	sna_output->connector_type = compat_conn.conn.connector_type;
++	sna_output->connector_type_id = compat_conn.conn.connector_type_id;
+ 	sna_output->num_props = compat_conn.conn.count_props;
+ 	sna_output->prop_ids = malloc(sizeof(uint32_t)*compat_conn.conn.count_props);
+ 	sna_output->prop_values = malloc(sizeof(uint64_t)*compat_conn.conn.count_props);
++	if (sna_output->prop_ids == NULL || sna_output->prop_values == NULL) {
++		free(sna_output->prop_ids);
++		free(sna_output->prop_values);
++		free(sna_output);
++		return -1;
++	}
+ 
+ 	compat_conn.conn.count_encoders = 0;
+ 
+@@ -3865,16 +5078,16 @@ sna_output_add(struct sna *sna, unsigned id, unsigned serial)
+ 	/* Construct name from topology, and recheck if output is acceptable */
+ 	path = name_from_path(sna, sna_output, name);
+ 	if (path) {
+-		const char *str;
+-
+ 		if (output_ignored(scrn, name)) {
+ 			len = 0;
+ 			goto skip;
+ 		}
+ 
+-		str = xf86GetOptValString(sna->Options, OPTION_ZAPHOD);
+-		if (str && !sna_zaphod_match(str, name)) {
+-			DBG(("%s: zaphod mismatch, want %s, have %s\n", __FUNCTION__, str, name));
++		if (is_zaphod(scrn) && !sna_zaphod_match(sna, name)) {
++			DBG(("%s: zaphod mismatch, want %s, have %s\n",
++			     __FUNCTION__,
++			     xf86GetOptValString(sna->Options, OPTION_ZAPHOD) ?: "???",
++			     name));
+ 			len = 0;
+ 			goto skip;
+ 		}
+@@ -3889,7 +5102,6 @@ sna_output_add(struct sna *sna, unsigned id, unsigned serial)
+ 			if (strcmp(output->name, name) == 0) {
+ 				assert(output->scrn == scrn);
+ 				assert(output->funcs == &sna_output_funcs);
+-				assert(to_sna_output(output)->id == 0);
+ 				sna_output_destroy(output);
+ 				goto reset;
+ 			}
+@@ -3935,6 +5147,8 @@ reset:
+ 	sna_output->id = compat_conn.conn.connector_id;
+ 	sna_output->is_panel = is_panel(compat_conn.conn.connector_type);
+ 	sna_output->edid_idx = find_property(sna, sna_output, "EDID");
++	sna_output->link_status_idx =
++		find_property(sna, sna_output, "link-status");
+ 	if (find_property(sna, sna_output, "scaling mode") != -1)
+ 		sna_output->add_default_modes =
+ 			xf86ReturnOptValBool(output->options, OPTION_DEFAULT_MODES, TRUE);
+@@ -3945,10 +5159,8 @@ reset:
+ 		sna_output->dpms_mode = sna_output->prop_values[i];
+ 		DBG(("%s: found 'DPMS' (idx=%d, id=%d), initial value=%d\n",
+ 		     __FUNCTION__, i, sna_output->dpms_id, sna_output->dpms_mode));
+-	} else {
+-		sna_output->dpms_id = -1;
++	} else
+ 		sna_output->dpms_mode = DPMSModeOff;
+-	}
+ 
+ 	sna_output->possible_encoders = possible_encoders;
+ 	sna_output->attached_encoders = attached_encoders;
+@@ -3963,12 +5175,13 @@ reset:
+ 	sna_output->base = output;
+ 
+ 	backlight_init(&sna_output->backlight);
+-	if (sna_output->is_panel)
+-		sna_output_backlight_init(output);
++	sna_output_backlight_init(output);
+ 
+ 	output->possible_crtcs = possible_crtcs & count_to_mask(sna->mode.num_real_crtc);
+ 	output->interlaceAllowed = TRUE;
+ 
++	sna_output_load_fake_edid(output);
++
+ 	if (serial) {
+ 		if (output->randr_output == NULL) {
+ 			output->randr_output = RROutputCreate(xf86ScrnToScreen(scrn), name, len, output);
+@@ -3976,6 +5189,7 @@ reset:
+ 				goto cleanup;
+ 		}
+ 
++		RROutputChanged(output->randr_output, TRUE);
+ 		sna_output_create_resources(output);
+ 		RRPostPendingProperties(output->randr_output);
+ 
+@@ -4009,38 +5223,6 @@ skip:
+ 	return len;
+ }
+ 
+-static void sna_output_del(xf86OutputPtr output)
+-{
+-	ScrnInfoPtr scrn = output->scrn;
+-	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(scrn);
+-	int i;
+-
+-	DBG(("%s(%s)\n", __FUNCTION__, output->name));
+-	assert(to_sna_output(output));
+-
+-	RROutputDestroy(output->randr_output);
+-	sna_output_destroy(output);
+-
+-	while (output->probed_modes)
+-		xf86DeleteMode(&output->probed_modes, output->probed_modes);
+-
+-	free(output);
+-
+-	for (i = 0; i < config->num_output; i++)
+-		if (config->output[i] == output)
+-			break;
+-	assert(i < to_sna(scrn)->mode.num_real_output);
+-	DBG(("%s: removing output #%d of %d\n",
+-	     __FUNCTION__, i, to_sna(scrn)->mode.num_real_output));
+-
+-	for (; i < config->num_output; i++) {
+-		config->output[i] = config->output[i+1];
+-		config->output[i]->possible_clones >>= 1;
+-	}
+-	config->num_output--;
+-	to_sna(scrn)->mode.num_real_output--;
+-}
+-
+ static int output_rank(const void *A, const void *B)
+ {
+ 	const xf86OutputPtr *a = A;
+@@ -4058,6 +5240,7 @@ static void sort_config_outputs(struct sna *sna)
+ {
+ 	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
+ 	qsort(config->output, sna->mode.num_real_output, sizeof(*config->output), output_rank);
++	config->compat_output = 0; /* make sure it is a sane value */
+ 	sna_mode_compute_possible_outputs(sna);
+ }
+ 
+@@ -4080,11 +5263,15 @@ static bool disable_unused_crtc(struct sna *sna)
+ 	bool update = false;
+ 	int o, c;
+ 
++	DBG(("%s\n", __FUNCTION__));
++
+ 	for (c = 0; c < sna->mode.num_real_crtc; c++) {
+ 		xf86CrtcPtr crtc = config->crtc[c];
+ 
+-		if (!crtc->enabled)
++		if (!crtc->enabled) {
++			sna_crtc_disable(crtc, false);
+ 			continue;
++		}
+ 
+ 		for (o = 0; o < sna->mode.num_real_output; o++) {
+ 			xf86OutputPtr output = config->output[o];
+@@ -4094,7 +5281,7 @@ static bool disable_unused_crtc(struct sna *sna)
+ 
+ 		if (o == sna->mode.num_real_output) {
+ 			DBG(("%s: CRTC:%d was enabled with no outputs\n",
+-			     __FUNCTION__, to_sna_crtc(crtc)->id));
++			     __FUNCTION__, sna_crtc_id(crtc)));
+ 			crtc->enabled = false;
+ 			update = true;
+ 		}
+@@ -4108,17 +5295,145 @@ static bool disable_unused_crtc(struct sna *sna)
+ 	return update;
+ }
+ 
+-void sna_mode_discover(struct sna *sna)
++bool sna_mode_find_hotplug_connector(struct sna *sna, unsigned id)
++{
++	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
++	int i;
++
++	for (i = 0; i < sna->mode.num_real_output; i++) {
++		struct sna_output *output = to_sna_output(config->output[i]);
++		if (output->id == id) {
++			output->reprobe = true;
++			return true;
++		}
++	}
++
++	return false;
++}
++
++static bool
++output_retrain_link(struct sna *sna, struct sna_output *output)
++{
++	struct sna_crtc *crtc = to_sna_crtc(output->base->crtc);
++	int crtc_x = crtc->offset & 0xffff;
++	int crtc_y = crtc->offset >> 16;
++
++	return sna_crtc_flip(sna, crtc, crtc->bo, crtc_x, crtc_y);
++}
++
++/* Check the connector's "link-status" property and, if it is not GOOD, call
++ * output_retrain_link() and re-read it. Returns true when nothing needs doing
++ * (no CRTC or no such property) or when the link reports GOOD afterwards. */
++static bool
++output_check_link(struct sna *sna, struct sna_output *output)
++{
++	uint64_t link_status;
++
++	if (!output->base->crtc || output->link_status_idx == -1)
++		return true;
++
++#define LINK_STATUS_GOOD 0
++	link_status = output->prop_values[output->link_status_idx];
++	DBG(("%s: link_status=%d\n", __FUNCTION__, (int)link_status));
++	if (link_status == LINK_STATUS_GOOD)
++		return true;
++
++	/* Perform a modeset as required for "link-status" = BAD */
++	if (!output_retrain_link(sna, output))
++		return false;
++
++	/* Query the "link-status" again to confirm the modeset */
++	update_properties(sna, output);
++
++	link_status = output->prop_values[output->link_status_idx];
++	DBG(("%s: link_status=%d after modeset\n", __FUNCTION__, (int)link_status));
++	return link_status == LINK_STATUS_GOOD;
++}
++
++static bool
++output_check_status(struct sna *sna, struct sna_output *output)
++{
++	union compat_mode_get_connector compat_conn;
++	struct drm_mode_modeinfo dummy;
++	struct drm_mode_get_blob blob;
++	xf86OutputStatus status;
++	char *edid;
++
++	VG_CLEAR(compat_conn);
++
++	compat_conn.conn.connection = -1;
++	compat_conn.conn.connector_id = output->id;
++	compat_conn.conn.count_modes = 1; /* skip detect */
++	compat_conn.conn.modes_ptr = (uintptr_t)&dummy;
++	compat_conn.conn.count_encoders = 0;
++	compat_conn.conn.props_ptr = (uintptr_t)output->prop_ids;
++	compat_conn.conn.prop_values_ptr = (uintptr_t)output->prop_values;
++	compat_conn.conn.count_props = output->num_props;
++
++	if (drmIoctl(sna->kgem.fd,
++		     DRM_IOCTL_MODE_GETCONNECTOR,
++		     &compat_conn.conn) == 0)
++		output->update_properties = false;
++
++	if (!output_check_link(sna, output))
++		return false;
++
++	if (output->reprobe)
++		return false;
++
++	switch (compat_conn.conn.connection) {
++	case DRM_MODE_CONNECTED:
++		status = XF86OutputStatusConnected;
++		break;
++	case DRM_MODE_DISCONNECTED:
++		status = XF86OutputStatusDisconnected;
++		break;
++	default:
++	case DRM_MODE_UNKNOWNCONNECTION:
++		status = XF86OutputStatusUnknown;
++		break;
++	}
++	if (output->status != status)
++		return false;
++
++	if (status != XF86OutputStatusConnected)
++		return true;
++
++	if (output->num_modes != compat_conn.conn.count_modes)
++		return false;
++
++	if (output->edid_len == 0)
++		return false;
++
++	edid = alloca(output->edid_len);
++
++	VG_CLEAR(blob);
++	blob.blob_id = output->prop_values[output->edid_idx];
++	blob.length = output->edid_len;
++	blob.data = (uintptr_t)edid;
++	if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETPROPBLOB, &blob))
++		return false;
++
++	if (blob.length != output->edid_len)
++		return false;
++
++	return memcmp(edid, output->edid_raw, output->edid_len) == 0;
++}
++
++void sna_mode_discover(struct sna *sna, bool tell)
+ {
+ 	ScreenPtr screen = xf86ScrnToScreen(sna->scrn);
+ 	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
++	bool force = sna->flags & SNA_REPROBE;
+ 	struct drm_mode_card_res res;
+-	uint32_t connectors[32];
++	uint32_t connectors[32], now;
+ 	unsigned changed = 0;
+ 	unsigned serial;
+ 	int i, j;
+ 
+ 	DBG(("%s()\n", __FUNCTION__));
++	sna->flags &= ~SNA_REPROBE;
++
+ 	VG_CLEAR(connectors);
+ 
+ 	memset(&res, 0, sizeof(res));
+@@ -4128,10 +5443,11 @@ void sna_mode_discover(struct sna *sna)
+ 	if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETRESOURCES, &res))
+ 		return;
+ 
+-	DBG(("%s: now %d (was %d) connectors\n", __FUNCTION__,
+-	     res.count_connectors, sna->mode.num_real_output));
++	DBG(("%s: now %d (was %d) connectors, %d encoders, %d crtc\n", __FUNCTION__,
++	     res.count_connectors, sna->mode.num_real_output,
++	     res.count_encoders, res.count_crtcs));
+ 	if (res.count_connectors > 32)
+-		return;
++		res.count_connectors = 32;
+ 
+ 	assert(sna->mode.num_real_crtc == res.count_crtcs || is_zaphod(sna->scrn));
+ 	assert(sna->mode.max_crtc_width  == res.max_width);
+@@ -4142,6 +5458,11 @@ void sna_mode_discover(struct sna *sna)
+ 	if (serial == 0)
+ 		serial = ++sna->mode.serial;
+ 
++	if (force) {
++		changed = 4;
++		now = 0;
++	} else
++		now = GetTimeInMillis();
+ 	for (i = 0; i < res.count_connectors; i++) {
+ 		DBG(("%s: connector[%d] = %d\n", __FUNCTION__, i, connectors[i]));
+ 		for (j = 0; j < sna->mode.num_real_output; j++) {
+@@ -4161,32 +5482,42 @@ void sna_mode_discover(struct sna *sna)
+ 
+ 	for (i = 0; i < sna->mode.num_real_output; i++) {
+ 		xf86OutputPtr output = config->output[i];
++		struct sna_output *sna_output = to_sna_output(output);
+ 
+-		if (to_sna_output(output)->id == 0)
++		if (sna_output->id == 0)
+ 			continue;
+ 
+-		if (to_sna_output(output)->serial == serial)
++		if (sna_output->serial == serial) {
++			if (output_check_status(sna, sna_output)) {
++				DBG(("%s: output %s (id=%d), retained state\n",
++				     __FUNCTION__, output->name, sna_output->id));
++				sna_output->last_detect = now;
++			} else {
++				DBG(("%s: output %s (id=%d), changed state, reprobing\n",
++				     __FUNCTION__, output->name, sna_output->id));
++				sna_output->hotplug_count++;
++				sna_output->last_detect = 0;
++				changed |= 4;
++			}
+ 			continue;
++		}
+ 
+ 		DBG(("%s: removing output %s (id=%d), serial=%u [now %u]\n",
+-		     __FUNCTION__, output->name, to_sna_output(output)->id,
+-		    to_sna_output(output)->serial, serial));
++		     __FUNCTION__, output->name, sna_output->id,
++		    sna_output->serial, serial));
+ 
+ 		xf86DrvMsg(sna->scrn->scrnIndex, X_INFO,
+-			   "%s output %s\n",
+-			   sna->flags & SNA_REMOVE_OUTPUTS ? "Removed" : "Disabled",
++			   "Disabled output %s\n",
+ 			   output->name);
+-		if (sna->flags & SNA_REMOVE_OUTPUTS) {
+-			sna_output_del(output);
+-			i--;
+-		} else {
+-			to_sna_output(output)->id = 0;
+-			output->crtc = NULL;
+-		}
++		sna_output->id = 0;
++		sna_output->last_detect = 0;
++		output->crtc = NULL;
++		RROutputChanged(output->randr_output, TRUE);
+ 		changed |= 2;
+ 	}
+ 
+-	if (changed) {
++	/* Have the list of available outputs been updated? */
++	if (changed & 3) {
+ 		DBG(("%s: outputs changed, broadcasting\n", __FUNCTION__));
+ 
+ 		sna_mode_set_primary(sna);
+@@ -4200,6 +5531,51 @@ void sna_mode_discover(struct sna *sna)
+ 
+ 		xf86RandR12TellChanged(screen);
+ 	}
++
++	/* If anything has changed, refresh the RandR information.
++	 * Note this could recurse once from udevless RRGetInfo() probes,
++	 * but only once.
++	 */
++	if (changed && tell)
++		RRGetInfo(screen, TRUE);
++}
++
++/* Since we only probe the current mode on startup, we may not have the full
++ * list of modes available until the user explicitly requests them. Fake a
++ * hotplug event after a second after starting to fill in any missing modes.
++ */
++static CARD32 sna_mode_coldplug(OsTimerPtr timer, CARD32 now, void *data)
++{
++	struct sna *sna = data;
++	ScreenPtr screen = xf86ScrnToScreen(sna->scrn);
++	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
++	bool reprobe = false;
++	int i;
++
++	DBG(("%s()\n", __FUNCTION__));
++
++	for (i = 0; i < sna->mode.num_real_output; i++) {
++		xf86OutputPtr output = config->output[i];
++		struct sna_output *sna_output = to_sna_output(output);
++
++		if (sna_output->id == 0)
++			continue;
++		if (sna_output->last_detect)
++			continue;
++		if (output->status == XF86OutputStatusDisconnected)
++			continue;
++
++		DBG(("%s: output %s connected, needs reprobe\n",
++		     __FUNCTION__, output->name));
++		reprobe = true;
++	}
++
++	if (reprobe) {
++		RRGetInfo(screen, TRUE);
++		RRTellChanged(screen);
++	}
++	free(timer);
++	return 0;
+ }
+ 
+ static void copy_front(struct sna *sna, PixmapPtr old, PixmapPtr new)
+@@ -4208,7 +5584,7 @@ static void copy_front(struct sna *sna, PixmapPtr old, PixmapPtr new)
+ 
+ 	DBG(("%s\n", __FUNCTION__));
+ 
+-	if (wedged(sna))
++	if (wedged(sna) || isGPU(sna->scrn))
+ 		return;
+ 
+ 	old_priv = sna_pixmap_force_to_gpu(old, MOVE_READ);
+@@ -4220,12 +5596,19 @@ static void copy_front(struct sna *sna, PixmapPtr old, PixmapPtr new)
+ 		return;
+ 
+ 	if (old_priv->clear) {
+-		(void)sna->render.fill_one(sna, new, new_priv->gpu_bo,
+-					   old_priv->clear_color,
+-					   0, 0,
+-					   new->drawable.width,
+-					   new->drawable.height,
+-					   GXcopy);
++		bool ok = false;
++		if (!wedged(sna))
++			ok = sna->render.fill_one(sna, new, new_priv->gpu_bo,
++						  old_priv->clear_color,
++						  0, 0,
++						  new->drawable.width,
++						  new->drawable.height,
++						  GXcopy);
++		if (!ok) {
++			void *ptr = kgem_bo_map__gtt(&sna->kgem, new_priv->gpu_bo);
++			if (ptr)
++				memset(ptr, 0, new_priv->gpu_bo->pitch*new->drawable.height);
++		}
+ 		new_priv->clear = true;
+ 		new_priv->clear_color = old_priv->clear_color;
+ 	} else {
+@@ -4281,11 +5664,18 @@ static void copy_front(struct sna *sna, PixmapPtr old, PixmapPtr new)
+ 			     __FUNCTION__, box.x2, box.y2, sx, sy, dx, dy));
+ 
+ 			if (box.x2 != new->drawable.width || box.y2 != new->drawable.height) {
+-				(void)sna->render.fill_one(sna, new, new_priv->gpu_bo, 0,
+-							   0, 0,
+-							   new->drawable.width,
+-							   new->drawable.height,
+-							   GXclear);
++				bool ok = false;
++				if (!wedged(sna))
++					ok = sna->render.fill_one(sna, new, new_priv->gpu_bo, 0,
++								  0, 0,
++								  new->drawable.width,
++								  new->drawable.height,
++								  GXclear);
++				if (!ok) {
++					void *ptr = kgem_bo_map__gtt(&sna->kgem, new_priv->gpu_bo);
++					if (ptr)
++						memset(ptr, 0, new_priv->gpu_bo->pitch*new->drawable.height);
++				}
+ 			}
+ 			(void)sna->render.copy_boxes(sna, GXcopy,
+ 						     &old->drawable, old_priv->gpu_bo, sx, sy,
+@@ -4302,7 +5692,7 @@ sna_mode_resize(ScrnInfoPtr scrn, int width, int height)
+ {
+ 	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(scrn);
+ 	struct sna *sna = to_sna(scrn);
+-	ScreenPtr screen = scrn->pScreen;
++	ScreenPtr screen = xf86ScrnToScreen(scrn);
+ 	PixmapPtr new_front;
+ 	int i;
+ 
+@@ -4337,9 +5727,20 @@ sna_mode_resize(ScrnInfoPtr scrn, int width, int height)
+ 	for (i = 0; i < sna->mode.num_real_crtc; i++)
+ 		sna_crtc_disable_shadow(sna, to_sna_crtc(config->crtc[i]));
+ 	assert(sna->mode.shadow_active == 0);
++	assert(!sna->mode.shadow_enabled);
+ 	assert(sna->mode.shadow_damage == NULL);
+ 	assert(sna->mode.shadow == NULL);
+ 
++	/* Flush pending shadow updates */
++	if (sna->mode.flip_active) {
++		DBG(("%s: waiting for %d outstanding TearFree flips\n",
++		     __FUNCTION__, sna->mode.flip_active));
++		while (sna->mode.flip_active && sna_mode_wait_for_event(sna))
++			sna_mode_wakeup(sna);
++	}
++
++	/* Cancel a pending [un]flip (as the pixmaps no longer match) */
++	sna_present_cancel_flip(sna);
+ 	copy_front(sna, sna->front, new_front);
+ 
+ 	screen->SetScreenPixmap(new_front);
+@@ -4351,14 +5752,6 @@ sna_mode_resize(ScrnInfoPtr scrn, int width, int height)
+ 	scrn->virtualY = height;
+ 	scrn->displayWidth = width;
+ 
+-	/* Flush pending shadow updates */
+-	if (sna->mode.flip_active) {
+-		DBG(("%s: waiting for %d outstanding TearFree flips\n",
+-		     __FUNCTION__, sna->mode.flip_active));
+-		while (sna->mode.flip_active && sna_mode_wait_for_event(sna))
+-			sna_mode_wakeup(sna);
+-	}
+-
+ 	/* Only update the CRTCs if we are in control */
+ 	if (!scrn->vtSema)
+ 		return TRUE;
+@@ -4371,7 +5764,7 @@ sna_mode_resize(ScrnInfoPtr scrn, int width, int height)
+ 			continue;
+ 
+ 		if (!__sna_crtc_set_mode(crtc))
+-			sna_crtc_disable(crtc);
++			sna_crtc_disable(crtc, false);
+ 	}
+ 
+ 	sna_mode_wakeup(sna);
+@@ -4381,19 +5774,6 @@ sna_mode_resize(ScrnInfoPtr scrn, int width, int height)
+ }
+ 
+ /* cursor handling */
+-struct sna_cursor {
+-	struct sna_cursor *next;
+-	uint32_t *image;
+-	Rotation rotation;
+-	int ref;
+-	int size;
+-	int last_width;
+-	int last_height;
+-	unsigned handle;
+-	unsigned serial;
+-	unsigned alloc;
+-};
+-
+ static void
+ rotate_coord(Rotation rotation, int size,
+ 	     int x_dst, int y_dst,
+@@ -4429,36 +5809,6 @@ rotate_coord(Rotation rotation, int size,
+ 	*y_src = y_dst;
+ }
+ 
+-static void
+-rotate_coord_back(Rotation rotation, int size, int *x, int *y)
+-{
+-	int t;
+-
+-	if (rotation & RR_Reflect_X)
+-		*x = size - *x - 1;
+-	if (rotation & RR_Reflect_Y)
+-		*y = size - *y - 1;
+-
+-	switch (rotation & 0xf) {
+-	case RR_Rotate_0:
+-		break;
+-	case RR_Rotate_90:
+-		t = *x;
+-		*x = *y;
+-		*y = size - t - 1;
+-		break;
+-	case RR_Rotate_180:
+-		*x = size - *x - 1;
+-		*y = size - *y - 1;
+-		break;
+-	case RR_Rotate_270:
+-		t = *x;
+-		*x = size - *y - 1;
+-		*y = t;
+-		break;
+-	}
+-}
+-
+ static struct sna_cursor *__sna_create_cursor(struct sna *sna, int size)
+ {
+ 	struct sna_cursor *c;
+@@ -4519,6 +5869,17 @@ static uint32_t *get_cursor_argb(CursorPtr c)
+ #endif
+ }
+ 
++static int __cursor_size(int width, int height)
++{
++	int i, size;
++
++	i = MAX(width, height);
++	for (size = 64; size < i; size <<= 1)
++		;
++
++	return size;
++}
++
+ static struct sna_cursor *__sna_get_cursor(struct sna *sna, xf86CrtcPtr crtc)
+ {
+ 	struct sna_cursor *cursor;
+@@ -4526,6 +5887,7 @@ static struct sna_cursor *__sna_get_cursor(struct sna *sna, xf86CrtcPtr crtc)
+ 	const uint32_t *argb;
+ 	uint32_t *image;
+ 	int width, height, pitch, size, x, y;
++	bool transformed;
+ 	Rotation rotation;
+ 
+ 	assert(sna->cursor.ref);
+@@ -4537,8 +5899,8 @@ static struct sna_cursor *__sna_get_cursor(struct sna *sna, xf86CrtcPtr crtc)
+ 	       cursor ? cursor->serial : 0,
+ 	       sna->cursor.serial));
+ 	if (cursor && cursor->serial == sna->cursor.serial) {
+-		assert(cursor->size == sna->cursor.size);
+-		assert(cursor->rotation == crtc->transform_in_use ? crtc->rotation : RR_Rotate_0);
++		assert(cursor->size == sna->cursor.size || cursor->transformed);
++		assert(cursor->rotation == (!to_sna_crtc(crtc)->cursor_transform && crtc->transform_in_use) ? crtc->rotation : RR_Rotate_0);
+ 		assert(cursor->ref);
+ 		return cursor;
+ 	}
+@@ -4550,22 +5912,81 @@ static struct sna_cursor *__sna_get_cursor(struct sna *sna, xf86CrtcPtr crtc)
+ 	       sna->cursor.serial,
+ 	       get_cursor_argb(sna->cursor.ref) != NULL));
+ 
+-	rotation = crtc->transform_in_use ? crtc->rotation : RR_Rotate_0;
++	transformed = to_sna_crtc(crtc)->cursor_transform;
++	rotation = (!transformed && crtc->transform_in_use) ? crtc->rotation : RR_Rotate_0;
++
++	if (transformed) {
++		struct pixman_box16 box;
++
++		box.x1 = box.y1 = 0;
++		box.x2 = sna->cursor.ref->bits->width;
++		box.y2 = sna->cursor.ref->bits->height;
+ 
+-	if (sna->cursor.use_gtt) { /* Don't allow phys cursor sharing */
++		pixman_f_transform_bounds(&crtc->f_crtc_to_framebuffer, &box);
++		size = __cursor_size(box.x2 - box.x1, box.y2 - box.y1);
++		__DBG(("%s: transformed cursor %dx%d -> %dx%d\n",
++		       __FUNCTION__ ,
++		       sna->cursor.ref->bits->width,
++		       sna->cursor.ref->bits->height,
++		       box.x2 - box.x1, box.y2 - box.y1));
++	} else
++		size = sna->cursor.size;
++
++	if (crtc->transform_in_use) {
++		RRTransformPtr T = NULL;
++		struct pixman_vector v;
++
++		if (crtc->transformPresent) {
++			T = &crtc->transform;
++
++			/* Cancel any translation from this affine
++			 * transformation. We just want to rotate and scale
++			 * the cursor image.
++			 */
++			v.vector[0] = 0;
++			v.vector[1] = 0;
++			v.vector[2] = pixman_fixed_1;
++			pixman_transform_point(&crtc->transform.transform, &v);
++		}
++
++		RRTransformCompute(0, 0, size, size, crtc->rotation, T, NULL,
++				   &to_sna_crtc(crtc)->cursor_to_fb,
++				   &to_sna_crtc(crtc)->fb_to_cursor);
++		if (T)
++			pixman_f_transform_translate(
++					&to_sna_crtc(crtc)->cursor_to_fb,
++					&to_sna_crtc(crtc)->fb_to_cursor,
++					-pixman_fixed_to_double(v.vector[0]),
++					-pixman_fixed_to_double(v.vector[1]));
++
++		__DBG(("%s: cursor_to_fb [%f %f %f, %f %f %f, %f %f %f]\n",
++		       __FUNCTION__,
++		       to_sna_crtc(crtc)->cursor_to_fb.m[0][0],
++		       to_sna_crtc(crtc)->cursor_to_fb.m[0][1],
++		       to_sna_crtc(crtc)->cursor_to_fb.m[0][2],
++		       to_sna_crtc(crtc)->cursor_to_fb.m[1][0],
++		       to_sna_crtc(crtc)->cursor_to_fb.m[1][1],
++		       to_sna_crtc(crtc)->cursor_to_fb.m[1][2],
++		       to_sna_crtc(crtc)->cursor_to_fb.m[2][0],
++		       to_sna_crtc(crtc)->cursor_to_fb.m[2][1],
++		       to_sna_crtc(crtc)->cursor_to_fb.m[2][2]));
++	}
++
++	/* Don't allow phys cursor sharing */
++	if (sna->cursor.use_gtt && !transformed) {
+ 		for (cursor = sna->cursor.cursors; cursor; cursor = cursor->next) {
+-			if (cursor->serial == sna->cursor.serial && cursor->rotation == rotation) {
++			if (cursor->serial == sna->cursor.serial &&
++			    cursor->rotation == rotation &&
++			    !cursor->transformed) {
+ 				__DBG(("%s: reusing handle=%d, serial=%d, rotation=%d, size=%d\n",
+ 				       __FUNCTION__, cursor->handle, cursor->serial, cursor->rotation, cursor->size));
+ 				assert(cursor->size == sna->cursor.size);
+ 				return cursor;
+ 			}
+ 		}
+-
+-		cursor = to_sna_crtc(crtc)->cursor;
+ 	}
+ 
+-	size = sna->cursor.size;
++	cursor = to_sna_crtc(crtc)->cursor;
+ 	if (cursor && cursor->alloc < 4*size*size)
+ 		cursor = NULL;
+ 
+@@ -4577,7 +5998,7 @@ static struct sna_cursor *__sna_get_cursor(struct sna *sna, xf86CrtcPtr crtc)
+ 		}
+ 	}
+ 
+-	width = sna->cursor.ref->bits->width;
++	width  = sna->cursor.ref->bits->width;
+ 	height = sna->cursor.ref->bits->height;
+ 	source = sna->cursor.ref->bits->source;
+ 	mask = sna->cursor.ref->bits->mask;
+@@ -4585,7 +6006,7 @@ static struct sna_cursor *__sna_get_cursor(struct sna *sna, xf86CrtcPtr crtc)
+ 	pitch = BitmapBytePad(width);
+ 
+ 	image = cursor->image;
+-	if (image == NULL) {
++	if (image == NULL || transformed) {
+ 		image = sna->cursor.scratch;
+ 		cursor->last_width = cursor->last_height = size;
+ 	}
+@@ -4616,6 +6037,21 @@ static struct sna_cursor *__sna_get_cursor(struct sna *sna, xf86CrtcPtr crtc)
+ 				mask += pitch;
+ 				source += pitch;
+ 			}
++			if (transformed) {
++				__DBG(("%s: Applying affine BLT to bitmap\n", __FUNCTION__));
++				affine_blt(image, cursor->image, 32,
++					   0, 0, width, height, size * 4,
++					   0, 0, size, size, size * 4,
++					   &to_sna_crtc(crtc)->cursor_to_fb);
++				image = cursor->image;
++			}
++		} else if (transformed) {
++			__DBG(("%s: Applying affine BLT to ARGB\n", __FUNCTION__));
++			affine_blt(argb, cursor->image, 32,
++				   0, 0, width, height, width * 4,
++				   0, 0, size, size, size * 4,
++				   &to_sna_crtc(crtc)->cursor_to_fb);
++			image = cursor->image;
+ 		} else
+ 			memcpy_blt(argb, image, 32,
+ 				   width * 4, size * 4,
+@@ -4662,9 +6098,16 @@ static struct sna_cursor *__sna_get_cursor(struct sna *sna, xf86CrtcPtr crtc)
+ 
+ 	cursor->size = size;
+ 	cursor->rotation = rotation;
++	cursor->transformed = transformed;
+ 	cursor->serial = sna->cursor.serial;
+-	cursor->last_width = width;
+-	cursor->last_height = height;
++	if (transformed) {
++		/* mark the transformed rectangle as dirty, not input */
++		cursor->last_width = size;
++		cursor->last_height = size;
++	} else {
++		cursor->last_width = width;
++		cursor->last_height = height;
++	}
+ 	return cursor;
+ }
+ 
+@@ -4674,40 +6117,55 @@ sna_realize_cursor(xf86CursorInfoPtr info, CursorPtr cursor)
+ 	return NULL;
+ }
+ 
+-#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,12,99,901,0)
+-static inline int sigio_block(void)
+-{
+-	OsBlockSIGIO();
+-	return 0;
+-}
+-static inline void sigio_unblock(int was_blocked)
++static void enable_fb_access(ScrnInfoPtr scrn, int state)
+ {
+-	OsReleaseSIGIO();
+-	(void)was_blocked;
+-}
++	scrn->EnableDisableFBAccess(
++#ifdef XF86_HAS_SCRN_CONV
++				    scrn,
+ #else
+-#include <xf86_OSproc.h>
+-static inline int sigio_block(void)
++				    scrn->scrnIndex,
++#endif
++				    state);
++}
++
++
++static void __restore_swcursor(ScrnInfoPtr scrn)
+ {
+-	return xf86BlockSIGIO();
++	DBG(("%s: attempting to restore SW cursor\n", __FUNCTION__));
++	enable_fb_access(scrn, FALSE);
++	enable_fb_access(scrn, TRUE);
++
++	RemoveBlockAndWakeupHandlers((void *)__restore_swcursor,
++				     (void *)NoopDDA,
++				     scrn);
+ }
+-static inline void sigio_unblock(int was_blocked)
++
++static void restore_swcursor(struct sna *sna)
+ {
+-	xf86UnblockSIGIO(was_blocked);
++	sna->cursor.info->HideCursor(sna->scrn);
++
++	/* XXX Force the cursor to be restored (avoiding recursion) */
++	FreeCursor(sna->cursor.ref, None);
++	sna->cursor.ref = NULL;
++
++	RegisterBlockAndWakeupHandlers((void *)__restore_swcursor,
++				       (void *)NoopDDA,
++				       sna->scrn);
+ }
+-#endif
+ 
+ static void
+ sna_show_cursors(ScrnInfoPtr scrn)
+ {
+ 	xf86CrtcConfigPtr xf86_config = XF86_CRTC_CONFIG_PTR(scrn);
+ 	struct sna *sna = to_sna(scrn);
++	struct kmsg kmsg;
+ 	int sigio, c;
+ 
+ 	DBG(("%s: cursor?=%d\n", __FUNCTION__, sna->cursor.ref != NULL));
+ 	if (sna->cursor.ref == NULL)
+ 		return;
+ 
++	kmsg_open(&kmsg);
+ 	sigio = sigio_block();
+ 	for (c = 0; c < sna->mode.num_real_crtc; c++) {
+ 		xf86CrtcPtr crtc = xf86_config->crtc[c];
+@@ -4721,7 +6179,7 @@ sna_show_cursors(ScrnInfoPtr scrn)
+ 
+ 		if (!crtc->cursor_in_range) {
+ 			DBG(("%s: skipping cursor outside CRTC (pipe=%d)\n",
+-			     __FUNCTION__, sna_crtc->pipe));
++			     __FUNCTION__, sna_crtc_pipe(crtc)));
+ 			continue;
+ 		}
+ 
+@@ -4729,20 +6187,21 @@ sna_show_cursors(ScrnInfoPtr scrn)
+ 		if (cursor == NULL ||
+ 		    (sna_crtc->cursor == cursor && sna_crtc->last_cursor_size == cursor->size)) {
+ 			DBG(("%s: skipping cursor already show on CRTC (pipe=%d)\n",
+-			     __FUNCTION__, sna_crtc->pipe));
++			     __FUNCTION__, sna_crtc_pipe(crtc)));
+ 			continue;
+ 		}
+ 
+ 		DBG(("%s: CRTC pipe=%d, handle->%d\n", __FUNCTION__,
+-		     sna_crtc->pipe, cursor->handle));
++		     sna_crtc_pipe(crtc), cursor->handle));
+ 
+ 		VG_CLEAR(arg);
+ 		arg.flags = DRM_MODE_CURSOR_BO;
+-		arg.crtc_id = sna_crtc->id;
++		arg.crtc_id = __sna_crtc_id(sna_crtc);
+ 		arg.width = arg.height = cursor->size;
+ 		arg.handle = cursor->handle;
+ 
+-		if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_CURSOR, &arg) == 0) {
++		if (!FAIL_CURSOR_IOCTL &&
++		    drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_CURSOR, &arg) == 0) {
+ 			if (sna_crtc->cursor) {
+ 				assert(sna_crtc->cursor->ref > 0);
+ 				sna_crtc->cursor->ref--;
+@@ -4750,10 +6209,18 @@ sna_show_cursors(ScrnInfoPtr scrn)
+ 			cursor->ref++;
+ 			sna_crtc->cursor = cursor;
+ 			sna_crtc->last_cursor_size = cursor->size;
++		} else {
++			ERR(("%s: failed to show cursor on CRTC:%d [pipe=%d], disabling hwcursor: errno=%d\n",
++			     __FUNCTION__, sna_crtc_id(crtc), sna_crtc_pipe(crtc), errno));
++			sna->cursor.disable = true;
+ 		}
+ 	}
+ 	sigio_unblock(sigio);
+ 	sna->cursor.active = true;
++	kmsg_close(&kmsg, sna->cursor.disable);
++
++	if (unlikely(sna->cursor.disable))
++		restore_swcursor(sna);
+ }
+ 
+ static void
+@@ -4789,24 +6256,45 @@ static void
+ sna_crtc_disable_cursor(struct sna *sna, struct sna_crtc *crtc)
+ {
+ 	struct drm_mode_cursor arg;
++	int sigio;
+ 
+ 	if (!crtc->cursor)
+ 		return;
+ 
+-	DBG(("%s: CRTC:%d, handle=%d\n", __FUNCTION__, crtc->id, crtc->cursor->handle));
+-	assert(crtc->cursor->ref);
++	sigio = sigio_block();
++	if (crtc->cursor) {
++		DBG(("%s: CRTC:%d, handle=%d\n", __FUNCTION__, __sna_crtc_id(crtc), crtc->cursor->handle));
++		assert(crtc->cursor->ref > 0);
++		crtc->cursor->ref--;
++		crtc->cursor = NULL;
++		crtc->last_cursor_size = 0;
+ 
+-	VG_CLEAR(arg);
+-	arg.flags = DRM_MODE_CURSOR_BO;
+-	arg.crtc_id = crtc->id;
+-	arg.width = arg.height = 0;
+-	arg.handle = 0;
++		VG_CLEAR(arg);
++		arg.flags = DRM_MODE_CURSOR_BO;
++		arg.crtc_id = __sna_crtc_id(crtc);
++		arg.width = arg.height = 0;
++		arg.handle = 0;
+ 
+-	(void)drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_CURSOR, &arg);
+-	assert(crtc->cursor->ref > 0);
+-	crtc->cursor->ref--;
+-	crtc->cursor = NULL;
+-	crtc->last_cursor_size = 0;
++		(void)drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_CURSOR, &arg);
++	}
++	sigio_unblock(sigio);
++}
++
++static void
++sna_disable_cursors(ScrnInfoPtr scrn)
++{
++	struct sna *sna = to_sna(scrn);
++	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(scrn);
++	int blocked, n;
++	/* Drop the cursor from each real CRTC inside one sigio_block() section. */
++	DBG(("%s\n", __FUNCTION__));
++
++	blocked = sigio_block();
++	for (n = 0; n < sna->mode.num_real_crtc; n++) {
++		struct sna_crtc *sna_crtc = to_sna_crtc(config->crtc[n]);
++		assert(sna_crtc);
++		sna_crtc_disable_cursor(sna, sna_crtc);
++	}
++	sigio_unblock(blocked);
+ }
+ 
+ static void
+@@ -4852,6 +6340,7 @@ sna_set_cursor_position(ScrnInfoPtr scrn, int x, int y)
+ {
+ 	xf86CrtcConfigPtr xf86_config = XF86_CRTC_CONFIG_PTR(scrn);
+ 	struct sna *sna = to_sna(scrn);
++	struct kmsg kmsg;
+ 	int sigio, c;
+ 
+ 	__DBG(("%s(%d, %d), cursor? %d\n", __FUNCTION__,
+@@ -4859,6 +6348,7 @@ sna_set_cursor_position(ScrnInfoPtr scrn, int x, int y)
+ 	if (sna->cursor.ref == NULL)
+ 		return;
+ 
++	kmsg_open(&kmsg);
+ 	sigio = sigio_block();
+ 	sna->cursor.last_x = x;
+ 	sna->cursor.last_y = y;
+@@ -4876,27 +6366,37 @@ sna_set_cursor_position(ScrnInfoPtr scrn, int x, int y)
+ 
+ 		VG_CLEAR(arg);
+ 		arg.flags = 0;
+-		arg.crtc_id = sna_crtc->id;
++		arg.crtc_id = __sna_crtc_id(sna_crtc);
+ 		arg.handle = 0;
+ 
+ 		if (sna_crtc->bo == NULL)
+ 			goto disable;
+ 
++		cursor = __sna_get_cursor(sna, crtc);
++		if (cursor == NULL)
++			cursor = sna_crtc->cursor;
++		if (cursor == NULL) {
++			__DBG(("%s: failed to grab cursor, disabling\n", __FUNCTION__));
++			goto disable;
++		}
++
+ 		if (crtc->transform_in_use) {
+ 			int xhot = sna->cursor.ref->bits->xhot;
+ 			int yhot = sna->cursor.ref->bits->yhot;
+-			struct pict_f_vector v;
++			struct pict_f_vector v, hot;
+ 
+-			v.v[0] = (x + xhot) + 0.5;
+-			v.v[1] = (y + yhot) + 0.5;
+-			v.v[2] = 1;
++			v.v[0] = x + xhot + .5;
++			v.v[1] = y + yhot + .5;
++			v.v[2] = 1.;
+ 			pixman_f_transform_point(&crtc->f_framebuffer_to_crtc, &v);
+ 
+-			rotate_coord_back(crtc->rotation, sna->cursor.size, &xhot, &yhot);
++			hot.v[0] = xhot;
++			hot.v[1] = yhot;
++			hot.v[2] = 1.;
++			pixman_f_transform_point(&sna_crtc->fb_to_cursor, &hot);
+ 
+-			/* cursor will have 0.5 added to it already so floor is sufficent */
+-			arg.x = floor(v.v[0]) - xhot;
+-			arg.y = floor(v.v[1]) - yhot;
++			arg.x = floor(v.v[0] - hot.v[0]);
++			arg.y = floor(v.v[1] - hot.v[1]);
+ 		} else {
+ 			arg.x = x - crtc->x;
+ 			arg.y = y - crtc->y;
+@@ -4904,15 +6404,6 @@ sna_set_cursor_position(ScrnInfoPtr scrn, int x, int y)
+ 
+ 		if (arg.x < crtc->mode.HDisplay && arg.x > -sna->cursor.size &&
+ 		    arg.y < crtc->mode.VDisplay && arg.y > -sna->cursor.size) {
+-			cursor = __sna_get_cursor(sna, crtc);
+-			if (cursor == NULL)
+-				cursor = sna_crtc->cursor;
+-			if (cursor == NULL) {
+-				__DBG(("%s: failed to grab cursor, disabling\n",
+-				       __FUNCTION__));
+-				goto disable;
+-			}
+-
+ 			if (sna_crtc->cursor != cursor || sna_crtc->last_cursor_size != cursor->size) {
+ 				arg.flags |= DRM_MODE_CURSOR_BO;
+ 				arg.handle = cursor->handle;
+@@ -4932,10 +6423,13 @@ disable:
+ 		}
+ 
+ 		__DBG(("%s: CRTC:%d (%d, %d), handle=%d, flags=%x (old cursor handle=%d), move? %d, update handle? %d\n",
+-		       __FUNCTION__, sna_crtc->id, arg.x, arg.y, arg.handle, arg.flags, sna_crtc->cursor ? sna_crtc->cursor->handle : 0,
++		       __FUNCTION__, __sna_crtc_id(sna_crtc), arg.x, arg.y, arg.handle, arg.flags, sna_crtc->cursor ? sna_crtc->cursor->handle : 0,
+ 		       arg.flags & DRM_MODE_CURSOR_MOVE, arg.flags & DRM_MODE_CURSOR_BO));
+ 
+-		if (arg.flags &&
++		if (arg.flags == 0)
++			continue;
++
++		if (!FAIL_CURSOR_IOCTL &&
+ 		    drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_CURSOR, &arg) == 0) {
+ 			if (arg.flags & DRM_MODE_CURSOR_BO) {
+ 				if (sna_crtc->cursor) {
+@@ -4949,9 +6443,21 @@ disable:
+ 				} else
+ 					sna_crtc->last_cursor_size = 0;
+ 			}
++		} else {
++			ERR(("%s: failed to update cursor on CRTC:%d [pipe=%d], disabling hwcursor: errno=%d\n",
++			     __FUNCTION__, sna_crtc_id(crtc), sna_crtc_pipe(crtc), errno));
++			/* XXX How to force switch back to SW cursor?
++			 * Right now we just wait until the next cursor image
++			 * change, which is fairly frequent.
++			 */
++			sna->cursor.disable = true;
+ 		}
+ 	}
+ 	sigio_unblock(sigio);
++	kmsg_close(&kmsg, sna->cursor.disable);
++
++	if (unlikely(sna->cursor.disable))
++		restore_swcursor(sna);
+ }
+ 
+ #if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,15,99,902,2)
+@@ -4978,17 +6484,6 @@ sna_load_cursor_image(ScrnInfoPtr scrn, unsigned char *src)
+ {
+ }
+ 
+-static int __cursor_size(CursorPtr cursor)
+-{
+-	int i, size;
+-
+-	i = MAX(cursor->bits->width, cursor->bits->height);
+-	for (size = 64; size < i; size <<= 1)
+-		;
+-
+-	return size;
+-}
+-
+ static bool
+ sna_cursor_preallocate(struct sna *sna)
+ {
+@@ -5006,6 +6501,50 @@ sna_cursor_preallocate(struct sna *sna)
+ 	return true;
+ }
+ 
++static bool
++transformable_cursor(struct sna *sna, CursorPtr cursor)
++{
++	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
++	int i;
++	/* Returns true only if every real CRTC passes all of the checks below. */
++	for (i = 0; i < sna->mode.num_real_crtc; i++) {
++		xf86CrtcPtr crtc = config->crtc[i];
++		struct pixman_box16 box;
++		int size;
++		/* The CRTC itself must have its hwcursor flag set. */
++		if (!to_sna_crtc(crtc)->hwcursor) {
++			DBG(("%s: hwcursor disabled on CRTC:%d [pipe=%d]\n",
++			     __FUNCTION__, sna_crtc_id(crtc), sna_crtc_pipe(crtc)));
++			return false;
++		}
++		/* Both GTT cursor access and the scratch buffer are required. */
++		if (!sna->cursor.use_gtt || !sna->cursor.scratch) {
++			DBG(("%s: unable to use GTT curosor access [%d] or no scratch [%d]\n",
++			     __FUNCTION__, sna->cursor.use_gtt, sna->cursor.scratch != NULL)); /* %d needs an int, not the pointer */
++			return false;
++		}
++		/* Bound the cursor image under this CRTC's crtc-to-framebuffer transform. */
++		box.x1 = box.y1 = 0;
++		box.x2 = cursor->bits->width;
++		box.y2 = cursor->bits->height;
++
++		if (!pixman_f_transform_bounds(&crtc->f_crtc_to_framebuffer,
++					       &box)) {
++			DBG(("%s: unable to transform bounds\n", __FUNCTION__));
++			return false;
++		}
++		/* The rounded-up transformed size must not exceed sna->cursor.max_size. */
++		size = __cursor_size(box.x2 - box.x1, box.y2 - box.y1);
++		if (size > sna->cursor.max_size) {
++			DBG(("%s: transformed cursor size=%d too large, max=%d\n",
++			     __FUNCTION__, size, sna->cursor.max_size));
++			return false;
++		}
++	}
++
++	return true;
++}
++
+ static Bool
+ sna_use_hw_cursor(ScreenPtr screen, CursorPtr cursor)
+ {
+@@ -5014,6 +6553,9 @@ sna_use_hw_cursor(ScreenPtr screen, CursorPtr cursor)
+ 	DBG(("%s (%dx%d)?\n", __FUNCTION__,
+ 	     cursor->bits->width, cursor->bits->height));
+ 
++	if (sna->cursor.disable)
++		return FALSE;
++
+ 	/* cursors are invariant */
+ 	if (cursor == sna->cursor.ref)
+ 		return TRUE;
+@@ -5023,12 +6565,24 @@ sna_use_hw_cursor(ScreenPtr screen, CursorPtr cursor)
+ 		sna->cursor.ref = NULL;
+ 	}
+ 
+-	sna->cursor.size = __cursor_size(cursor);
+-	if (sna->cursor.size > sna->cursor.max_size)
++	sna->cursor.size =
++		__cursor_size(cursor->bits->width, cursor->bits->height);
++	if (sna->cursor.size > sna->cursor.max_size) {
++		DBG(("%s: cursor size=%d too large, max %d: using sw cursor\n",
++		     __FUNCTION__, sna->cursor.size, sna->cursor.max_size));
+ 		return FALSE;
++	}
++
++	if (sna->mode.rr_active && !transformable_cursor(sna, cursor)) {
++		DBG(("%s: RandR active [%d] and non-transformable cursor: using sw cursor\n",
++		     __FUNCTION__, sna->mode.rr_active));
++		return FALSE;
++	}
+ 
+-	if (!sna_cursor_preallocate(sna))
++	if (!sna_cursor_preallocate(sna)) {
++		DBG(("%s: cursor preallocation failed: using sw cursor\n", __FUNCTION__));
+ 		return FALSE;
++	}
+ 
+ 	sna->cursor.ref = cursor;
+ 	cursor->refcnt++;
+@@ -5056,8 +6610,12 @@ sna_cursor_pre_init(struct sna *sna)
+ 		return;
+ 
+ #define LOCAL_IOCTL_GET_CAP	DRM_IOWR(0x0c, struct local_get_cap)
+-#define DRM_CAP_CURSOR_WIDTH	8
+-#define DRM_CAP_CURSOR_HEIGHT	9
++#ifndef DRM_CAP_CURSOR_WIDTH
++#define DRM_CAP_CURSOR_WIDTH	0x8
++#endif
++#ifndef DRM_CAP_CURSOR_HEIGHT
++#define DRM_CAP_CURSOR_HEIGHT	0x9
++#endif
+ 
+ #define I915_PARAM_HAS_COHERENT_PHYS_GTT 29
+ 
+@@ -5087,11 +6645,9 @@ sna_cursor_pre_init(struct sna *sna)
+ 	DBG(("%s: cursor updates use_gtt?=%d\n",
+ 	     __FUNCTION__, sna->cursor.use_gtt));
+ 
+-	if (!sna->cursor.use_gtt) {
+-		sna->cursor.scratch = malloc(sna->cursor.max_size * sna->cursor.max_size * 4);
+-		if (!sna->cursor.scratch)
+-			sna->cursor.max_size = 0;
+-	}
++	sna->cursor.scratch = malloc(sna->cursor.max_size * sna->cursor.max_size * 4);
++	if (!sna->cursor.scratch && !sna->cursor.use_gtt)
++		sna->cursor.max_size = 0;
+ 
+ 	sna->cursor.num_stash = -sna->mode.num_real_crtc;
+ 
+@@ -5193,7 +6749,7 @@ sna_crtc_flip(struct sna *sna, struct sna_crtc *crtc, struct kgem_bo *bo, int x,
+ 	int output_count = 0;
+ 	int i;
+ 
+-	DBG(("%s CRTC:%d [pipe=%d], handle=%d\n", __FUNCTION__, crtc->id, crtc->pipe, bo->handle));
++	DBG(("%s CRTC:%d [pipe=%d], handle=%d\n", __FUNCTION__, __sna_crtc_id(crtc), __sna_crtc_pipe(crtc), bo->handle));
+ 
+ 	assert(sna->mode.num_real_output < ARRAY_SIZE(output_ids));
+ 	assert(crtc->bo);
+@@ -5207,11 +6763,11 @@ sna_crtc_flip(struct sna *sna, struct sna_crtc *crtc, struct kgem_bo *bo, int x,
+ 
+ 		DBG(("%s: attaching output '%s' %d [%d] to crtc:%d (pipe %d) (possible crtc:%x, possible clones:%x)\n",
+ 		     __FUNCTION__, output->name, i, to_connector_id(output),
+-		     crtc->id, crtc->pipe,
++		     __sna_crtc_id(crtc), __sna_crtc_pipe(crtc),
+ 		     (uint32_t)output->possible_crtcs,
+ 		     (uint32_t)output->possible_clones));
+ 
+-		assert(output->possible_crtcs & (1 << crtc->pipe) ||
++		assert(output->possible_crtcs & (1 << __sna_crtc_pipe(crtc)) ||
+ 		       is_zaphod(sna->scrn));
+ 
+ 		output_ids[output_count] = to_connector_id(output);
+@@ -5221,7 +6777,7 @@ sna_crtc_flip(struct sna *sna, struct sna_crtc *crtc, struct kgem_bo *bo, int x,
+ 	assert(output_count);
+ 
+ 	VG_CLEAR(arg);
+-	arg.crtc_id = crtc->id;
++	arg.crtc_id = __sna_crtc_id(crtc);
+ 	arg.fb_id = fb_id(bo);
+ 	assert(arg.fb_id);
+ 	arg.x = x;
+@@ -5231,20 +6787,74 @@ sna_crtc_flip(struct sna *sna, struct sna_crtc *crtc, struct kgem_bo *bo, int x,
+ 	arg.mode = crtc->kmode;
+ 	arg.mode_valid = 1;
+ 
+-	DBG(("%s: applying crtc [%d, pipe=%d] mode=%dx%d+%d+%d@%d, fb=%d across %d outputs [%d...]\n",
+-	     __FUNCTION__, crtc->id, crtc->pipe,
+-	     arg.mode.hdisplay,
+-	     arg.mode.vdisplay,
+-	     arg.x, arg.y,
+-	     arg.mode.clock,
+-	     arg.fb_id,
+-	     output_count, output_count ? output_ids[0] : 0));
++	DBG(("%s: applying crtc [%d, pipe=%d] mode=%dx%d+%d+%d@%d, fb=%d across %d outputs [%d...]\n",
++	     __FUNCTION__, __sna_crtc_id(crtc), __sna_crtc_pipe(crtc),
++	     arg.mode.hdisplay,
++	     arg.mode.vdisplay,
++	     arg.x, arg.y,
++	     arg.mode.clock,
++	     arg.fb_id,
++	     output_count, output_count ? output_ids[0] : 0));
++
++	if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_SETCRTC, &arg))
++		return false;
++
++	crtc->offset = y << 16 | x;
++	__kgem_bo_clear_dirty(bo);
++	return true;
++}
++
++static void sna_mode_restore(struct sna *sna)
++{
++	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
++	int error = 0;
++	int i;
++	/* Call __sna_crtc_set_mode() on each real CRTC that has a bo attached. */
++	assert(!sna->mode.hidden);
++
++	for (i = 0; i < sna->mode.num_real_crtc; i++) {
++		xf86CrtcPtr crtc = config->crtc[i];
++
++		assert(to_sna_crtc(crtc) != NULL);
++		if (to_sna_crtc(crtc)->bo == NULL)
++			continue;
++		/* A CRTC whose mode cannot be set is disabled and counted as an error. */
++		assert(crtc->enabled);
++		if (!__sna_crtc_set_mode(crtc)) {
++			sna_crtc_disable(crtc, false);
++			error++;
++		}
++	}
++	sna_mode_wakeup(sna);
++	while (sna->mode.flip_active && sna_mode_wakeup(sna))
++		; /* repeat while flips are active and sna_mode_wakeup() returns non-zero */
++	update_flush_interval(sna);
++	sna_cursors_reload(sna);
++	sna->mode.dirty = false;
++	/* Log a single error message if any CRTC failed above. */
++	if (error)
++		xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
++			   "Failed to restore display configuration\n");
++}
++
++bool sna_needs_page_flip(struct sna *sna, struct kgem_bo *bo)
++{
++	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
++	int i;
++
++	for (i = 0; i < sna->mode.num_real_crtc; i++) {
++		struct sna_crtc *crtc = config->crtc[i]->driver_private;
++
++		if (crtc->bo == NULL)
++			continue;
+ 
+-	if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_SETCRTC, &arg))
+-		return false;
++		if (crtc->bo == bo)
++			continue;
+ 
+-	crtc->offset = y << 16 | x;
+-	return true;
++		return true;
++	}
++
++	return false;
+ }
+ 
+ int
+@@ -5256,6 +6866,7 @@ sna_page_flip(struct sna *sna,
+ 	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
+ 	const int width = sna->scrn->virtualX;
+ 	const int height = sna->scrn->virtualY;
++	int sigio;
+ 	int count = 0;
+ 	int i;
+ 
+@@ -5263,23 +6874,26 @@ sna_page_flip(struct sna *sna,
+ 	assert(bo->refcnt);
+ 
+ 	assert((sna->flags & SNA_IS_HOSTED) == 0);
+-	assert((sna->flags & SNA_TEAR_FREE) == 0);
+ 	assert(sna->mode.flip_active == 0);
+ 	assert(sna->mode.front_active);
++	assert(!sna->mode.hidden);
+ 	assert(sna->scrn->vtSema);
+ 
+ 	if ((sna->flags & (data ? SNA_HAS_FLIP : SNA_HAS_ASYNC_FLIP)) == 0)
+ 		return 0;
+ 
+ 	kgem_bo_submit(&sna->kgem, bo);
++	__kgem_bo_clear_dirty(bo);
+ 
++	sigio = sigio_block();
+ 	for (i = 0; i < sna->mode.num_real_crtc; i++) {
+ 		struct sna_crtc *crtc = config->crtc[i]->driver_private;
+ 		struct drm_mode_crtc_page_flip arg;
+ 		uint32_t crtc_offset;
++		int fixup;
+ 
+ 		DBG(("%s: crtc %d id=%d, pipe=%d active? %d\n",
+-		     __FUNCTION__, i, crtc->id, crtc->pipe, crtc->bo != NULL));
++		     __FUNCTION__, i, __sna_crtc_id(crtc), __sna_crtc_pipe(crtc), crtc->bo != NULL));
+ 		if (crtc->bo == NULL)
+ 			continue;
+ 		assert(!crtc->transform);
+@@ -5288,13 +6902,18 @@ sna_page_flip(struct sna *sna,
+ 		assert(crtc->bo->refcnt >= crtc->bo->active_scanout);
+ 		assert(crtc->flip_bo == NULL);
+ 
+-		arg.crtc_id = crtc->id;
++		assert_crtc_fb(sna, crtc);
++		if (data == NULL && crtc->bo == bo)
++			goto next_crtc;
++
++		arg.crtc_id = __sna_crtc_id(crtc);
+ 		arg.fb_id = get_fb(sna, bo, width, height);
+ 		if (arg.fb_id == 0) {
+ 			assert(count == 0);
+-			return 0;
++			break;
+ 		}
+ 
++		fixup = 0;
+ 		crtc_offset = crtc->base->y << 16 | crtc->base->x;
+ 
+ 		if (bo->pitch != crtc->bo->pitch || crtc_offset != crtc->offset) {
+@@ -5303,7 +6922,12 @@ sna_page_flip(struct sna *sna,
+ 			     bo->pitch, crtc->bo->pitch,
+ 			     crtc_offset, crtc->offset));
+ fixup_flip:
++			fixup = 1;
+ 			if (crtc->bo != bo && sna_crtc_flip(sna, crtc, bo, crtc->base->x, crtc->base->y)) {
++update_scanout:
++				DBG(("%s: removing handle=%d [active_scanout=%d] from scanout, installing handle=%d [active_scanout=%d]\n",
++				     __FUNCTION__, crtc->bo->handle, crtc->bo->active_scanout,
++				     bo->handle, bo->active_scanout));
+ 				assert(crtc->bo->active_scanout);
+ 				assert(crtc->bo->refcnt >= crtc->bo->active_scanout);
+ 				crtc->bo->active_scanout--;
+@@ -5321,15 +6945,8 @@ fixup_flip:
+ 					goto next_crtc;
+ 
+ 				/* queue a flip in order to send the event */
+-			} else {
+-				if (count && !xf86SetDesiredModes(sna->scrn)) {
+-					xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
+-						   "failed to restore display configuration\n");
+-					for (; i < sna->mode.num_real_crtc; i++)
+-						sna_crtc_disable(config->crtc[i]);
+-				}
+-				return 0;
+-			}
++			} else
++				goto error;
+ 		}
+ 
+ 		/* Only the reference crtc will finally deliver its page flip
+@@ -5346,7 +6963,7 @@ fixup_flip:
+ 
+ retry_flip:
+ 		DBG(("%s: crtc %d id=%d, pipe=%d  --> fb %d\n",
+-		     __FUNCTION__, i, crtc->id, crtc->pipe, arg.fb_id));
++		     __FUNCTION__, i, __sna_crtc_id(crtc), __sna_crtc_pipe(crtc), arg.fb_id));
+ 		if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_PAGE_FLIP, &arg)) {
+ 			ERR(("%s: pageflip failed with err=%d\n", __FUNCTION__, errno));
+ 
+@@ -5354,7 +6971,7 @@ retry_flip:
+ 				struct drm_mode_crtc mode;
+ 
+ 				memset(&mode, 0, sizeof(mode));
+-				mode.crtc_id = crtc->id;
++				mode.crtc_id = __sna_crtc_id(crtc);
+ 				drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETCRTC, &mode);
+ 
+ 				DBG(("%s: crtc=%d, valid?=%d, fb attached?=%d, expected=%d\n",
+@@ -5366,7 +6983,7 @@ retry_flip:
+ 					goto fixup_flip;
+ 
+ 				if (count == 0)
+-					return 0;
++					break;
+ 
+ 				DBG(("%s: throttling on busy flip / waiting for kernel to catch up\n", __FUNCTION__));
+ 				drmIoctl(sna->kgem.fd, DRM_IOCTL_I915_GEM_THROTTLE, 0);
+@@ -5375,15 +6992,25 @@ retry_flip:
+ 				goto retry_flip;
+ 			}
+ 
++			if (!fixup)
++				goto fixup_flip;
++
++error:
+ 			xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
+-				   "page flipping failed, on CRTC:%d (pipe=%d), disabling %s page flips\n",
+-				   crtc->id, crtc->pipe, data ? "synchronous": "asynchronous");
++					"page flipping failed, on CRTC:%d (pipe=%d), disabling %s page flips\n",
++					__sna_crtc_id(crtc), __sna_crtc_pipe(crtc), data ? "synchronous": "asynchronous");
++
++			if (count || crtc->bo == bo)
++				sna_mode_restore(sna);
++
+ 			sna->flags &= ~(data ? SNA_HAS_FLIP : SNA_HAS_ASYNC_FLIP);
+-			goto fixup_flip;
++			count = 0;
++			break;
+ 		}
+ 
+ 		if (data) {
+ 			assert(crtc->flip_bo == NULL);
++			assert(handler);
+ 			crtc->flip_handler = handler;
+ 			crtc->flip_data = data;
+ 			crtc->flip_bo = kgem_bo_reference(bo);
+@@ -5391,11 +7018,15 @@ retry_flip:
+ 			crtc->flip_serial = crtc->mode_serial;
+ 			crtc->flip_pending = true;
+ 			sna->mode.flip_active++;
+-		}
+ 
++			DBG(("%s: recording flip on CRTC:%d handle=%d, active_scanout=%d, serial=%d\n",
++			     __FUNCTION__, __sna_crtc_id(crtc), crtc->flip_bo->handle, crtc->flip_bo->active_scanout, crtc->flip_serial));
++		} else
++			goto update_scanout;
+ next_crtc:
+ 		count++;
+ 	}
++	sigio_unblock(sigio);
+ 
+ 	DBG(("%s: page flipped %d crtcs\n", __FUNCTION__, count));
+ 	return count;
+@@ -5471,7 +7102,7 @@ static void crtc_init_gamma(xf86CrtcPtr crtc)
+ 
+ 		assert(sna_crtc);
+ 
+-		lut.crtc_id = sna_crtc->id;
++		lut.crtc_id = __sna_crtc_id(sna_crtc);
+ 		lut.gamma_size = 256;
+ 		lut.red = (uintptr_t)(gamma);
+ 		lut.green = (uintptr_t)(gamma + 256);
+@@ -5485,7 +7116,7 @@ static void crtc_init_gamma(xf86CrtcPtr crtc)
+ 		}
+ 
+ 		DBG(("%s: CRTC:%d, pipe=%d: gamma set?=%d\n",
+-		     __FUNCTION__, sna_crtc->id, sna_crtc->pipe,
++		     __FUNCTION__, __sna_crtc_id(sna_crtc), __sna_crtc_pipe(sna_crtc),
+ 		     gamma_set));
+ 		if (!gamma_set) {
+ 			int i;
+@@ -5502,6 +7133,7 @@ static void crtc_init_gamma(xf86CrtcPtr crtc)
+ 			crtc->gamma_red = gamma;
+ 			crtc->gamma_green = gamma + 256;
+ 			crtc->gamma_blue = gamma + 2*256;
++			crtc->gamma_size = 256;
+ 		}
+ 	}
+ }
+@@ -5528,6 +7160,7 @@ static bool sna_probe_initial_configuration(struct sna *sna)
+ {
+ 	ScrnInfoPtr scrn = sna->scrn;
+ 	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(scrn);
++	int crtc_active, crtc_enabled;
+ 	int width, height;
+ 	int i, j;
+ 
+@@ -5565,6 +7198,7 @@ static bool sna_probe_initial_configuration(struct sna *sna)
+ 	}
+ 
+ 	/* Copy the existing modes on each CRTCs */
++	crtc_active = crtc_enabled = 0;
+ 	for (i = 0; i < sna->mode.num_real_crtc; i++) {
+ 		xf86CrtcPtr crtc = config->crtc[i];
+ 		struct sna_crtc *sna_crtc = to_sna_crtc(crtc);
+@@ -5577,12 +7211,12 @@ static bool sna_probe_initial_configuration(struct sna *sna)
+ 
+ 		/* Retrieve the current mode */
+ 		VG_CLEAR(mode);
+-		mode.crtc_id = sna_crtc->id;
++		mode.crtc_id = __sna_crtc_id(sna_crtc);
+ 		if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETCRTC, &mode))
+ 			continue;
+ 
+ 		DBG(("%s: CRTC:%d, pipe=%d: has mode?=%d\n", __FUNCTION__,
+-		     sna_crtc->id, sna_crtc->pipe,
++		     __sna_crtc_id(sna_crtc), __sna_crtc_pipe(sna_crtc),
+ 		     mode.mode_valid && mode.mode.clock));
+ 
+ 		if (!mode.mode_valid || mode.mode.clock == 0)
+@@ -5593,6 +7227,7 @@ static bool sna_probe_initial_configuration(struct sna *sna)
+ 		crtc->desiredX = mode.x;
+ 		crtc->desiredY = mode.y;
+ 		crtc->desiredTransformPresent = FALSE;
++		crtc_active++;
+ 	}
+ 
+ 	/* Reconstruct outputs pointing to active CRTC */
+@@ -5604,6 +7239,7 @@ static bool sna_probe_initial_configuration(struct sna *sna)
+ 
+ 		crtc_id = (uintptr_t)output->crtc;
+ 		output->crtc = NULL;
++		output->status = XF86OutputStatusUnknown;
+ 		if (sna->flags & SNA_IS_SLAVED)
+ 			continue;
+ 
+@@ -5623,7 +7259,7 @@ static bool sna_probe_initial_configuration(struct sna *sna)
+ 			xf86CrtcPtr crtc = config->crtc[j];
+ 
+ 			assert(to_sna_crtc(crtc));
+-			if (to_sna_crtc(crtc)->id != crtc_id)
++			if (sna_crtc_id(crtc) != crtc_id)
+ 				continue;
+ 
+ 			if (crtc->desiredMode.status == MODE_OK) {
+@@ -5641,18 +7277,30 @@ static bool sna_probe_initial_configuration(struct sna *sna)
+ 					   "Output %s using initial mode %s on pipe %d\n",
+ 					   output->name,
+ 					   crtc->desiredMode.name,
+-					   to_sna_crtc(crtc)->pipe);
++					   sna_crtc_pipe(crtc));
+ 
+ 				output->crtc = crtc;
++				output->status = XF86OutputStatusConnected;
+ 				crtc->enabled = TRUE;
++				crtc_enabled++;
++
++				output_set_gamma(output, crtc);
++
++				if (output->conf_monitor) {
++					output->mm_width = output->conf_monitor->mon_width;
++					output->mm_height = output->conf_monitor->mon_height;
++				}
++
++#if 0
++				sna_output_attach_edid(output);
++				sna_output_attach_tile(output);
++#endif
+ 
+ 				if (output->mm_width == 0 || output->mm_height == 0) {
+ 					output->mm_height = (crtc->desiredMode.VDisplay * 254) / (10*DEFAULT_DPI);
+ 					output->mm_width = (crtc->desiredMode.HDisplay * 254) / (10*DEFAULT_DPI);
+ 				}
+ 
+-				output_set_gamma(output, crtc);
+-
+ 				M = calloc(1, sizeof(DisplayModeRec));
+ 				if (M) {
+ 					*M = crtc->desiredMode;
+@@ -5673,6 +7321,12 @@ static bool sna_probe_initial_configuration(struct sna *sna)
+ 		}
+ 	}
+ 
++	if (crtc_active != crtc_enabled) {
++		DBG(("%s: only enabled %d out of %d active CRTC, forcing a reconfigure\n",
++		     __FUNCTION__, crtc_enabled, crtc_active));
++		return false;
++	}
++
+ 	width = height = 0;
+ 	for (i = 0; i < sna->mode.num_real_crtc; i++) {
+ 		xf86CrtcPtr crtc = config->crtc[i];
+@@ -5707,8 +7361,8 @@ static bool sna_probe_initial_configuration(struct sna *sna)
+ 			if (sna_output->num_modes == 0)
+ 				continue;
+ 
+-			width = sna_output->modes[0].hdisplay;
+-			height= sna_output->modes[0].vdisplay;
++			width  = sna_output->modes[0].hdisplay;
++			height = sna_output->modes[0].vdisplay;
+ 
+ 			DBG(("%s: panel '%s' is %dx%d\n",
+ 			     __FUNCTION__, output->name, width, height));
+@@ -5788,7 +7442,7 @@ probe_capabilities(struct sna *sna)
+ 	sna->flags &= ~(SNA_HAS_FLIP | SNA_HAS_ASYNC_FLIP);
+ 	if (has_flip(sna))
+ 		sna->flags |= SNA_HAS_FLIP;
+-	if (has_flip__async(sna))
++	if (has_flip__async(sna) && (sna->flags & SNA_TEAR_FREE) == 0)
+ 		sna->flags |= SNA_HAS_ASYNC_FLIP;
+ 	DBG(("%s: page flips? %s, async? %s\n", __FUNCTION__,
+ 	     sna->flags & SNA_HAS_FLIP ? "enabled" : "disabled",
+@@ -5813,12 +7467,25 @@ sna_crtc_config_notify(ScreenPtr screen)
+ 		return;
+ 	}
+ 
++	/* Flush any events completed by the modeset */
++	sna_mode_wakeup(sna);
++
+ 	update_flush_interval(sna);
++	sna->cursor.disable = false; /* Reset HW cursor until the next fail */
+ 	sna_cursors_reload(sna);
+ 
+ 	probe_capabilities(sna);
+ 	sna_present_update(sna);
+ 
++	/* Allow TearFree to come back on when everything is off */
++	if (!sna->mode.front_active && sna->flags & SNA_WANT_TEAR_FREE) {
++		if ((sna->flags & SNA_TEAR_FREE) == 0)
++			DBG(("%s: enable TearFree next modeset\n",
++			     __FUNCTION__));
++
++		sna->flags |= SNA_TEAR_FREE;
++	}
++
+ 	sna->mode.dirty = false;
+ }
+ 
+@@ -5840,6 +7507,7 @@ bool sna_mode_pre_init(ScrnInfoPtr scrn, struct sna *sna)
+ 	}
+ 
+ 	probe_capabilities(sna);
++	sna->mode.hidden = 1;
+ 
+ 	if (!xf86GetOptValInteger(sna->Options, OPTION_VIRTUAL, &num_fake))
+ 		num_fake = 1;
+@@ -5855,6 +7523,9 @@ bool sna_mode_pre_init(ScrnInfoPtr scrn, struct sna *sna)
+ 	if (res) {
+ 		xf86CrtcConfigPtr xf86_config;
+ 
++		DBG(("%s: found %d CRTC, %d encoders, %d connectors\n",
++		     __FUNCTION__, res->count_crtcs, res->count_encoders, res->count_connectors));
++
+ 		assert(res->count_crtcs);
+ 		assert(res->count_connectors);
+ 
+@@ -5862,6 +7533,7 @@ bool sna_mode_pre_init(ScrnInfoPtr scrn, struct sna *sna)
+ 
+ 		xf86_config = XF86_CRTC_CONFIG_PTR(scrn);
+ 		xf86_config->xf86_crtc_notify = sna_crtc_config_notify;
++		xf86_config->compat_output = 0;
+ 
+ 		for (i = 0; i < res->count_crtcs; i++)
+ 			if (!sna_crtc_add(scrn, res->crtcs[i]))
+@@ -5900,6 +7572,11 @@ bool sna_mode_pre_init(ScrnInfoPtr scrn, struct sna *sna)
+ 	if (!sna_mode_fake_init(sna, num_fake))
+ 		return false;
+ 
++	sna->mode.shadow_size = 256;
++	sna->mode.shadow_events = malloc(sna->mode.shadow_size * sizeof(struct drm_event_vblank));
++	if (!sna->mode.shadow_events)
++		return false;
++
+ 	if (!sna_probe_initial_configuration(sna)) {
+ 		xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(scrn);
+ 
+@@ -5912,6 +7589,7 @@ bool sna_mode_pre_init(ScrnInfoPtr scrn, struct sna *sna)
+ 		}
+ 	}
+ 	sort_config_outputs(sna);
++	TimerSet(NULL, 0, COLDPLUG_DELAY_MS, sna_mode_coldplug, sna);
+ 
+ 	sna_setup_provider(scrn);
+ 	return scrn->modes != NULL;
+@@ -5921,18 +7599,58 @@ bool
+ sna_mode_wants_tear_free(struct sna *sna)
+ {
+ 	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
++	bool found = false;
++	FILE *file;
+ 	int i;
+ 
++	file = fopen("/sys/module/i915/parameters/enable_fbc", "r");
++	if (file) {
++		int fbc_enabled = 0;
++		int value;
++
++		if (fscanf(file, "%d", &value) == 1)
++			fbc_enabled = value > 0;
++		fclose(file);
++
++		DBG(("%s: module parameter 'enable_fbc' enabled? %d\n",
++		     __FUNCTION__, fbc_enabled));
++
++		if (fbc_enabled)
++			return true;
++	}
++
+ 	for (i = 0; i < sna->mode.num_real_output; i++) {
+ 		struct sna_output *output = to_sna_output(config->output[i]);
+ 		int id = find_property(sna, output, "Panel Self-Refresh");
+-		if (id !=-1 && output->prop_values[id] != -1) {
++		if (id == -1)
++			continue;
++
++		found = true;
++		if (output->prop_values[id] != -1) {
+ 			DBG(("%s: Panel Self-Refresh detected on %s\n",
+ 			     __FUNCTION__, config->output[i]->name));
+ 			return true;
+ 		}
+ 	}
+ 
++	if (!found) {
++		file = fopen("/sys/module/i915/parameters/enable_psr", "r");
++		if (file) {
++			int psr_enabled = 0;
++			int value;
++
++			if (fscanf(file, "%d", &value) == 1)
++				psr_enabled = value > 0;
++			fclose(file);
++
++			DBG(("%s: module parameter 'enable_psr' enabled? %d\n",
++			     __FUNCTION__, psr_enabled));
++
++			if (psr_enabled)
++				return true;
++		}
++	}
++
+ 	return false;
+ }
+ 
+@@ -5955,7 +7673,7 @@ sna_mode_set_primary(struct sna *sna)
+ 
+ 		DBG(("%s: setting PrimaryOutput %s\n", __FUNCTION__, output->name));
+ 		rr->primaryOutput = output->randr_output;
+-		RROutputChanged(rr->primaryOutput, 0);
++		RROutputChanged(rr->primaryOutput, FALSE);
+ 		rr->layoutChanged = TRUE;
+ 		break;
+ 	}
+@@ -5974,12 +7692,9 @@ sna_mode_disable(struct sna *sna)
+ 	if (!sna->scrn->vtSema)
+ 		return false;
+ 
+-	/* XXX we will cause previously hidden cursors to be reshown, but
+-	 * this should be a rare fixup case for severe fragmentation.
+-	 */
+-	sna_hide_cursors(sna->scrn);
++	sna_disable_cursors(sna->scrn);
+ 	for (i = 0; i < sna->mode.num_real_crtc; i++)
+-		sna_crtc_disable(config->crtc[i]);
++		sna_crtc_disable(config->crtc[i], false);
+ 	assert(sna->mode.front_active == 0);
+ 
+ 	sna_mode_wakeup(sna);
+@@ -6001,6 +7716,11 @@ sna_mode_enable(struct sna *sna)
+ 	if (!sna->scrn->vtSema)
+ 		return;
+ 
++	if (sna->mode.hidden) {
++		DBG(("%s: hidden outputs\n", __FUNCTION__));
++		return;
++	}
++
+ 	for (i = 0; i < sna->mode.num_real_crtc; i++) {
+ 		xf86CrtcPtr crtc = config->crtc[i];
+ 
+@@ -6016,13 +7736,30 @@ sna_mode_enable(struct sna *sna)
+ 	}
+ 
+ 	update_flush_interval(sna);
+-	sna_show_cursors(sna->scrn);
++	sna_cursors_reload(sna);
+ 	sna->mode.dirty = false;
+ }
+ 
++static void sna_randr_close(struct sna *sna)
++{
++	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
++	int n;
++
++	/* The RR structs are freed early during CloseScreen as they
++	 * are tracked as Resources. However, we may be tempted to
++	 * access them during shutdown so decouple them now.
++	 */
++	  for (n = 0; n < config->num_output; n++)
++		  config->output[n]->randr_output = NULL;
++
++	  for (n = 0; n < config->num_crtc; n++)
++		  config->crtc[n]->randr_crtc = NULL;
++}
++
+ void
+ sna_mode_close(struct sna *sna)
+ {
++	sna_randr_close(sna);
+ 	sna_mode_wakeup(sna);
+ 
+ 	if (sna->flags & SNA_IS_HOSTED)
+@@ -6077,15 +7814,22 @@ xf86CrtcPtr
+ sna_covering_crtc(struct sna *sna, const BoxRec *box, xf86CrtcPtr desired)
+ {
+ 	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
+-	xf86CrtcPtr best_crtc;
+-	int best_coverage, c;
++	xf86CrtcPtr best_crtc = NULL;
++	int best_coverage = -1, c;
+ 
+ 	if (sna->flags & SNA_IS_HOSTED)
+ 		return NULL;
+ 
+ 	/* If we do not own the VT, we do not own the CRTC either */
+-	if (!sna->scrn->vtSema)
++	if (!sna->scrn->vtSema) {
++		DBG(("%s: none, VT switched\n", __FUNCTION__));
++		return NULL;
++	}
++
++	if (sna->mode.hidden) {
++		DBG(("%s: none, hidden outputs\n", __FUNCTION__));
+ 		return NULL;
++	}
+ 
+ 	DBG(("%s for box=(%d, %d), (%d, %d)\n",
+ 	     __FUNCTION__, box->x1, box->y1, box->x2, box->y2));
+@@ -6107,10 +7851,10 @@ sna_covering_crtc(struct sna *sna, const BoxRec *box, xf86CrtcPtr desired)
+ 			     cover_box.x2, cover_box.y2));
+ 			return desired;
+ 		}
++		best_crtc = desired;
++		best_coverage = 0;
+ 	}
+ 
+-	best_crtc = NULL;
+-	best_coverage = 0;
+ 	for (c = 0; c < sna->mode.num_real_crtc; c++) {
+ 		xf86CrtcPtr crtc = config->crtc[c];
+ 		BoxRec cover_box;
+@@ -6156,6 +7900,38 @@ sna_covering_crtc(struct sna *sna, const BoxRec *box, xf86CrtcPtr desired)
+ 	return best_crtc;
+ }
+ 
++static xf86CrtcPtr first_active_crtc(struct sna *sna)
++{
++	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
++	int n;
++
++	for (n = 0; n < sna->mode.num_real_crtc; n++) {
++		xf86CrtcPtr crtc = config->crtc[n];
++		if (to_sna_crtc(crtc)->bo)
++			return crtc;
++	}
++
++	/* No active, use the first as a placeholder */
++	if (sna->mode.num_real_crtc)
++		return config->crtc[0];
++
++	return NULL;
++}
++
++xf86CrtcPtr sna_primary_crtc(struct sna *sna)
++{
++	rrScrPrivPtr rr = rrGetScrPriv(xf86ScrnToScreen(sna->scrn));
++	if (rr && rr->primaryOutput) {
++		xf86OutputPtr output = rr->primaryOutput->devPrivate;
++		if (output->crtc &&
++		    output->scrn == sna->scrn &&
++		    to_sna_crtc(output->crtc))
++			return output->crtc;
++	}
++
++	return first_active_crtc(sna);
++}
++
+ #define MI_LOAD_REGISTER_IMM			(0x22<<23)
+ 
+ static bool sna_emit_wait_for_scanline_hsw(struct sna *sna,
+@@ -6433,7 +8209,7 @@ sna_wait_for_scanline(struct sna *sna,
+ 		y2 /= 2;
+ 	}
+ 
+-	pipe = sna_crtc_to_pipe(crtc);
++	pipe = sna_crtc_pipe(crtc);
+ 	DBG(("%s: pipe=%d, y1=%d, y2=%d, full_height?=%d\n",
+ 	     __FUNCTION__, pipe, y1, y2, full_height));
+ 
+@@ -6457,19 +8233,101 @@ sna_wait_for_scanline(struct sna *sna,
+ 	return ret;
+ }
+ 
++static bool sna_mode_shutdown_crtc(xf86CrtcPtr crtc)
++{
++	struct sna *sna = to_sna(crtc->scrn);
++	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(crtc->scrn);
++	bool disabled = false;
++	int o;
++
++	xf86DrvMsg(crtc->scrn->scrnIndex, X_ERROR,
++		   "%s: invalid state found on pipe %d, disabling CRTC:%d\n",
++		   __FUNCTION__,
++		   __sna_crtc_pipe(to_sna_crtc(crtc)),
++		   __sna_crtc_id(to_sna_crtc(crtc)));
++	sna_crtc_disable(crtc, true);
++#if XF86_CRTC_VERSION >= 3
++	crtc->active = FALSE;
++#endif
++	if (crtc->enabled) {
++		crtc->enabled = FALSE;
++		disabled = true;
++	}
++
++	for (o = 0; o < sna->mode.num_real_output; o++) {
++		xf86OutputPtr output = config->output[o];
++
++		if (output->crtc != crtc)
++			continue;
++
++		output->funcs->dpms(output, DPMSModeOff);
++		output->crtc = NULL;
++	}
++
++	return disabled;
++}
++
++static bool
++sna_mode_disable_secondary_planes(struct sna *sna)
++{
++	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
++	bool disabled = false;
++	int c;
++
++	/* Disable all secondary planes on our CRTCs, just in case
++	 * other userspace left garbage in them.
++	 */
++	for (c = 0; c < sna->mode.num_real_crtc; c++) {
++		xf86CrtcPtr crtc = config->crtc[c];
++		struct sna_crtc *sna_crtc = to_sna_crtc(crtc);
++		struct plane *plane;
++
++		list_for_each_entry(plane, &sna_crtc->sprites, link) {
++			struct local_mode_get_plane p;
++			struct local_mode_set_plane s;
++
++			VG_CLEAR(p);
++			p.plane_id = plane->id;
++			p.count_format_types = 0;
++			if (drmIoctl(sna->kgem.fd,
++				     LOCAL_IOCTL_MODE_GETPLANE,
++				     &p))
++				continue;
++
++			if (p.fb_id == 0 || p.crtc_id == 0)
++				continue;
++
++			memset(&s, 0, sizeof(s));
++			s.plane_id = p.plane_id;
++			s.crtc_id = p.crtc_id;
++			if (drmIoctl(sna->kgem.fd,
++				     LOCAL_IOCTL_MODE_SETPLANE,
++				     &s))
++				disabled |= sna_mode_shutdown_crtc(crtc);
++		}
++	}
++
++	return disabled;
++}
++
+ void sna_mode_check(struct sna *sna)
+ {
+ 	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
+-	int i;
++	bool disabled;
++	int c, o;
+ 
+ 	if (sna->flags & SNA_IS_HOSTED)
+ 		return;
+ 
+-	DBG(("%s\n", __FUNCTION__));
++	DBG(("%s: hidden?=%d\n", __FUNCTION__, sna->mode.hidden));
++	if (sna->mode.hidden)
++		return;
++
++	disabled = sna_mode_disable_secondary_planes(sna);
+ 
+ 	/* Validate CRTC attachments and force consistency upon the kernel */
+-	for (i = 0; i < sna->mode.num_real_crtc; i++) {
+-		xf86CrtcPtr crtc = config->crtc[i];
++	for (c = 0; c < sna->mode.num_real_crtc; c++) {
++		xf86CrtcPtr crtc = config->crtc[c];
+ 		struct sna_crtc *sna_crtc = to_sna_crtc(crtc);
+ 		struct drm_mode_crtc mode;
+ 		uint32_t expected[2];
+@@ -6483,7 +8341,7 @@ void sna_mode_check(struct sna *sna)
+ 		expected[1] = sna_crtc->flip_bo ? fb_id(sna_crtc->flip_bo) : -1;
+ 
+ 		VG_CLEAR(mode);
+-		mode.crtc_id = sna_crtc->id;
++		mode.crtc_id = __sna_crtc_id(sna_crtc);
+ 		if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETCRTC, &mode))
+ 			continue;
+ 
+@@ -6492,16 +8350,12 @@ void sna_mode_check(struct sna *sna)
+ 		     mode.crtc_id, mode.mode_valid,
+ 		     mode.fb_id, expected[0], expected[1]));
+ 
+-		if (mode.fb_id != expected[0] && mode.fb_id != expected[1]) {
+-			xf86DrvMsg(crtc->scrn->scrnIndex, X_ERROR,
+-				   "%s: invalid state found on pipe %d, disabling CRTC:%d\n",
+-				   __FUNCTION__, sna_crtc->pipe, sna_crtc->id);
+-			sna_crtc_disable(crtc);
+-		}
++		if (mode.fb_id != expected[0] && mode.fb_id != expected[1])
++			disabled |= sna_mode_shutdown_crtc(crtc);
+ 	}
+ 
+-	for (i = 0; i < config->num_output; i++) {
+-		xf86OutputPtr output = config->output[i];
++	for (o = 0; o < config->num_output; o++) {
++		xf86OutputPtr output = config->output[o];
+ 		struct sna_output *sna_output;
+ 
+ 		if (output->crtc)
+@@ -6515,26 +8369,16 @@ void sna_mode_check(struct sna *sna)
+ 	}
+ 
+ 	update_flush_interval(sna);
++
++	if (disabled)
++		xf86RandR12TellChanged(xf86ScrnToScreen(sna->scrn));
+ }
+ 
+ static bool
+ sna_crtc_hide_planes(struct sna *sna, struct sna_crtc *crtc)
+ {
+-#define LOCAL_IOCTL_MODE_SETPLANE DRM_IOWR(0xB7, struct local_mode_set_plane)
+-	struct local_mode_set_plane {
+-		uint32_t plane_id;
+-		uint32_t crtc_id;
+-		uint32_t fb_id; /* fb object contains surface format type */
+-		uint32_t flags;
+-
+-		/* Signed dest location allows it to be partially off screen */
+-		int32_t crtc_x, crtc_y;
+-		uint32_t crtc_w, crtc_h;
+-
+-		/* Source values are 16.16 fixed point */
+-		uint32_t src_x, src_y;
+-		uint32_t src_h, src_w;
+-	} s;
++	struct local_mode_set_plane s;
++	struct plane *plane;
+ 
+ 	if (crtc->primary.id == 0)
+ 		return false;
+@@ -6544,8 +8388,10 @@ sna_crtc_hide_planes(struct sna *sna, struct sna_crtc *crtc)
+ 	if (drmIoctl(sna->kgem.fd, LOCAL_IOCTL_MODE_SETPLANE, &s))
+ 		return false;
+ 
+-	s.plane_id = crtc->sprite.id;
+-	(void)drmIoctl(sna->kgem.fd, LOCAL_IOCTL_MODE_SETPLANE, &s);
++	list_for_each_entry(plane, &crtc->sprites, link) {
++		s.plane_id = plane->id;
++		(void)drmIoctl(sna->kgem.fd, LOCAL_IOCTL_MODE_SETPLANE, &s);
++	}
+ 
+ 	__sna_crtc_disable(sna, crtc);
+ 	return true;
+@@ -6561,21 +8407,22 @@ void sna_mode_reset(struct sna *sna)
+ 
+ 	DBG(("%s\n", __FUNCTION__));
+ 
+-	sna_hide_cursors(sna->scrn);
++	sna_disable_cursors(sna->scrn);
+ 	for (i = 0; i < sna->mode.num_real_crtc; i++)
+ 		if (!sna_crtc_hide_planes(sna, to_sna_crtc(config->crtc[i])))
+-			sna_crtc_disable(config->crtc[i]);
++			sna_crtc_disable(config->crtc[i], true);
+ 	assert(sna->mode.front_active == 0);
+ 
+ 	for (i = 0; i < sna->mode.num_real_crtc; i++) {
+ 		struct sna_crtc *sna_crtc = to_sna_crtc(config->crtc[i]);
++		struct plane *plane;
+ 
+ 		assert(sna_crtc != NULL);
+-		sna_crtc->dpms_mode = -1;
+ 
+ 		/* Force the rotation property to be reset on next use */
+ 		rotation_reset(&sna_crtc->primary);
+-		rotation_reset(&sna_crtc->sprite);
++		list_for_each_entry(plane, &sna_crtc->sprites, link)
++			rotation_reset(plane);
+ 	}
+ 
+ 	/* VT switching, likely to be fbcon so make the backlight usable */
+@@ -6641,9 +8488,10 @@ sna_crtc_redisplay__fallback(xf86CrtcPtr crtc, RegionPtr region, struct kgem_bo
+ {
+ 	int16_t sx, sy;
+ 	struct sna *sna = to_sna(crtc->scrn);
+-	ScreenPtr screen = sna->scrn->pScreen;
++	ScreenPtr screen = xf86ScrnToScreen(crtc->scrn);
+ 	DrawablePtr draw = crtc_source(crtc, &sx, &sy);
+ 	PictFormatPtr format;
++	PictTransform T;
+ 	PicturePtr src, dst;
+ 	PixmapPtr pixmap;
+ 	int depth, error;
+@@ -6664,6 +8512,14 @@ sna_crtc_redisplay__fallback(xf86CrtcPtr crtc, RegionPtr region, struct kgem_bo
+ 	     __FUNCTION__, format->format, depth, draw->bitsPerPixel,
+ 	     bo->pitch, crtc->mode.HDisplay, crtc->mode.VDisplay));
+ 
++	if (sx | sy)
++		RegionTranslate(region, sx, sy);
++	error = !sna_drawable_move_region_to_cpu(draw, region, MOVE_READ);
++	if (sx | sy)
++		RegionTranslate(region, -sx, -sy);
++	if (error)
++		return;
++
+ 	ptr = kgem_bo_map__gtt(&sna->kgem, bo);
+ 	if (ptr == NULL)
+ 		return;
+@@ -6683,9 +8539,37 @@ sna_crtc_redisplay__fallback(xf86CrtcPtr crtc, RegionPtr region, struct kgem_bo
+ 	if (!src)
+ 		goto free_pixmap;
+ 
+-	error = SetPictureTransform(src, &crtc->crtc_to_framebuffer);
+-	if (error)
+-		goto free_src;
++	pixman_transform_init_translate(&T, sx << 16, sy << 16);
++	pixman_transform_multiply(&T, &T, &crtc->crtc_to_framebuffer);
++	if (!sna_transform_is_integer_translation(&T, &sx, &sy)) {
++#define f2d(x) (((double)(x))/65536.)
++		DBG(("%s: transform=[[%f %f %f], [%f %f %f], [%f %f %f]] (raw [[%x %x %x], [%x %x %x], [%x %x %x]])\n",
++		     __FUNCTION__,
++		     f2d(T.matrix[0][0]),
++		     f2d(T.matrix[0][1]),
++		     f2d(T.matrix[0][2]),
++		     f2d(T.matrix[1][0]),
++		     f2d(T.matrix[1][1]),
++		     f2d(T.matrix[1][2]),
++		     f2d(T.matrix[2][0]),
++		     f2d(T.matrix[2][1]),
++		     f2d(T.matrix[2][2]),
++		     T.matrix[0][0],
++		     T.matrix[0][1],
++		     T.matrix[0][2],
++		     T.matrix[1][0],
++		     T.matrix[1][1],
++		     T.matrix[1][2],
++		     T.matrix[2][0],
++		     T.matrix[2][1],
++		     T.matrix[2][2]));
++#undef f2d
++
++		error = SetPictureTransform(src, &T);
++		if (error)
++			goto free_src;
++		sx = sy = 0;
++	}
+ 
+ 	if (crtc->filter && crtc->transform_in_use)
+ 		SetPicturePictFilter(src, crtc->filter,
+@@ -6733,10 +8617,11 @@ sna_crtc_redisplay__composite(xf86CrtcPtr crtc, RegionPtr region, struct kgem_bo
+ {
+ 	int16_t sx, sy;
+ 	struct sna *sna = to_sna(crtc->scrn);
+-	ScreenPtr screen = crtc->scrn->pScreen;
++	ScreenPtr screen = xf86ScrnToScreen(crtc->scrn);
+ 	DrawablePtr draw = crtc_source(crtc, &sx, &sy);
+ 	struct sna_composite_op tmp;
+ 	PictFormatPtr format;
++	PictTransform T;
+ 	PicturePtr src, dst;
+ 	PixmapPtr pixmap;
+ 	const BoxRec *b;
+@@ -6777,9 +8662,14 @@ sna_crtc_redisplay__composite(xf86CrtcPtr crtc, RegionPtr region, struct kgem_bo
+ 	if (!src)
+ 		goto free_pixmap;
+ 
+-	error = SetPictureTransform(src, &crtc->crtc_to_framebuffer);
+-	if (error)
+-		goto free_src;
++	pixman_transform_init_translate(&T, sx << 16, sy << 16);
++	pixman_transform_multiply(&T, &T, &crtc->crtc_to_framebuffer);
++	if (!sna_transform_is_integer_translation(&T, &sx, &sy)) {
++		error = SetPictureTransform(src, &T);
++		if (error)
++			goto free_src;
++		sx = sy = 0;
++	}
+ 
+ 	if (crtc->filter && crtc->transform_in_use)
+ 		SetPicturePictFilter(src, crtc->filter,
+@@ -6793,36 +8683,38 @@ sna_crtc_redisplay__composite(xf86CrtcPtr crtc, RegionPtr region, struct kgem_bo
+ 	ValidatePicture(src);
+ 	ValidatePicture(dst);
+ 
+-	if (!sna->render.composite(sna,
+-				   PictOpSrc, src, NULL, dst,
+-				   sx, sy,
+-				   0, 0,
+-				   0, 0,
+-				   crtc->mode.HDisplay, crtc->mode.VDisplay,
+-				   COMPOSITE_PARTIAL, memset(&tmp, 0, sizeof(tmp)))) {
+-		DBG(("%s: unsupported operation!\n", __FUNCTION__));
+-		sna_crtc_redisplay__fallback(crtc, region, bo);
+-		goto free_dst;
+-	}
+-
++	/* Composite each box individually as if we are dealing with a rotation
++	 * on a large display, we may have to perform intermediate copies. We
++	 * can then minimise the overdraw by looking at individual boxes rather
++	 * than the bbox.
++	 */
+ 	n = region_num_rects(region);
+ 	b = region_rects(region);
+ 	do {
+-		BoxRec box;
+-
+-		box = *b++;
++		BoxRec box = *b;
+ 		transformed_box(&box, crtc);
+ 
+ 		DBG(("%s: (%d, %d)x(%d, %d) -> (%d, %d), (%d, %d)\n",
+ 		     __FUNCTION__,
+-		     b[-1].x1, b[-1].y1, b[-1].x2-b[-1].x1, b[-1].y2-b[-1].y1,
++		     b->x1, b->y1, b->x2-b->x1, b->y2-b->y1,
+ 		     box.x1, box.y1, box.x2, box.y2));
+ 
+-		tmp.box(sna, &tmp, &box);
+-	} while (--n);
+-	tmp.done(sna, &tmp);
++		if (!sna->render.composite(sna,
++					   PictOpSrc, src, NULL, dst,
++					   sx + box.x1, sy + box.y1,
++					   0, 0,
++					   box.x1, box.y1,
++					   box.x2 - box.x1, box.y2 - box.y1,
++					   0, memset(&tmp, 0, sizeof(tmp)))) {
++			DBG(("%s: unsupported operation!\n", __FUNCTION__));
++			sna_crtc_redisplay__fallback(crtc, region, bo);
++			break;
++		} else {
++			tmp.box(sna, &tmp, &box);
++			tmp.done(sna, &tmp);
++		}
++	} while (b++, --n);
+ 
+-free_dst:
+ 	FreePicture(dst, None);
+ free_src:
+ 	FreePicture(src, None);
+@@ -6839,7 +8731,7 @@ sna_crtc_redisplay(xf86CrtcPtr crtc, RegionPtr region, struct kgem_bo *bo)
+ 	struct sna_pixmap *priv = sna_pixmap((PixmapPtr)draw);
+ 
+ 	DBG(("%s: crtc %d [pipe=%d], damage (%d, %d), (%d, %d) x %d\n",
+-	     __FUNCTION__, to_sna_crtc(crtc)->id, to_sna_crtc(crtc)->pipe,
++	     __FUNCTION__, sna_crtc_id(crtc), sna_crtc_pipe(crtc),
+ 	     region->extents.x1, region->extents.y1,
+ 	     region->extents.x2, region->extents.y2,
+ 	     region_num_rects(region)));
+@@ -6898,7 +8790,10 @@ sna_crtc_redisplay(xf86CrtcPtr crtc, RegionPtr region, struct kgem_bo *bo)
+ static void shadow_flip_handler(struct drm_event_vblank *e,
+ 				void *data)
+ {
+-	sna_mode_redisplay(data);
++	struct sna *sna = data;
++
++	if (!sna->mode.shadow_wait)
++		sna_mode_redisplay(sna);
+ }
+ 
+ void sna_shadow_set_crtc(struct sna *sna,
+@@ -6908,18 +8803,23 @@ void sna_shadow_set_crtc(struct sna *sna,
+ 	struct sna_crtc *sna_crtc = to_sna_crtc(crtc);
+ 	struct sna_pixmap *priv;
+ 
++	assert(sna_crtc);
+ 	DBG(("%s: setting shadow override for CRTC:%d to handle=%d\n",
+-	     __FUNCTION__, sna_crtc->id, bo->handle));
++	     __FUNCTION__, __sna_crtc_id(sna_crtc), bo->handle));
+ 
+ 	assert(sna->flags & SNA_TEAR_FREE);
+-	assert(sna_crtc);
+ 	assert(!sna_crtc->transform);
+ 
+ 	if (sna_crtc->client_bo != bo) {
+-		if (sna_crtc->client_bo)
++		if (sna_crtc->client_bo) {
++			assert(sna_crtc->client_bo->refcnt >= sna_crtc->client_bo->active_scanout);
++			sna_crtc->client_bo->active_scanout--;
+ 			kgem_bo_destroy(&sna->kgem, sna_crtc->client_bo);
++		}
+ 
+ 		sna_crtc->client_bo = kgem_bo_reference(bo);
++		sna_crtc->client_bo->active_scanout++;
++		assert(sna_crtc->client_bo->refcnt >= sna_crtc->client_bo->active_scanout);
+ 		sna_crtc_damage(crtc);
+ 	}
+ 
+@@ -6969,11 +8869,13 @@ void sna_shadow_unset_crtc(struct sna *sna,
+ 	struct sna_crtc *sna_crtc = to_sna_crtc(crtc);
+ 
+ 	DBG(("%s: clearin shadow override for CRTC:%d\n",
+-	     __FUNCTION__, sna_crtc->id));
++	     __FUNCTION__, __sna_crtc_id(sna_crtc)));
+ 
+ 	if (sna_crtc->client_bo == NULL)
+ 		return;
+ 
++	assert(sna_crtc->client_bo->refcnt >= sna_crtc->client_bo->active_scanout);
++	sna_crtc->client_bo->active_scanout--;
+ 	kgem_bo_destroy(&sna->kgem, sna_crtc->client_bo);
+ 	sna_crtc->client_bo = NULL;
+ 	list_del(&sna_crtc->shadow_link);
+@@ -6982,15 +8884,57 @@ void sna_shadow_unset_crtc(struct sna *sna,
+ 	sna_crtc_damage(crtc);
+ }
+ 
++static bool move_crtc_to_gpu(struct sna *sna)
++{
++	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
++	int i;
++
++	for (i = 0; i < sna->mode.num_real_crtc; i++) {
++		struct sna_crtc *crtc = to_sna_crtc(config->crtc[i]);
++		unsigned hint;
++
++		assert(crtc);
++
++		if (crtc->bo == NULL)
++			continue;
++
++		if (crtc->slave_pixmap)
++			continue;
++
++		if (crtc->client_bo)
++			continue;
++
++		if (crtc->shadow_bo)
++			continue;
++
++		hint = MOVE_READ | MOVE_ASYNC_HINT | __MOVE_SCANOUT;
++		if (sna->flags & SNA_TEAR_FREE)
++			hint |= __MOVE_FORCE;
++
++		DBG(("%s: CRTC %d [pipe=%d] requires frontbuffer\n",
++		     __FUNCTION__, __sna_crtc_id(crtc), __sna_crtc_pipe(crtc)));
++		return sna_pixmap_move_to_gpu(sna->front, hint);
++	}
++
++	return true;
++}
++
+ void sna_mode_redisplay(struct sna *sna)
+ {
+ 	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
+ 	RegionPtr region;
+ 	int i;
+ 
+-	if (!sna->mode.shadow_damage)
++	if (sna->mode.hidden) {
++		DBG(("%s: hidden outputs, skipping\n", __FUNCTION__));
++		return;
++	}
++
++	if (!sna->mode.shadow_enabled)
+ 		return;
+ 
++	assert(sna->mode.shadow_damage);
++
+ 	DBG(("%s: posting shadow damage? %d (flips pending? %d, mode reconfiguration pending? %d)\n",
+ 	     __FUNCTION__,
+ 	     !RegionNil(DamageRegion(sna->mode.shadow_damage)),
+@@ -7012,21 +8956,23 @@ void sna_mode_redisplay(struct sna *sna)
+ 	     region->extents.x2, region->extents.y2));
+ 
+ 	if (sna->mode.flip_active) {
+-		DamagePtr damage;
+-
+-		damage = sna->mode.shadow_damage;
+-		sna->mode.shadow_damage = NULL;
++		DBG(("%s: checking for %d outstanding flip completions\n",
++		     __FUNCTION__, sna->mode.flip_active));
+ 
++		sna->mode.dirty = true;
+ 		while (sna->mode.flip_active && sna_mode_wakeup(sna))
+ 			;
++		sna->mode.dirty = false;
+ 
+-		sna->mode.shadow_damage = damage;
++		DBG(("%s: now %d outstanding flip completions (enabled? %d)\n",
++		     __FUNCTION__,
++		     sna->mode.flip_active,
++		     sna->mode.shadow_enabled));
++		if (sna->mode.flip_active || !sna->mode.shadow_enabled)
++			return;
+ 	}
+ 
+-	if (sna->mode.flip_active)
+-		return;
+-
+-	if (wedged(sna) || !sna_pixmap_move_to_gpu(sna->front, MOVE_READ | MOVE_ASYNC_HINT | __MOVE_SCANOUT)) {
++	if (!move_crtc_to_gpu(sna)) {
+ 		DBG(("%s: forcing scanout update using the CPU\n", __FUNCTION__));
+ 		if (!sna_pixmap_move_to_cpu(sna->front, MOVE_READ))
+ 			return;
+@@ -7047,90 +8993,14 @@ void sna_mode_redisplay(struct sna *sna)
+ 			damage.data = NULL;
+ 			RegionIntersect(&damage, &damage, region);
+ 			if (!box_empty(&damage.extents)) {
+-				struct kgem_bo *bo = NULL;
+-
+ 				DBG(("%s: fallback intersects pipe=%d [(%d, %d), (%d, %d)]\n",
+-				     __FUNCTION__, sna_crtc->pipe,
++				     __FUNCTION__, __sna_crtc_pipe(sna_crtc),
+ 				     damage.extents.x1, damage.extents.y1,
+ 				     damage.extents.x2, damage.extents.y2));
+ 
+-				if (sna->flags & SNA_TEAR_FREE) {
+-					RegionRec new_damage;
+-
+-					RegionNull(&new_damage);
+-					RegionCopy(&new_damage, &damage);
+-
+-					bo = sna_crtc->client_bo;
+-					if (bo == NULL) {
+-						damage.extents = crtc->bounds;
+-						damage.data = NULL;
+-						bo = kgem_create_2d(&sna->kgem,
+-								crtc->mode.HDisplay,
+-								crtc->mode.VDisplay,
+-								crtc->scrn->bitsPerPixel,
+-								sna_crtc->bo->tiling,
+-								CREATE_SCANOUT);
+-					} else
+-						RegionUnion(&damage, &damage, &sna_crtc->client_damage);
+-
+-					DBG(("%s: TearFree fallback, shadow handle=%d, crtc handle=%d\n", __FUNCTION__, bo->handle, sna_crtc->bo->handle));
+-
+-					sna_crtc->client_damage = new_damage;
+-				}
+-
+-				if (bo == NULL)
+-					bo = sna_crtc->bo;
+-				sna_crtc_redisplay__fallback(crtc, &damage, bo);
+-
+-				if (bo != sna_crtc->bo) {
+-					struct drm_mode_crtc_page_flip arg;
+-
+-					arg.crtc_id = sna_crtc->id;
+-					arg.fb_id = get_fb(sna, bo,
+-							   crtc->mode.HDisplay,
+-							   crtc->mode.VDisplay);
+-
+-					arg.user_data = (uintptr_t)sna_crtc;
+-					arg.flags = DRM_MODE_PAGE_FLIP_EVENT;
+-					arg.reserved = 0;
+-
+-					if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_PAGE_FLIP, &arg)) {
+-						if (sna_crtc_flip(sna, sna_crtc, bo, 0, 0)) {
+-							assert(sna_crtc->bo->active_scanout);
+-							assert(sna_crtc->bo->refcnt >= sna_crtc->bo->active_scanout);
+-							sna_crtc->bo->active_scanout--;
+-							kgem_bo_destroy(&sna->kgem, sna_crtc->bo);
+-
+-							sna_crtc->bo = bo;
+-							sna_crtc->bo->active_scanout++;
+-							sna_crtc->client_bo = NULL;
+-						} else {
+-							DBG(("%s: flip [fb=%d] on crtc %d [%d, pipe=%d] failed - %d\n",
+-							     __FUNCTION__, arg.fb_id, i, sna_crtc->id, sna_crtc->pipe, errno));
+-							xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
+-								   "Page flipping failed, disabling TearFree\n");
+-							sna->flags &= ~SNA_TEAR_FREE;
+-
+-							damage.extents = crtc->bounds;
+-							damage.data = NULL;
+-							sna_crtc_redisplay__fallback(crtc, &damage, sna_crtc->bo);
+-
+-							kgem_bo_destroy(&sna->kgem, bo);
+-							sna_crtc->client_bo = NULL;
+-						}
+-					} else {
+-						sna->mode.flip_active++;
+-
+-						assert(sna_crtc->flip_bo == NULL);
+-						sna_crtc->flip_handler = shadow_flip_handler;
+-						sna_crtc->flip_data = sna;
+-						sna_crtc->flip_bo = bo;
+-						sna_crtc->flip_bo->active_scanout++;
+-						sna_crtc->flip_serial = sna_crtc->mode_serial;
+-
+-						sna_crtc->client_bo = kgem_bo_reference(sna_crtc->bo);
+-					}
+-				}
++				sna_crtc_redisplay__fallback(crtc,
++							     &damage,
++							     sna_crtc->bo);
+ 			}
+ 			RegionUninit(&damage);
+ 
+@@ -7171,6 +9041,7 @@ void sna_mode_redisplay(struct sna *sna)
+ 		xf86CrtcPtr crtc = config->crtc[i];
+ 		struct sna_crtc *sna_crtc = to_sna_crtc(crtc);
+ 		RegionRec damage;
++		int sigio;
+ 
+ 		assert(sna_crtc != NULL);
+ 		DBG(("%s: crtc[%d] transformed? %d\n",
+@@ -7192,30 +9063,38 @@ void sna_mode_redisplay(struct sna *sna)
+ 		     region_num_rects(&damage),
+ 		     damage.extents.x1, damage.extents.y1,
+ 		     damage.extents.x2, damage.extents.y2));
++		sigio = sigio_block();
+ 		if (!box_empty(&damage.extents)) {
+ 			if (sna->flags & SNA_TEAR_FREE) {
++				RegionRec new_damage;
+ 				struct drm_mode_crtc_page_flip arg;
+ 				struct kgem_bo *bo;
+ 
+-				RegionUninit(&damage);
+-				damage.extents = crtc->bounds;
+-				damage.data = NULL;
++				RegionNull(&new_damage);
++				RegionCopy(&new_damage, &damage);
+ 
+-				bo = sna_crtc->client_bo;
+-				if (bo == NULL)
++				bo = sna_crtc->cache_bo;
++				if (bo == NULL) {
++					damage.extents = crtc->bounds;
++					damage.data = NULL;
+ 					bo = kgem_create_2d(&sna->kgem,
+ 							    crtc->mode.HDisplay,
+ 							    crtc->mode.VDisplay,
+ 							    crtc->scrn->bitsPerPixel,
+ 							    sna_crtc->bo->tiling,
+ 							    CREATE_SCANOUT);
+-				if (bo == NULL)
+-					goto disable1;
++					if (bo == NULL)
++						continue;
++				} else
++					RegionUnion(&damage, &damage, &sna_crtc->crtc_damage);
++				sna_crtc->crtc_damage = new_damage;
+ 
+ 				sna_crtc_redisplay(crtc, &damage, bo);
+ 				kgem_bo_submit(&sna->kgem, bo);
++				__kgem_bo_clear_dirty(bo);
+ 
+-				arg.crtc_id = sna_crtc->id;
++				assert_crtc_fb(sna, sna_crtc);
++				arg.crtc_id = __sna_crtc_id(sna_crtc);
+ 				arg.fb_id = get_fb(sna, bo,
+ 						   crtc->mode.HDisplay,
+ 						   crtc->mode.VDisplay);
+@@ -7228,6 +9107,9 @@ void sna_mode_redisplay(struct sna *sna)
+ 
+ 				if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_PAGE_FLIP, &arg)) {
+ 					if (sna_crtc_flip(sna, sna_crtc, bo, 0, 0)) {
++						DBG(("%s: removing handle=%d [active_scanout=%d] from scanout, installing handle=%d [active_scanout=%d]\n",
++						     __FUNCTION__, sna_crtc->bo->handle, sna_crtc->bo->active_scanout - 1,
++						     bo->handle, bo->active_scanout));
+ 						assert(sna_crtc->bo->active_scanout);
+ 						assert(sna_crtc->bo->refcnt >= sna_crtc->bo->active_scanout);
+ 						sna_crtc->bo->active_scanout--;
+@@ -7235,13 +9117,12 @@ void sna_mode_redisplay(struct sna *sna)
+ 
+ 						sna_crtc->bo = kgem_bo_reference(bo);
+ 						sna_crtc->bo->active_scanout++;
+-						sna_crtc->client_bo = kgem_bo_reference(bo);
+ 					} else {
+ 						BoxRec box;
+ 						DrawableRec tmp;
+ 
+ 						DBG(("%s: flip [fb=%d] on crtc %d [%d, pipe=%d] failed - %d\n",
+-						     __FUNCTION__, arg.fb_id, i, sna_crtc->id, sna_crtc->pipe, errno));
++						     __FUNCTION__, arg.fb_id, i, __sna_crtc_id(sna_crtc), __sna_crtc_pipe(sna_crtc), errno));
+ 						xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
+ 							   "Page flipping failed, disabling TearFree\n");
+ 						sna->flags &= ~SNA_TEAR_FREE;
+@@ -7260,13 +9141,13 @@ disable1:
+ 									    &box, 1, COPY_LAST)) {
+ 							xf86DrvMsg(crtc->scrn->scrnIndex, X_ERROR,
+ 								   "%s: page flipping failed, disabling CRTC:%d (pipe=%d)\n",
+-								   __FUNCTION__, sna_crtc->id, sna_crtc->pipe);
+-							sna_crtc_disable(crtc);
++								   __FUNCTION__, __sna_crtc_id(sna_crtc), __sna_crtc_pipe(sna_crtc));
++							sna_crtc_disable(crtc, false);
+ 						}
+-
+-						kgem_bo_destroy(&sna->kgem, bo);
+-						sna_crtc->client_bo = NULL;
+ 					}
++
++					kgem_bo_destroy(&sna->kgem, bo);
++					sna_crtc->cache_bo = NULL;
+ 					continue;
+ 				}
+ 				sna->mode.flip_active++;
+@@ -7279,13 +9160,20 @@ disable1:
+ 				sna_crtc->flip_serial = sna_crtc->mode_serial;
+ 				sna_crtc->flip_pending = true;
+ 
+-				sna_crtc->client_bo = kgem_bo_reference(sna_crtc->bo);
++				if (sna_crtc->bo != sna->mode.shadow) {
++					assert_scanout(&sna->kgem, sna_crtc->bo,
++						       crtc->mode.HDisplay, crtc->mode.VDisplay);
++					sna_crtc->cache_bo = kgem_bo_reference(sna_crtc->bo);
++				}
++				DBG(("%s: recording flip on CRTC:%d handle=%d, active_scanout=%d, serial=%d\n",
++				     __FUNCTION__, __sna_crtc_id(sna_crtc), sna_crtc->flip_bo->handle, sna_crtc->flip_bo->active_scanout, sna_crtc->flip_serial));
+ 			} else {
+ 				sna_crtc_redisplay(crtc, &damage, sna_crtc->bo);
+ 				kgem_scanout_flush(&sna->kgem, sna_crtc->bo);
+ 			}
+ 		}
+ 		RegionUninit(&damage);
++		sigio_unblock(sigio);
+ 
+ 		if (sna_crtc->slave_damage)
+ 			DamageEmpty(sna_crtc->slave_damage);
+@@ -7296,6 +9184,7 @@ disable1:
+ 		struct kgem_bo *old = sna->mode.shadow;
+ 		struct drm_mode_crtc_page_flip arg;
+ 		uint32_t fb = 0;
++		int sigio;
+ 
+ 		DBG(("%s: flipping TearFree outputs, current scanout handle=%d [active?=%d], new handle=%d [active=%d]\n",
+ 		     __FUNCTION__, old->handle, old->active_scanout, new->handle, new->active_scanout));
+@@ -7307,7 +9196,9 @@ disable1:
+ 		arg.reserved = 0;
+ 
+ 		kgem_bo_submit(&sna->kgem, new);
++		__kgem_bo_clear_dirty(new);
+ 
++		sigio = sigio_block();
+ 		for (i = 0; i < sna->mode.num_real_crtc; i++) {
+ 			struct sna_crtc *crtc = config->crtc[i]->driver_private;
+ 			struct kgem_bo *flip_bo;
+@@ -7315,20 +9206,20 @@ disable1:
+ 
+ 			assert(crtc != NULL);
+ 			DBG(("%s: crtc %d [%d, pipe=%d] active? %d, transformed? %d\n",
+-			     __FUNCTION__, i, crtc->id, crtc->pipe, crtc->bo ? crtc->bo->handle : 0, crtc->transform));
++			     __FUNCTION__, i, __sna_crtc_id(crtc), __sna_crtc_pipe(crtc), crtc->bo ? crtc->bo->handle : 0, crtc->transform));
+ 			if (crtc->bo == NULL || crtc->transform)
+ 				continue;
+ 
+ 			assert(config->crtc[i]->enabled);
+-			assert(crtc->dpms_mode <= DPMSModeOn);
+ 			assert(crtc->flip_bo == NULL);
++			assert_crtc_fb(sna, crtc);
+ 
+-			arg.crtc_id = crtc->id;
++			arg.crtc_id = __sna_crtc_id(crtc);
+ 			arg.user_data = (uintptr_t)crtc;
+ 
+ 			if (crtc->client_bo) {
+ 				DBG(("%s: apply shadow override bo for CRTC:%d on pipe=%d, handle=%d\n",
+-				     __FUNCTION__, crtc->id, crtc->pipe, crtc->client_bo->handle));
++				     __FUNCTION__, __sna_crtc_id(crtc), __sna_crtc_pipe(crtc), crtc->client_bo->handle));
+ 				arg.fb_id = get_fb(sna, crtc->client_bo,
+ 						   crtc->base->mode.HDisplay,
+ 						   crtc->base->mode.VDisplay);
+@@ -7356,6 +9247,7 @@ fixup_shadow:
+ 						}
+ 					}
+ 
++					sigio_unblock(sigio);
+ 					return;
+ 				}
+ 
+@@ -7365,8 +9257,12 @@ fixup_shadow:
+ 				y = crtc->base->y;
+ 			}
+ 
+-			if (crtc->bo == flip_bo)
++			if (crtc->bo == flip_bo) {
++				assert(crtc->bo->refcnt >= crtc->bo->active_scanout);
++				DBG(("%s: flip handle=%d is already on the CRTC\n",
++				     __FUNCTION__, flip_bo->handle));
+ 				continue;
++			}
+ 
+ 			if (flip_bo->pitch != crtc->bo->pitch || (y << 16 | x)  != crtc->offset) {
+ 				DBG(("%s: changing pitch (new %d =?= old %d) or offset (new %x =?= old %x)\n",
+@@ -7375,6 +9271,9 @@ fixup_shadow:
+ 				     y << 16 | x, crtc->offset));
+ fixup_flip:
+ 				if (sna_crtc_flip(sna, crtc, flip_bo, x, y)) {
++					DBG(("%s: removing handle=%d [active_scanout=%d] from scanout, installing handle=%d [active_scanout=%d]\n",
++					     __FUNCTION__, crtc->bo->handle, crtc->bo->active_scanout-1,
++					     flip_bo->handle, flip_bo->active_scanout));
+ 					assert(flip_bo != crtc->bo);
+ 					assert(crtc->bo->active_scanout);
+ 					assert(crtc->bo->refcnt >= crtc->bo->active_scanout);
+@@ -7389,9 +9288,11 @@ fixup_flip:
+ 					crtc->bo = kgem_bo_reference(flip_bo);
+ 					crtc->bo->active_scanout++;
+ 				} else {
+-					xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
+-						   "Failed to prepare CRTC for page flipping, disabling TearFree\n");
+-					sna->flags &= ~SNA_TEAR_FREE;
++					if (sna->flags & SNA_TEAR_FREE) {
++						xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
++								"Failed to prepare CRTC for page flipping, disabling TearFree\n");
++						sna->flags &= ~SNA_TEAR_FREE;
++					}
+ 
+ 					if (sna->mode.flip_active == 0) {
+ 						DBG(("%s: abandoning flip attempt\n", __FUNCTION__));
+@@ -7400,15 +9301,15 @@ fixup_flip:
+ 
+ 					xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
+ 						   "%s: page flipping failed, disabling CRTC:%d (pipe=%d)\n",
+-						   __FUNCTION__, crtc->id, crtc->pipe);
+-					sna_crtc_disable(crtc->base);
++						   __FUNCTION__, __sna_crtc_id(crtc), __sna_crtc_pipe(crtc));
++					sna_crtc_disable(crtc->base, false);
+ 				}
+ 				continue;
+ 			}
+ 
+ 			if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_PAGE_FLIP, &arg)) {
+ 				ERR(("%s: flip [fb=%d] on crtc %d [%d, pipe=%d] failed - %d\n",
+-				     __FUNCTION__, arg.fb_id, i, crtc->id, crtc->pipe, errno));
++				     __FUNCTION__, arg.fb_id, i, __sna_crtc_id(crtc), __sna_crtc_pipe(crtc), errno));
+ 				goto fixup_flip;
+ 			}
+ 			sna->mode.flip_active++;
+@@ -7421,6 +9322,9 @@ fixup_flip:
+ 			crtc->flip_serial = crtc->mode_serial;
+ 			crtc->flip_pending = true;
+ 
++			DBG(("%s: recording flip on CRTC:%d handle=%d, active_scanout=%d, serial=%d\n",
++			     __FUNCTION__, __sna_crtc_id(crtc), crtc->flip_bo->handle, crtc->flip_bo->active_scanout, crtc->flip_serial));
++
+ 			{
+ 				struct drm_i915_gem_busy busy = { flip_bo->handle };
+ 				if (drmIoctl(sna->kgem.fd, DRM_IOCTL_I915_GEM_BUSY, &busy) == 0) {
+@@ -7435,6 +9339,7 @@ fixup_flip:
+ 				}
+ 			}
+ 		}
++		sigio_unblock(sigio);
+ 
+ 		DBG(("%s: flipped %d outputs, shadow active? %d\n",
+ 		     __FUNCTION__,
+@@ -7486,7 +9391,9 @@ again:
+ 		struct drm_event *e = (struct drm_event *)&buffer[i];
+ 		switch (e->type) {
+ 		case DRM_EVENT_VBLANK:
+-			if (((uintptr_t)((struct drm_event_vblank *)e)->user_data) & 2)
++			if (sna->mode.shadow_wait)
++				defer_event(sna, e);
++			else if (((uintptr_t)((struct drm_event_vblank *)e)->user_data) & 2)
+ 				sna_present_vblank_handler((struct drm_event_vblank *)e);
+ 			else
+ 				sna_dri2_vblank_handler((struct drm_event_vblank *)e);
+@@ -7495,13 +9402,19 @@ again:
+ 			{
+ 				struct drm_event_vblank *vbl = (struct drm_event_vblank *)e;
+ 				struct sna_crtc *crtc = (void *)(uintptr_t)vbl->user_data;
++				uint64_t msc;
+ 
+ 				/* Beware Zaphod! */
+ 				sna = to_sna(crtc->base->scrn);
+ 
+-				crtc->swap.tv_sec = vbl->tv_sec;
+-				crtc->swap.tv_usec = vbl->tv_usec;
+-				crtc->swap.msc = msc64(crtc, vbl->sequence);
++				if (msc64(crtc, vbl->sequence, &msc)) {
++					DBG(("%s: recording last swap on pipe=%d, frame %d [%08llx], time %d.%06d\n",
++					     __FUNCTION__, __sna_crtc_pipe(crtc), vbl->sequence, (long long)msc, vbl->tv_sec, vbl->tv_usec));
++					crtc->swap.tv_sec = vbl->tv_sec;
++					crtc->swap.tv_usec = vbl->tv_usec;
++					crtc->swap.msc = msc;
++				}
++				assert(crtc->flip_pending);
+ 				crtc->flip_pending = false;
+ 
+ 				assert(crtc->flip_bo);
+@@ -7509,10 +9422,12 @@ again:
+ 				assert(crtc->flip_bo->refcnt >= crtc->flip_bo->active_scanout);
+ 
+ 				if (crtc->flip_serial == crtc->mode_serial) {
+-					DBG(("%s: removing handle=%d from scanout, installing handle=%d\n",
+-					     __FUNCTION__, crtc->bo->handle, crtc->flip_bo->handle));
++					DBG(("%s: removing handle=%d [active_scanout=%d] from scanout, installing handle=%d [active_scanout=%d]\n",
++					     __FUNCTION__, crtc->bo->handle, crtc->bo->active_scanout - 1,
++					     crtc->flip_bo->handle, crtc->flip_bo->active_scanout));
+ 					assert(crtc->bo->active_scanout);
+ 					assert(crtc->bo->refcnt >= crtc->bo->active_scanout);
++
+ 					crtc->bo->active_scanout--;
+ 					kgem_bo_destroy(&sna->kgem, crtc->bo);
+ 
+@@ -7523,6 +9438,8 @@ again:
+ 
+ 					crtc->bo = crtc->flip_bo;
+ 					crtc->flip_bo = NULL;
++
++					assert_crtc_fb(sna, crtc);
+ 				} else {
+ 					crtc->flip_bo->active_scanout--;
+ 					kgem_bo_destroy(&sna->kgem, crtc->flip_bo);
+@@ -7531,8 +9448,10 @@ again:
+ 
+ 				DBG(("%s: flip complete, pending? %d\n", __FUNCTION__, sna->mode.flip_active));
+ 				assert(sna->mode.flip_active);
+-				if (--sna->mode.flip_active == 0)
++				if (--sna->mode.flip_active == 0) {
++					assert(crtc->flip_handler);
+ 					crtc->flip_handler(vbl, crtc->flip_data);
++				}
+ 			}
+ 			break;
+ 		default:
+diff --git a/src/sna/sna_display_fake.c b/src/sna/sna_display_fake.c
+index 4d74c38d..fa26bda1 100644
+--- a/src/sna/sna_display_fake.c
++++ b/src/sna/sna_display_fake.c
+@@ -96,12 +96,6 @@ sna_crtc_set_mode_major(xf86CrtcPtr crtc, DisplayModePtr mode,
+ }
+ 
+ static void
+-sna_crtc_gamma_set(xf86CrtcPtr crtc,
+-		       CARD16 *red, CARD16 *green, CARD16 *blue, int size)
+-{
+-}
+-
+-static void
+ sna_crtc_destroy(xf86CrtcPtr crtc)
+ {
+ }
+@@ -109,7 +103,6 @@ sna_crtc_destroy(xf86CrtcPtr crtc)
+ static const xf86CrtcFuncsRec sna_crtc_funcs = {
+ 	.dpms = sna_crtc_dpms,
+ 	.set_mode_major = sna_crtc_set_mode_major,
+-	.gamma_set = sna_crtc_gamma_set,
+ 	.destroy = sna_crtc_destroy,
+ };
+ 
+@@ -192,7 +185,7 @@ static const xf86OutputFuncsRec sna_output_funcs = {
+ static Bool
+ sna_mode_resize(ScrnInfoPtr scrn, int width, int height)
+ {
+-	ScreenPtr screen = scrn->pScreen;
++	ScreenPtr screen = xf86ScrnToScreen(scrn);
+ 	PixmapPtr new_front;
+ 
+ 	DBG(("%s (%d, %d) -> (%d, %d)\n", __FUNCTION__,
+@@ -262,6 +255,7 @@ static bool add_fake_output(struct sna *sna, bool late)
+ 	output->mm_height = 0;
+ 	output->interlaceAllowed = FALSE;
+ 	output->subpixel_order = SubPixelNone;
++	output->status = XF86OutputStatusDisconnected;
+ 
+ 	output->possible_crtcs = ~((1 << sna->mode.num_real_crtc) - 1);
+ 	output->possible_clones = ~((1 << sna->mode.num_real_output) - 1);
+@@ -297,6 +291,8 @@ static bool add_fake_output(struct sna *sna, bool late)
+ 
+ 		RRCrtcSetRotations(crtc->randr_crtc,
+ 				   RR_Rotate_All | RR_Reflect_All);
++		if (!RRCrtcGammaSetSize(crtc->randr_crtc, 256))
++			goto err;
+ 	}
+ 
+ 	sna->mode.num_fake++;
+@@ -312,13 +308,16 @@ err:
+ 			continue;
+ 
+ 		xf86OutputDestroy(output);
++		i--;
+ 	}
+ 
+ 	for (i = 0; i < xf86_config->num_crtc; i++) {
+ 		crtc = xf86_config->crtc[i];
+ 		if (crtc->driver_private)
+ 			continue;
++
+ 		xf86CrtcDestroy(crtc);
++		i--;
+ 	}
+ 	sna->mode.num_fake = -1;
+ 	return false;
+diff --git a/src/sna/sna_dri2.c b/src/sna/sna_dri2.c
+index e5c4d53e..d89525cc 100644
+--- a/src/sna/sna_dri2.c
++++ b/src/sna/sna_dri2.c
+@@ -82,12 +82,23 @@ get_private(void *buffer)
+ 	return (struct sna_dri2_private *)((DRI2Buffer2Ptr)buffer+1);
+ }
+ 
++pure static inline DRI2BufferPtr sna_pixmap_get_buffer(PixmapPtr pixmap)
++{
++	assert(pixmap->refcnt);
++	return ((void **)__get_private(pixmap, sna_pixmap_key))[2];
++}
++
++static inline void sna_pixmap_set_buffer(PixmapPtr pixmap, void *ptr)
++{
++	assert(pixmap->refcnt);
++	((void **)__get_private(pixmap, sna_pixmap_key))[2] = ptr;
++}
++
+ #if DRI2INFOREC_VERSION >= 4
+ enum event_type {
+ 	WAITMSC = 0,
+ 	SWAP,
+-	SWAP_WAIT,
+-	SWAP_THROTTLE,
++	SWAP_COMPLETE,
+ 	FLIP,
+ 	FLIP_THROTTLE,
+ 	FLIP_COMPLETE,
+@@ -98,6 +109,7 @@ struct dri_bo {
+ 	struct list link;
+ 	struct kgem_bo *bo;
+ 	uint32_t name;
++	unsigned flags;
+ };
+ 
+ struct sna_dri2_event {
+@@ -108,6 +120,8 @@ struct sna_dri2_event {
+ 	xf86CrtcPtr crtc;
+ 	int pipe;
+ 	bool queued;
++	bool sync;
++	bool chained;
+ 
+ 	/* for swaps & flips only */
+ 	DRI2SwapEventPtr event_complete;
+@@ -116,35 +130,146 @@ struct sna_dri2_event {
+ 	DRI2BufferPtr back;
+ 	struct kgem_bo *bo;
+ 
++	struct copy {
++		struct kgem_bo *bo;
++		unsigned flags;
++		uint32_t name;
++		uint32_t size;
++	} pending;
++
+ 	struct sna_dri2_event *chain;
+ 
+-	struct list cache;
+ 	struct list link;
+ 
+-	int mode;
++	int flip_continue;
++	int keepalive;
++	int signal;
+ };
+ 
++#if DRI2INFOREC_VERSION < 10
++#undef USE_ASYNC_SWAP
++#endif
++
++#if USE_ASYNC_SWAP
++#define KEEPALIVE 8 /* wait ~100ms before discarding swap caches */
++#define APPLY_DAMAGE 0
++#else
++#define USE_ASYNC_SWAP 0
++#define KEEPALIVE 1
++#define APPLY_DAMAGE 1
++#endif
++
+ static void sna_dri2_flip_event(struct sna_dri2_event *flip);
++inline static DRI2BufferPtr dri2_window_get_front(WindowPtr win);
++
++static struct kgem_bo *
++__sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
++		      DRI2BufferPtr src, DRI2BufferPtr dst,
++		      unsigned flags);
++
++inline static void
++__sna_dri2_copy_event(struct sna_dri2_event *info, unsigned flags)
++{
++	DBG(("%s: flags = %x\n", __FUNCTION__, flags));
++	assert(info->front != info->back);
++	info->bo = __sna_dri2_copy_region(info->sna, info->draw, NULL,
++					  info->back, info->front,
++					  flags);
++	info->front->flags = info->back->flags;
++}
++
++static int front_pitch(DrawablePtr draw)
++{
++	DRI2BufferPtr buffer;
++
++	buffer = NULL;
++	if (draw->type != DRAWABLE_PIXMAP)
++		buffer = dri2_window_get_front((WindowPtr)draw);
++	if (buffer == NULL)
++		buffer = sna_pixmap_get_buffer(get_drawable_pixmap(draw));
++
++	return buffer ? buffer->pitch : 0;
++}
++
++struct dri2_window {
++	DRI2BufferPtr front;
++	struct sna_dri2_event *chain;
++	xf86CrtcPtr crtc;
++	int64_t msc_delta;
++	struct list cache;
++	uint32_t cache_size;
++	int scanout;
++};
++
++static struct dri2_window *dri2_window(WindowPtr win)
++{
++	assert(win->drawable.type != DRAWABLE_PIXMAP);
++	return ((void **)__get_private(win, sna_window_key))[1];
++}
++
++static bool use_scanout(struct sna *sna,
++			DrawablePtr draw,
++			struct dri2_window *priv)
++{
++	if (priv->front)
++		return true;
++
++	if (priv->scanout < 0)
++		priv->scanout =
++			(sna->flags & (SNA_LINEAR_FB | SNA_NO_WAIT | SNA_NO_FLIP)) == 0 &&
++			draw->width  == sna->front->drawable.width &&
++			draw->height == sna->front->drawable.height &&
++			draw->bitsPerPixel == sna->front->drawable.bitsPerPixel;
++
++	return priv->scanout;
++}
+ 
+ static void
+ sna_dri2_get_back(struct sna *sna,
+ 		  DrawablePtr draw,
+-		  DRI2BufferPtr back,
+-		  struct sna_dri2_event *info)
++		  DRI2BufferPtr back)
+ {
++	struct dri2_window *priv = dri2_window((WindowPtr)draw);
++	uint32_t size;
+ 	struct kgem_bo *bo;
++	struct dri_bo *c;
+ 	uint32_t name;
++	int flags;
+ 	bool reuse;
+ 
+-	DBG(("%s: draw size=%dx%d, buffer size=%dx%d\n",
++	DBG(("%s: draw size=%dx%d, back buffer handle=%d size=%dx%d, is-scanout? %d, active?=%d, pitch=%d, front pitch=%d\n",
+ 	     __FUNCTION__, draw->width, draw->height,
+-	     get_private(back)->size & 0xffff, get_private(back)->size >> 16));
+-	reuse = (draw->height << 16 | draw->width) == get_private(back)->size;
++	     get_private(back)->bo->handle,
++	     get_private(back)->size & 0xffff, get_private(back)->size >> 16,
++	     get_private(back)->bo->scanout,
++	     get_private(back)->bo->active_scanout,
++	     back->pitch, front_pitch(draw)));
++	assert(priv);
++
++	size = draw->height << 16 | draw->width;
++	if (size != priv->cache_size) {
++		while (!list_is_empty(&priv->cache)) {
++			c = list_first_entry(&priv->cache, struct dri_bo, link);
++			list_del(&c->link);
++
++			DBG(("%s: releasing cached handle=%d\n", __FUNCTION__, c->bo ? c->bo->handle : 0));
++			assert(c->bo);
++			kgem_bo_destroy(&sna->kgem, c->bo);
++
++			free(c);
++		}
++		priv->cache_size = size;
++	}
++
++	reuse = size == get_private(back)->size;
++	if (reuse)
++		reuse = get_private(back)->bo->scanout == use_scanout(sna, draw, priv);
++	DBG(("%s: reuse backbuffer? %d\n", __FUNCTION__, reuse));
+ 	if (reuse) {
+ 		bo = get_private(back)->bo;
+ 		assert(bo->refcnt);
+-		DBG(("%s: back buffer handle=%d, scanout?=%d, refcnt=%d\n",
+-					__FUNCTION__, bo->handle, bo->active_scanout, get_private(back)->refcnt));
++		DBG(("%s: back buffer handle=%d, active?=%d, refcnt=%d\n",
++		     __FUNCTION__, bo->handle, bo->active_scanout, get_private(back)->refcnt));
+ 		if (bo->active_scanout == 0) {
+ 			DBG(("%s: reuse unattached back\n", __FUNCTION__));
+ 			get_private(back)->stale = false;
+@@ -153,24 +278,37 @@ sna_dri2_get_back(struct sna *sna,
+ 	}
+ 
+ 	bo = NULL;
+-	if (info) {
+-		struct dri_bo *c;
+-		list_for_each_entry(c, &info->cache, link) {
+-			if (c->bo && c->bo->scanout == 0) {
+-				bo = c->bo;
+-				name = c->name;
+-				DBG(("%s: reuse cache handle=%d\n", __FUNCTION__, bo->handle));
+-				list_move_tail(&c->link, &info->cache);
+-				c->bo = NULL;
++	list_for_each_entry(c, &priv->cache, link) {
++		DBG(("%s: cache: handle=%d, active=%d\n",
++		     __FUNCTION__, c->bo ? c->bo->handle : 0, c->bo ? c->bo->active_scanout : -1));
++		assert(c->bo);
++		if (c->bo->active_scanout == 0) {
++			_list_del(&c->link);
++			if (c->bo == NULL) {
++				free(c);
++				goto out;
+ 			}
++			bo = c->bo;
++			name = c->name;
++			flags = c->flags;
++			DBG(("%s: reuse cache handle=%d, name=%d, flags=%d\n", __FUNCTION__, bo->handle, name, flags));
++			c->bo = NULL;
++			break;
+ 		}
+ 	}
+ 	if (bo == NULL) {
+ 		DBG(("%s: allocating new backbuffer\n", __FUNCTION__));
++		flags = CREATE_EXACT;
++
++		if (use_scanout(sna, draw, priv)) {
++			DBG(("%s: requesting scanout compatible back\n", __FUNCTION__));
++			flags |= CREATE_SCANOUT;
++		}
++
+ 		bo = kgem_create_2d(&sna->kgem,
+ 				    draw->width, draw->height, draw->bitsPerPixel,
+ 				    get_private(back)->bo->tiling,
+-				    get_private(back)->bo->scanout ? CREATE_SCANOUT : 0);
++				    flags);
+ 		if (bo == NULL)
+ 			return;
+ 
+@@ -179,30 +317,42 @@ sna_dri2_get_back(struct sna *sna,
+ 			kgem_bo_destroy(&sna->kgem, bo);
+ 			return;
+ 		}
++
++		flags = 0;
++		if (USE_ASYNC_SWAP && back->flags) {
++			BoxRec box;
++
++			box.x1 = 0;
++			box.y1 = 0;
++			box.x2 = draw->width;
++			box.y2 = draw->height;
++
++			DBG(("%s: filling new buffer with old back\n", __FUNCTION__));
++			if (sna->render.copy_boxes(sna, GXcopy,
++						   draw, get_private(back)->bo, 0, 0,
++						   draw, bo, 0, 0,
++						   &box, 1, COPY_LAST | COPY_DRI))
++				flags = back->flags;
++		}
+ 	}
+ 	assert(bo->active_scanout == 0);
+ 
+-	if (info && reuse) {
+-		bool found = false;
+-		struct dri_bo *c;
+-
+-		list_for_each_entry_reverse(c, &info->cache, link) {
+-			if (c->bo == NULL) {
+-				found = true;
+-				_list_del(&c->link);
+-				break;
+-			}
+-		}
+-		if (!found)
++	if (reuse && get_private(back)->bo->refcnt == 1 + get_private(back)->bo->active_scanout) {
++		if (&c->link == &priv->cache)
+ 			c = malloc(sizeof(*c));
+ 		if (c != NULL) {
+ 			c->bo = ref(get_private(back)->bo);
+ 			c->name = back->name;
+-			list_add(&c->link, &info->cache);
+-			DBG(("%s: cacheing handle=%d (name=%d)\n", __FUNCTION__, c->bo->handle, c->name));
++			c->flags = back->flags;
++			list_add(&c->link, &priv->cache);
++			DBG(("%s: caching handle=%d (name=%d, flags=%d, active_scanout=%d)\n", __FUNCTION__, c->bo->handle, c->name, c->flags, c->bo->active_scanout));
+ 		}
++	} else {
++		if (&c->link != &priv->cache)
++			free(c);
+ 	}
+ 
++	assert(bo->active_scanout == 0);
+ 	assert(bo != get_private(back)->bo);
+ 	kgem_bo_destroy(&sna->kgem, get_private(back)->bo);
+ 
+@@ -210,21 +360,13 @@ sna_dri2_get_back(struct sna *sna,
+ 	get_private(back)->size = draw->height << 16 | draw->width;
+ 	back->pitch = bo->pitch;
+ 	back->name = name;
++	back->flags = flags;
+ 
+-	get_private(back)->stale = false;
+-}
+-
+-struct dri2_window {
+-	DRI2BufferPtr front;
+-	struct sna_dri2_event *chain;
+-	xf86CrtcPtr crtc;
+-	int64_t msc_delta;
+-};
++	assert(back->pitch);
++	assert(back->name);
+ 
+-static struct dri2_window *dri2_window(WindowPtr win)
+-{
+-	assert(win->drawable.type != DRAWABLE_PIXMAP);
+-	return ((void **)__get_private(win, sna_window_key))[1];
++out:
++	get_private(back)->stale = false;
+ }
+ 
+ static struct sna_dri2_event *
+@@ -232,21 +374,25 @@ dri2_chain(DrawablePtr d)
+ {
+ 	struct dri2_window *priv = dri2_window((WindowPtr)d);
+ 	assert(priv != NULL);
++	assert(priv->chain == NULL || priv->chain->chained);
+ 	return priv->chain;
+ }
+ inline static DRI2BufferPtr dri2_window_get_front(WindowPtr win)
+ {
+ 	struct dri2_window *priv = dri2_window(win);
++	assert(priv->front == NULL || get_private(priv->front)->bo->active_scanout);
+ 	return priv ? priv->front : NULL;
+ }
+ #else
+ inline static void *dri2_window_get_front(WindowPtr win) { return NULL; }
++#define APPLY_DAMAGE 1
+ #endif
+ 
+ #if DRI2INFOREC_VERSION < 6
+ 
+ #define xorg_can_triple_buffer() 0
+ #define swap_limit(d, l) false
++#define mark_stale(b)
+ 
+ #else
+ 
+@@ -273,6 +419,8 @@ mark_stale(DRI2BufferPtr back)
+ 	 * stale frame. (This is mostly useful for tracking down
+ 	 * driver bugs!)
+ 	 */
++	DBG(("%s(handle=%d) => %d\n", __FUNCTION__,
++	     get_private(back)->bo->handle, xorg_can_triple_buffer()));
+ 	get_private(back)->stale = xorg_can_triple_buffer();
+ }
+ 
+@@ -286,21 +434,29 @@ sna_dri2_swap_limit_validate(DrawablePtr draw, int swap_limit)
+ static void
+ sna_dri2_reuse_buffer(DrawablePtr draw, DRI2BufferPtr buffer)
+ {
++	struct sna *sna = to_sna_from_drawable(draw);
++
+ 	DBG(("%s: reusing buffer pixmap=%ld, attachment=%d, handle=%d, name=%d\n",
+ 	     __FUNCTION__, get_drawable_pixmap(draw)->drawable.serialNumber,
+ 	     buffer->attachment, get_private(buffer)->bo->handle, buffer->name));
+ 	assert(get_private(buffer)->refcnt);
+-	assert(get_private(buffer)->bo->refcnt > get_private(buffer)->bo->active_scanout);
++	assert(get_private(buffer)->bo->refcnt >= get_private(buffer)->bo->active_scanout);
++	assert(kgem_bo_flink(&sna->kgem, get_private(buffer)->bo) == buffer->name);
+ 
+ 	if (buffer->attachment == DRI2BufferBackLeft &&
+ 	    draw->type != DRAWABLE_PIXMAP) {
+-		DBG(("%s: replacing back buffer\n", __FUNCTION__));
+-		sna_dri2_get_back(to_sna_from_drawable(draw), draw, buffer, dri2_chain(draw));
++		DBG(("%s: replacing back buffer on window %ld\n", __FUNCTION__, draw->id));
++		sna_dri2_get_back(sna, draw, buffer);
+ 
+-		assert(kgem_bo_flink(&to_sna_from_drawable(draw)->kgem, get_private(buffer)->bo) == buffer->name);
+ 		assert(get_private(buffer)->bo->refcnt);
+ 		assert(get_private(buffer)->bo->active_scanout == 0);
++		assert(kgem_bo_flink(&sna->kgem, get_private(buffer)->bo) == buffer->name);
++		DBG(("%s: reusing back buffer handle=%d, name=%d, pitch=%d, age=%d\n",
++		     __FUNCTION__, get_private(buffer)->bo->handle,
++		     buffer->name, buffer->pitch, buffer->flags));
+ 	}
++
++	kgem_bo_submit(&sna->kgem, get_private(buffer)->bo);
+ }
+ 
+ static bool swap_limit(DrawablePtr draw, int limit)
+@@ -314,11 +470,6 @@ static bool swap_limit(DrawablePtr draw, int limit)
+ }
+ #endif
+ 
+-#if DRI2INFOREC_VERSION < 10
+-#undef USE_ASYNC_SWAP
+-#define USE_ASYNC_SWAP 0
+-#endif
+-
+ #define COLOR_PREFER_TILING_Y 0
+ 
+ /* Prefer to enable TILING_Y if this buffer will never be a
+@@ -328,6 +479,9 @@ static uint32_t color_tiling(struct sna *sna, DrawablePtr draw)
+ {
+ 	uint32_t tiling;
+ 
++	if (!sna->kgem.can_fence)
++		return I915_TILING_NONE;
++
+ 	if (COLOR_PREFER_TILING_Y &&
+ 	    (draw->width  != sna->front->drawable.width ||
+ 	     draw->height != sna->front->drawable.height))
+@@ -355,7 +509,6 @@ static struct kgem_bo *sna_pixmap_set_dri(struct sna *sna,
+ 					  PixmapPtr pixmap)
+ {
+ 	struct sna_pixmap *priv;
+-	int tiling;
+ 
+ 	DBG(("%s: attaching DRI client to pixmap=%ld\n",
+ 	     __FUNCTION__, pixmap->drawable.serialNumber));
+@@ -373,31 +526,29 @@ static struct kgem_bo *sna_pixmap_set_dri(struct sna *sna,
+ 		return NULL;
+ 	}
+ 
+-	assert(priv->flush == false);
++	assert(priv->flush == false || priv->pinned & PIN_DRI3);
++	assert(priv->gpu_bo->flush == false || priv->pinned & PIN_DRI3);
+ 	assert(priv->cpu_damage == NULL);
+ 	assert(priv->gpu_bo);
+ 	assert(priv->gpu_bo->proxy == NULL);
+-	assert(priv->gpu_bo->flush == false);
+-
+-	tiling = color_tiling(sna, &pixmap->drawable);
+-	if (tiling < 0)
+-		tiling = -tiling;
+-	if (priv->gpu_bo->tiling != tiling)
+-		sna_pixmap_change_tiling(pixmap, tiling);
+ 
+-	return priv->gpu_bo;
+-}
++	if (!kgem_bo_is_fenced(&sna->kgem, priv->gpu_bo)) {
++		if (priv->gpu_bo->tiling &&
++		    !sna_pixmap_change_tiling(pixmap, I915_TILING_NONE)) {
++			DBG(("%s: failed to discard tiling (%d) for DRI2 protocol\n", __FUNCTION__, priv->gpu_bo->tiling));
++			return NULL;
++		}
++	} else {
++		int tiling = color_tiling(sna, &pixmap->drawable);
++		if (tiling < 0)
++			tiling = -tiling;
++		if (priv->gpu_bo->tiling < tiling && !priv->gpu_bo->scanout)
++			sna_pixmap_change_tiling(pixmap, tiling);
++	}
+ 
+-pure static inline void *sna_pixmap_get_buffer(PixmapPtr pixmap)
+-{
+-	assert(pixmap->refcnt);
+-	return ((void **)__get_private(pixmap, sna_pixmap_key))[2];
+-}
++	priv->gpu_bo->active_scanout++;
+ 
+-static inline void sna_pixmap_set_buffer(PixmapPtr pixmap, void *ptr)
+-{
+-	assert(pixmap->refcnt);
+-	((void **)__get_private(pixmap, sna_pixmap_key))[2] = ptr;
++	return priv->gpu_bo;
+ }
+ 
+ void
+@@ -422,13 +573,18 @@ sna_dri2_pixmap_update_bo(struct sna *sna, PixmapPtr pixmap, struct kgem_bo *bo)
+ 	if (private->bo == bo)
+ 		return;
+ 
++	assert(private->bo->active_scanout > 0);
++	private->bo->active_scanout--;
++
+ 	DBG(("%s: dropping flush hint from handle=%d\n", __FUNCTION__, private->bo->handle));
+ 	private->bo->flush = false;
+ 	kgem_bo_destroy(&sna->kgem, private->bo);
+ 
++
+ 	buffer->name = kgem_bo_flink(&sna->kgem, bo);
+ 	buffer->pitch = bo->pitch;
+ 	private->bo = ref(bo);
++	bo->active_scanout++;
+ 
+ 	DBG(("%s: adding flush hint to handle=%d\n", __FUNCTION__, bo->handle));
+ 	bo->flush = true;
+@@ -449,9 +605,9 @@ sna_dri2_create_buffer(DrawablePtr draw,
+ 	struct sna_dri2_private *private;
+ 	PixmapPtr pixmap;
+ 	struct kgem_bo *bo;
+-	unsigned flags = 0;
++	unsigned bpp = format ?: draw->bitsPerPixel;
++	unsigned flags = CREATE_EXACT;
+ 	uint32_t size;
+-	int bpp;
+ 
+ 	DBG(("%s pixmap=%ld, (attachment=%d, format=%d, drawable=%dx%d), window?=%d\n",
+ 	     __FUNCTION__,
+@@ -468,11 +624,11 @@ sna_dri2_create_buffer(DrawablePtr draw,
+ 		if (draw->type != DRAWABLE_PIXMAP)
+ 			buffer = dri2_window_get_front((WindowPtr)draw);
+ 		if (buffer == NULL)
+-			buffer = sna_pixmap_get_buffer(pixmap);
++			buffer = (DRI2Buffer2Ptr)sna_pixmap_get_buffer(pixmap);
+ 		if (buffer) {
+ 			private = get_private(buffer);
+ 
+-			DBG(("%s: reusing front buffer attachment, win=%lu %dx%d, pixmap=%ld [%ld] %dx%d, handle=%d, name=%d\n",
++			DBG(("%s: reusing front buffer attachment, win=%lu %dx%d, pixmap=%ld [%ld] %dx%d, handle=%d, name=%d, active_scanout=%d\n",
+ 			     __FUNCTION__,
+ 			     draw->type != DRAWABLE_PIXMAP ? (long)draw->id : (long)0,
+ 			     draw->width, draw->height,
+@@ -480,12 +636,22 @@ sna_dri2_create_buffer(DrawablePtr draw,
+ 			     private->pixmap->drawable.serialNumber,
+ 			     pixmap->drawable.width,
+ 			     pixmap->drawable.height,
+-			     private->bo->handle, buffer->name));
++			     private->bo->handle, buffer->name,
++			     private->bo->active_scanout));
+ 
++			assert(buffer->attachment == DRI2BufferFrontLeft);
+ 			assert(private->pixmap == pixmap);
+ 			assert(sna_pixmap(pixmap)->flush);
+ 			assert(sna_pixmap(pixmap)->pinned & PIN_DRI2);
+ 			assert(kgem_bo_flink(&sna->kgem, private->bo) == buffer->name);
++			assert(private->bo->pitch == buffer->pitch);
++			assert(private->bo->active_scanout);
++
++			sna_pixmap_move_to_gpu(pixmap,
++					       MOVE_READ |
++					       __MOVE_FORCE |
++					       __MOVE_DRI);
++			kgem_bo_submit(&sna->kgem, private->bo);
+ 
+ 			private->refcnt++;
+ 			return buffer;
+@@ -498,7 +664,6 @@ sna_dri2_create_buffer(DrawablePtr draw,
+ 		assert(sna_pixmap(pixmap) != NULL);
+ 
+ 		bo = ref(bo);
+-		bpp = pixmap->drawable.bitsPerPixel;
+ 		if (pixmap == sna->front && !(sna->flags & SNA_LINEAR_FB))
+ 			flags |= CREATE_SCANOUT;
+ 		DBG(("%s: attaching to front buffer %dx%d [%p:%d], scanout? %d\n",
+@@ -506,6 +671,7 @@ sna_dri2_create_buffer(DrawablePtr draw,
+ 		     pixmap->drawable.width, pixmap->drawable.height,
+ 		     pixmap, pixmap->refcnt, flags & CREATE_SCANOUT));
+ 		size = (uint32_t)pixmap->drawable.height << 16 | pixmap->drawable.width;
++		bpp = pixmap->drawable.bitsPerPixel;
+ 		break;
+ 
+ 	case DRI2BufferBackLeft:
+@@ -514,6 +680,7 @@ sna_dri2_create_buffer(DrawablePtr draw,
+ 				flags |= CREATE_SCANOUT;
+ 			if (draw->width  == sna->front->drawable.width &&
+ 			    draw->height == sna->front->drawable.height &&
++			    draw->bitsPerPixel == bpp &&
+ 			    (sna->flags & (SNA_LINEAR_FB | SNA_NO_WAIT | SNA_NO_FLIP)) == 0)
+ 				flags |= CREATE_SCANOUT;
+ 		}
+@@ -521,7 +688,6 @@ sna_dri2_create_buffer(DrawablePtr draw,
+ 	case DRI2BufferFrontRight:
+ 	case DRI2BufferFakeFrontLeft:
+ 	case DRI2BufferFakeFrontRight:
+-		bpp = draw->bitsPerPixel;
+ 		DBG(("%s: creating back buffer %dx%d, suitable for scanout? %d\n",
+ 		     __FUNCTION__,
+ 		     draw->width, draw->height,
+@@ -530,7 +696,7 @@ sna_dri2_create_buffer(DrawablePtr draw,
+ 		bo = kgem_create_2d(&sna->kgem,
+ 				    draw->width,
+ 				    draw->height,
+-				    draw->bitsPerPixel,
++				    bpp,
+ 				    color_tiling(sna, draw),
+ 				    flags);
+ 		break;
+@@ -558,7 +724,6 @@ sna_dri2_create_buffer(DrawablePtr draw,
+ 		 * not understand W tiling and the GTT is incapable of
+ 		 * W fencing.
+ 		 */
+-		bpp = format ? format : draw->bitsPerPixel;
+ 		bpp *= 2;
+ 		bo = kgem_create_2d(&sna->kgem,
+ 				    ALIGN(draw->width, 64),
+@@ -570,7 +735,6 @@ sna_dri2_create_buffer(DrawablePtr draw,
+ 	case DRI2BufferDepthStencil:
+ 	case DRI2BufferHiz:
+ 	case DRI2BufferAccum:
+-		bpp = format ? format : draw->bitsPerPixel,
+ 		bo = kgem_create_2d(&sna->kgem,
+ 				    draw->width, draw->height, bpp,
+ 				    other_tiling(sna, draw),
+@@ -614,7 +778,7 @@ sna_dri2_create_buffer(DrawablePtr draw,
+ 		pixmap->refcnt++;
+ 
+ 		priv = sna_pixmap(pixmap);
+-		assert(priv->flush == false);
++		assert(priv->flush == false || priv->pinned & PIN_DRI3);
+ 		assert((priv->pinned & PIN_DRI2) == 0);
+ 
+ 		/* Don't allow this named buffer to be replaced */
+@@ -630,17 +794,17 @@ sna_dri2_create_buffer(DrawablePtr draw,
+ 		if (priv->gpu_bo->exec)
+ 			sna->kgem.flush = 1;
+ 
+-		priv->flush |= 1;
++		priv->flush |= FLUSH_READ;
+ 		if (draw->type == DRAWABLE_PIXMAP) {
+ 			/* DRI2 renders directly into GLXPixmaps, treat as hostile */
+ 			kgem_bo_unclean(&sna->kgem, priv->gpu_bo);
+ 			sna_damage_all(&priv->gpu_damage, pixmap);
+ 			priv->clear = false;
+ 			priv->cpu = false;
+-			priv->flush |= 2;
++			priv->flush |= FLUSH_WRITE;
+ 		}
+ 
+-		sna_accel_watch_flush(sna, 1);
++		sna_watch_flush(sna, 1);
+ 	}
+ 
+ 	return buffer;
+@@ -651,16 +815,80 @@ err:
+ 	return NULL;
+ }
+ 
+-static void _sna_dri2_destroy_buffer(struct sna *sna, DRI2Buffer2Ptr buffer)
++static void
++sna_dri2_cache_bo(struct sna *sna,
++		  DrawablePtr draw,
++		  struct kgem_bo *bo,
++		  uint32_t name,
++		  uint32_t size,
++		  uint32_t flags)
++{
++	struct dri_bo *c;
++
++	DBG(("%s(handle=%d, name=%d)\n", __FUNCTION__, bo->handle, name));
++
++	if (draw == NULL) {
++		DBG(("%s: no draw, releasing handle=%d\n",
++		     __FUNCTION__, bo->handle));
++		goto err;
++	}
++
++	if (draw->type == DRAWABLE_PIXMAP) {
++		DBG(("%s: not a window, releasing handle=%d\n",
++		     __FUNCTION__, bo->handle));
++		goto err;
++	}
++
++	if (bo->refcnt > 1 + bo->active_scanout) {
++		DBG(("%s: multiple references [%d], releasing handle\n",
++		     __FUNCTION__, bo->refcnt, bo->handle));
++		goto err;
++	}
++
++	if ((draw->height << 16 | draw->width) != size) {
++		DBG(("%s: wrong size [%dx%d], releasing handle\n",
++		     __FUNCTION__,
++		     size & 0xffff, size >> 16,
++		     bo->handle));
++		goto err;
++	}
++
++	if (bo->scanout && front_pitch(draw) != bo->pitch) {
++		DBG(("%s: scanout with pitch change [%d != %d], releasing handle\n",
++		     __FUNCTION__, bo->pitch, front_pitch(draw), bo->handle));
++		goto err;
++	}
++
++	c = malloc(sizeof(*c));
++	if (!c)
++		goto err;
++
++	DBG(("%s: caching handle=%d (name=%d, flags=%d, active_scanout=%d)\n", __FUNCTION__, bo->handle, name, flags, bo->active_scanout));
++
++	c->bo = bo;
++	c->name = name;
++	c->flags = flags;
++	list_add(&c->link, &dri2_window((WindowPtr)draw)->cache);
++	return;
++
++err:
++	kgem_bo_destroy(&sna->kgem, bo);
++}
++
++static void _sna_dri2_destroy_buffer(struct sna *sna,
++				     DrawablePtr draw,
++				     DRI2Buffer2Ptr buffer)
+ {
+ 	struct sna_dri2_private *private = get_private(buffer);
+ 
+ 	if (buffer == NULL)
+ 		return;
+ 
+-	DBG(("%s: %p [handle=%d] -- refcnt=%d, pixmap=%ld\n",
++	DBG(("%s: %p [handle=%d] -- refcnt=%d, draw=%ld, pixmap=%ld, proxy?=%d\n",
+ 	     __FUNCTION__, buffer, private->bo->handle, private->refcnt,
+-	     private->pixmap ? private->pixmap->drawable.serialNumber : 0));
++	     draw ? draw->id : 0,
++	     private->pixmap ? private->pixmap->drawable.serialNumber : 0,
++	     private->proxy != NULL));
+ 	assert(private->refcnt > 0);
+ 	if (--private->refcnt)
+ 		return;
+@@ -669,7 +897,10 @@ static void _sna_dri2_destroy_buffer(struct sna *sna, DRI2Buffer2Ptr buffer)
+ 
+ 	if (private->proxy) {
+ 		DBG(("%s: destroying proxy\n", __FUNCTION__));
+-		_sna_dri2_destroy_buffer(sna, private->proxy);
++		assert(private->bo->active_scanout > 0);
++		private->bo->active_scanout--;
++
++		_sna_dri2_destroy_buffer(sna, draw, private->proxy);
+ 		private->pixmap = NULL;
+ 	}
+ 
+@@ -683,6 +914,11 @@ static void _sna_dri2_destroy_buffer(struct sna *sna, DRI2Buffer2Ptr buffer)
+ 		assert(priv->pinned & PIN_DRI2);
+ 		assert(priv->flush);
+ 
++		DBG(("%s: removing active_scanout=%d from pixmap handle=%d\n",
++		     __FUNCTION__, priv->gpu_bo->active_scanout, priv->gpu_bo->handle));
++		assert(priv->gpu_bo->active_scanout > 0);
++		priv->gpu_bo->active_scanout--;
++
+ 		/* Undo the DRI markings on this pixmap */
+ 		DBG(("%s: releasing last DRI pixmap=%ld, scanout?=%d\n",
+ 		     __FUNCTION__,
+@@ -692,28 +928,34 @@ static void _sna_dri2_destroy_buffer(struct sna *sna, DRI2Buffer2Ptr buffer)
+ 		list_del(&priv->flush_list);
+ 
+ 		DBG(("%s: dropping flush hint from handle=%d\n", __FUNCTION__, private->bo->handle));
+-		priv->gpu_bo->flush = false;
+ 		priv->pinned &= ~PIN_DRI2;
+ 
+-		priv->flush = false;
+-		sna_accel_watch_flush(sna, -1);
++		if ((priv->pinned & PIN_DRI3) == 0) {
++			priv->gpu_bo->flush = false;
++			priv->flush = false;
++		}
++		sna_watch_flush(sna, -1);
+ 
+ 		sna_pixmap_set_buffer(pixmap, NULL);
+ 		pixmap->drawable.pScreen->DestroyPixmap(pixmap);
+ 	}
+-	assert(private->bo->flush == false);
+ 
+-	kgem_bo_destroy(&sna->kgem, private->bo);
++	sna_dri2_cache_bo(sna, draw,
++			  private->bo,
++			  buffer->name,
++			  private->size,
++			  buffer->flags);
+ 	free(buffer);
+ }
+ 
+ static void sna_dri2_destroy_buffer(DrawablePtr draw, DRI2Buffer2Ptr buffer)
+ {
+-	_sna_dri2_destroy_buffer(to_sna_from_drawable(draw), buffer);
++	_sna_dri2_destroy_buffer(to_sna_from_drawable(draw), draw, buffer);
+ }
+ 
+ static DRI2BufferPtr sna_dri2_reference_buffer(DRI2BufferPtr buffer)
+ {
++	assert(get_private(buffer)->refcnt > 0);
+ 	get_private(buffer)->refcnt++;
+ 	return buffer;
+ }
+@@ -746,10 +988,9 @@ static void set_bo(PixmapPtr pixmap, struct kgem_bo *bo)
+ {
+ 	struct sna *sna = to_sna_from_pixmap(pixmap);
+ 	struct sna_pixmap *priv = sna_pixmap(pixmap);
+-	RegionRec region;
+ 
+-	DBG(("%s: pixmap=%ld, handle=%d\n",
+-	     __FUNCTION__, pixmap->drawable.serialNumber, bo->handle));
++	DBG(("%s: pixmap=%ld, handle=%d (old handle=%d)\n",
++	     __FUNCTION__, pixmap->drawable.serialNumber, bo->handle, priv->gpu_bo->handle));
+ 
+ 	assert(pixmap->drawable.width * pixmap->drawable.bitsPerPixel <= 8*bo->pitch);
+ 	assert(pixmap->drawable.height * bo->pitch <= kgem_bo_size(bo));
+@@ -758,21 +999,34 @@ static void set_bo(PixmapPtr pixmap, struct kgem_bo *bo)
+ 	assert((priv->pinned & (PIN_PRIME | PIN_DRI3)) == 0);
+ 	assert(priv->flush);
+ 
+-	/* Post damage on the new front buffer so that listeners, such
+-	 * as DisplayLink know take a copy and shove it over the USB,
+-	 * also for software cursors and the like.
+-	 */
+-	region.extents.x1 = region.extents.y1 = 0;
+-	region.extents.x2 = pixmap->drawable.width;
+-	region.extents.y2 = pixmap->drawable.height;
+-	region.data = NULL;
+-	DamageRegionAppend(&pixmap->drawable, &region);
++	if (APPLY_DAMAGE) {
++		RegionRec region;
++
++		/* Post damage on the new front buffer so that listeners, such
++		 * as DisplayLink know to take a copy and shove it over the USB,
++		 * also for software cursors and the like.
++		 */
++		region.extents.x1 = region.extents.y1 = 0;
++		region.extents.x2 = pixmap->drawable.width;
++		region.extents.y2 = pixmap->drawable.height;
++		region.data = NULL;
++
++		/*
++		 * Eeek, beware the sw cursor copying to the old bo
++		 * causing recursion and mayhem.
++		 */
++		DBG(("%s: marking whole pixmap as damaged\n", __FUNCTION__));
++		sna->ignore_copy_area = sna->flags & SNA_TEAR_FREE;
++		DamageRegionAppend(&pixmap->drawable, &region);
++	}
+ 
+ 	damage(pixmap, priv, NULL);
+ 
+ 	assert(bo->refcnt);
+-	if (priv->move_to_gpu)
++	if (priv->move_to_gpu) {
++		DBG(("%s: applying final/discard move-to-gpu\n", __FUNCTION__));
+ 		priv->move_to_gpu(sna, priv, 0);
++	}
+ 	if (priv->gpu_bo != bo) {
+ 		DBG(("%s: dropping flush hint from handle=%d\n", __FUNCTION__, priv->gpu_bo->handle));
+ 		priv->gpu_bo->flush = false;
+@@ -792,8 +1046,27 @@ static void set_bo(PixmapPtr pixmap, struct kgem_bo *bo)
+ 		bo->domain = DOMAIN_NONE;
+ 	assert(bo->flush);
+ 
+-	DamageRegionProcessPending(&pixmap->drawable);
++	if (APPLY_DAMAGE) {
++		sna->ignore_copy_area = false;
++		DamageRegionProcessPending(&pixmap->drawable);
++	}
++}
++
++#if defined(__GNUC__)
++#define popcount(x) __builtin_popcount(x)
++#else
++static int popcount(unsigned int x)
++{
++	int count = 0;
++
++	while (x) {
++		count += x&1;
++		x >>= 1;
++	}
++
++	return count;
+ }
++#endif
+ 
+ static void sna_dri2_select_mode(struct sna *sna, struct kgem_bo *dst, struct kgem_bo *src, bool sync)
+ {
+@@ -823,6 +1096,12 @@ static void sna_dri2_select_mode(struct sna *sna, struct kgem_bo *dst, struct kg
+ 		return;
+ 	}
+ 
++	if (sna->render_state.gt < 2 && sna->kgem.has_semaphores) {
++		DBG(("%s: small GT [%d], not forcing selection\n",
++		     __FUNCTION__, sna->render_state.gt));
++		return;
++	}
++
+ 	VG_CLEAR(busy);
+ 	busy.handle = src->handle;
+ 	if (drmIoctl(sna->kgem.fd, DRM_IOCTL_I915_GEM_BUSY, &busy))
+@@ -860,9 +1139,11 @@ static void sna_dri2_select_mode(struct sna *sna, struct kgem_bo *dst, struct kg
+ 	 * the cost of the query.
+ 	 */
+ 	mode = KGEM_RENDER;
+-	if (busy.busy & (0xfffe << 16))
++	if ((busy.busy & 0xffff) == I915_EXEC_BLT)
+ 		mode = KGEM_BLT;
+-	kgem_bo_mark_busy(&sna->kgem, busy.handle == src->handle ? src : dst, mode);
++	kgem_bo_mark_busy(&sna->kgem,
++			  busy.handle == src->handle ? src : dst,
++			  mode);
+ 	_kgem_set_mode(&sna->kgem, mode);
+ }
+ 
+@@ -871,10 +1152,13 @@ static bool is_front(int attachment)
+ 	return attachment == DRI2BufferFrontLeft;
+ }
+ 
++#define DRI2_SYNC 0x1
++#define DRI2_DAMAGE 0x2
++#define DRI2_BO 0x4
+ static struct kgem_bo *
+ __sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
+ 		      DRI2BufferPtr src, DRI2BufferPtr dst,
+-		      bool sync)
++		      unsigned flags)
+ {
+ 	PixmapPtr pixmap = get_drawable_pixmap(draw);
+ 	DrawableRec scratch, *src_draw = &pixmap->drawable, *dst_draw = &pixmap->drawable;
+@@ -886,7 +1170,7 @@ __sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
+ 	struct kgem_bo *dst_bo;
+ 	const BoxRec *boxes;
+ 	int16_t dx, dy, sx, sy;
+-	unsigned flags;
++	unsigned hint;
+ 	int n;
+ 
+ 	/* To hide a stale DRI2Buffer, one may choose to substitute
+@@ -962,8 +1246,9 @@ __sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
+ 			}
+ 		}
+ 	} else
+-		sync = false;
++		flags &= ~DRI2_SYNC;
+ 
++	scratch.pScreen = draw->pScreen;
+ 	scratch.x = scratch.y = 0;
+ 	scratch.width = scratch.height = 0;
+ 	scratch.depth = draw->depth;
+@@ -971,6 +1256,7 @@ __sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
+ 
+ 	src_bo = src_priv->bo;
+ 	assert(src_bo->refcnt);
++	kgem_bo_unclean(&sna->kgem, src_bo);
+ 	if (is_front(src->attachment)) {
+ 		struct sna_pixmap *priv;
+ 
+@@ -987,11 +1273,12 @@ __sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
+ 		scratch.height = src_priv->size >> 16;
+ 		src_draw = &scratch;
+ 
+-		DBG(("%s: source size %dx%d, region size %dx%d\n",
++		DBG(("%s: source size %dx%d, region size %dx%d, src offset %dx%d\n",
+ 		     __FUNCTION__,
+ 		     scratch.width, scratch.height,
+ 		     clip.extents.x2 - clip.extents.x1,
+-		     clip.extents.y2 - clip.extents.y1));
++		     clip.extents.y2 - clip.extents.y1,
++		     -sx, -sy));
+ 
+ 		source.extents.x1 = -sx;
+ 		source.extents.y1 = -sy;
+@@ -1002,6 +1289,10 @@ __sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
+ 		assert(region == NULL || region == &clip);
+ 		pixman_region_intersect(&clip, &clip, &source);
+ 
++		if (!pixman_region_not_empty(&clip)) {
++			DBG(("%s: region doesn't overlap pixmap\n", __FUNCTION__));
++			return NULL;
++		}
+ 	}
+ 
+ 	dst_bo = dst_priv->bo;
+@@ -1013,12 +1304,12 @@ __sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
+ 		/* Preserve the CRTC shadow overrides */
+ 		sna_shadow_steal_crtcs(sna, &shadow);
+ 
+-		flags = MOVE_WRITE | __MOVE_FORCE;
++		hint = MOVE_WRITE | __MOVE_FORCE;
+ 		if (clip.data)
+-			flags |= MOVE_READ;
++			hint |= MOVE_READ;
+ 
+ 		assert(region == NULL || region == &clip);
+-		priv = sna_pixmap_move_area_to_gpu(pixmap, &clip.extents, flags);
++		priv = sna_pixmap_move_area_to_gpu(pixmap, &clip.extents, hint);
+ 		if (priv) {
+ 			damage(pixmap, priv, region);
+ 			dst_bo = priv->gpu_bo;
+@@ -1050,20 +1341,20 @@ __sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
+ 		assert(region == NULL || region == &clip);
+ 		pixman_region_intersect(&clip, &clip, &target);
+ 
+-		sync = false;
++		flags &= ~DRI2_SYNC;
+ 	}
+ 
+ 	if (!wedged(sna)) {
+ 		xf86CrtcPtr crtc;
+ 
+ 		crtc = NULL;
+-		if (sync && sna_pixmap_is_scanout(sna, pixmap))
++		if (flags & DRI2_SYNC && sna_pixmap_is_scanout(sna, pixmap))
+ 			crtc = sna_covering_crtc(sna, &clip.extents, NULL);
+ 		sna_dri2_select_mode(sna, dst_bo, src_bo, crtc != NULL);
+ 
+-		sync = (crtc != NULL&&
+-			sna_wait_for_scanline(sna, pixmap, crtc,
+-					      &clip.extents));
++		if (crtc == NULL ||
++		    !sna_wait_for_scanline(sna, pixmap, crtc, &clip.extents))
++			flags &= ~DRI2_SYNC;
+ 	}
+ 
+ 	if (region) {
+@@ -1075,8 +1366,11 @@ __sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
+ 		boxes = &clip.extents;
+ 		n = 1;
+ 	}
+-	DamageRegionAppend(&pixmap->drawable, region);
+-
++	if (APPLY_DAMAGE || flags & DRI2_DAMAGE) {
++		DBG(("%s: marking region as damaged\n", __FUNCTION__));
++		sna->ignore_copy_area = sna->flags & SNA_TEAR_FREE;
++		DamageRegionAppend(&pixmap->drawable, region);
++	}
+ 
+ 	DBG(("%s: copying [(%d, %d), (%d, %d)]x%d src=(%d, %d), dst=(%d, %d)\n",
+ 	     __FUNCTION__,
+@@ -1084,29 +1378,36 @@ __sna_dri2_copy_region(struct sna *sna, DrawablePtr draw, RegionPtr region,
+ 	     boxes[0].x2, boxes[0].y2,
+ 	     n, sx, sy, dx, dy));
+ 
+-	flags = COPY_LAST;
+-	if (sync)
+-		flags |= COPY_SYNC;
++	hint = COPY_LAST | COPY_DRI;
++	if (flags & DRI2_SYNC)
++		hint |= COPY_SYNC;
+ 	if (!sna->render.copy_boxes(sna, GXcopy,
+ 				    src_draw, src_bo, sx, sy,
+ 				    dst_draw, dst_bo, dx, dy,
+-				    boxes, n, flags))
++				    boxes, n, hint))
+ 		memcpy_copy_boxes(sna, GXcopy,
+ 				  src_draw, src_bo, sx, sy,
+ 				  dst_draw, dst_bo, dx, dy,
+-				  boxes, n, flags);
+-
+-	DBG(("%s: flushing? %d\n", __FUNCTION__, sync));
+-	if (sync) { /* STAT! */
+-		struct kgem_request *rq = sna->kgem.next_request;
+-		kgem_submit(&sna->kgem);
+-		if (rq->bo) {
+-			bo = ref(rq->bo);
+-			DBG(("%s: recording sync fence handle=%d\n", __FUNCTION__, bo->handle));
++				  boxes, n, hint);
++
++	sna->needs_dri_flush = true;
++	if (flags & (DRI2_SYNC | DRI2_BO)) { /* STAT! */
++		struct kgem_request *rq = RQ(dst_bo->rq);
++		if (rq && rq != (void *)&sna->kgem) {
++			if (rq->bo == NULL)
++				kgem_submit(&sna->kgem);
++			if (rq->bo) { /* Be careful in case the gpu is wedged */
++				bo = ref(rq->bo);
++				DBG(("%s: recording sync fence handle=%d\n",
++				     __FUNCTION__, bo->handle));
++			}
+ 		}
+ 	}
+ 
+-	DamageRegionProcessPending(&pixmap->drawable);
++	if (APPLY_DAMAGE || flags & DRI2_DAMAGE) {
++		sna->ignore_copy_area = false;
++		DamageRegionProcessPending(&pixmap->drawable);
++	}
+ 
+ 	if (clip.data)
+ 		pixman_region_fini(&clip);
+@@ -1142,6 +1443,8 @@ sna_dri2_copy_region(DrawablePtr draw,
+ 	assert(get_private(src)->refcnt);
+ 	assert(get_private(dst)->refcnt);
+ 
++	assert(get_private(src)->bo != get_private(dst)->bo);
++
+ 	assert(get_private(src)->bo->refcnt);
+ 	assert(get_private(dst)->bo->refcnt);
+ 
+@@ -1151,7 +1454,7 @@ sna_dri2_copy_region(DrawablePtr draw,
+ 	     region->extents.x2, region->extents.y2,
+ 	     region_num_rects(region)));
+ 
+-	__sna_dri2_copy_region(sna, draw, region, src, dst, false);
++	__sna_dri2_copy_region(sna, draw, region, src, dst, DRI2_DAMAGE);
+ }
+ 
+ inline static uint32_t pipe_select(int pipe)
+@@ -1161,6 +1464,7 @@ inline static uint32_t pipe_select(int pipe)
+ 	 * we can safely ignore the capability check - if we have more
+ 	 * than two pipes, we can assume that they are fully supported.
+ 	 */
++	assert(pipe < _DRM_VBLANK_HIGH_CRTC_MASK);
+ 	if (pipe > 1)
+ 		return pipe << DRM_VBLANK_HIGH_CRTC_SHIFT;
+ 	else if (pipe > 0)
+@@ -1169,15 +1473,53 @@ inline static uint32_t pipe_select(int pipe)
+ 		return 0;
+ }
+ 
+-static inline int sna_wait_vblank(struct sna *sna, union drm_wait_vblank *vbl, int pipe)
++static inline bool sna_next_vblank(struct sna_dri2_event *info)
+ {
+-	DBG(("%s(pipe=%d, waiting until seq=%u%s)\n",
+-	     __FUNCTION__, pipe, vbl->request.sequence,
+-	     vbl->request.type & DRM_VBLANK_RELATIVE ? " [relative]" : ""));
+-	assert(pipe != -1);
++	union drm_wait_vblank vbl;
+ 
+-	vbl->request.type |= pipe_select(pipe);
+-	return drmIoctl(sna->kgem.fd, DRM_IOCTL_WAIT_VBLANK, vbl);
++	DBG(("%s(pipe=%d, waiting until next vblank)\n",
++	     __FUNCTION__, info->pipe));
++	assert(info->pipe != -1);
++
++	VG_CLEAR(vbl);
++	vbl.request.type =
++		DRM_VBLANK_RELATIVE |
++		DRM_VBLANK_EVENT |
++		pipe_select(info->pipe);
++	vbl.request.sequence = 1;
++	vbl.request.signal = (uintptr_t)info;
++
++	assert(!info->queued);
++	if (drmIoctl(info->sna->kgem.fd, DRM_IOCTL_WAIT_VBLANK, &vbl))
++		return false;
++
++	info->queued = true;
++	return true;
++}
++
++static inline bool sna_wait_vblank(struct sna_dri2_event *info,
++				   unsigned seq)
++{
++	union drm_wait_vblank vbl;
++
++	DBG(("%s(pipe=%d, waiting until vblank %u)\n",
++	     __FUNCTION__, info->pipe, seq));
++	assert(info->pipe != -1);
++
++	VG_CLEAR(vbl);
++	vbl.request.type =
++		DRM_VBLANK_ABSOLUTE |
++		DRM_VBLANK_EVENT |
++		pipe_select(info->pipe);
++	vbl.request.sequence = seq;
++	vbl.request.signal = (uintptr_t)info;
++
++	assert(!info->queued);
++	if (drmIoctl(info->sna->kgem.fd, DRM_IOCTL_WAIT_VBLANK, &vbl))
++		return false;
++
++	info->queued = true;
++	return true;
+ }
+ 
+ #if DRI2INFOREC_VERSION >= 4
+@@ -1195,6 +1537,7 @@ draw_current_msc(DrawablePtr draw, xf86CrtcPtr crtc, uint64_t msc)
+ {
+ 	struct dri2_window *priv;
+ 
++	assert(draw);
+ 	if (draw->type != DRAWABLE_WINDOW)
+ 		return msc;
+ 
+@@ -1206,6 +1549,9 @@ draw_current_msc(DrawablePtr draw, xf86CrtcPtr crtc, uint64_t msc)
+ 			priv->crtc = crtc;
+ 			priv->msc_delta = 0;
+ 			priv->chain = NULL;
++			priv->scanout = -1;
++			priv->cache_size = 0;
++			list_init(&priv->cache);
+ 			dri2_window_attach((WindowPtr)draw, priv);
+ 		}
+ 	} else {
+@@ -1214,8 +1560,8 @@ draw_current_msc(DrawablePtr draw, xf86CrtcPtr crtc, uint64_t msc)
+ 			const struct ust_msc *this = sna_crtc_last_swap(crtc);
+ 			DBG(("%s: Window transferring from pipe=%d [msc=%llu] to pipe=%d [msc=%llu], delta now %lld\n",
+ 			     __FUNCTION__,
+-			     sna_crtc_to_pipe(priv->crtc), (long long)last->msc,
+-			     sna_crtc_to_pipe(crtc), (long long)this->msc,
++			     sna_crtc_pipe(priv->crtc), (long long)last->msc,
++			     sna_crtc_pipe(crtc), (long long)this->msc,
+ 			     (long long)(priv->msc_delta + this->msc - last->msc)));
+ 			priv->msc_delta += this->msc - last->msc;
+ 			priv->crtc = crtc;
+@@ -1248,57 +1594,119 @@ sna_dri2_get_crtc(DrawablePtr draw)
+ 				 NULL);
+ }
+ 
+-static void
+-sna_dri2_remove_event(WindowPtr win, struct sna_dri2_event *info)
++static void frame_swap_complete(struct sna_dri2_event *frame, int type)
+ {
+-	struct dri2_window *priv;
+-	struct sna_dri2_event *chain;
+-
+-	assert(win->drawable.type == DRAWABLE_WINDOW);
+-	DBG(("%s: remove[%p] from window %ld, active? %d\n",
+-	     __FUNCTION__, info, (long)win->drawable.id, info->draw != NULL));
++	const struct ust_msc *swap;
+ 
+-	priv = dri2_window(win);
+-	assert(priv);
+-	assert(priv->chain != NULL);
++	assert(frame->signal);
++	frame->signal = false;
+ 
+-	if (priv->chain == info) {
+-		priv->chain = info->chain;
++	if (frame->client == NULL) {
++		DBG(("%s: client already gone\n", __FUNCTION__));
+ 		return;
+ 	}
+ 
+-	chain = priv->chain;
+-	while (chain->chain != info)
+-		chain = chain->chain;
+-	assert(chain != info);
+-	assert(info->chain != chain);
+-	chain->chain = info->chain;
++	assert(frame->draw);
++
++	swap = sna_crtc_last_swap(frame->crtc);
++	DBG(("%s(type=%d): draw=%ld, pipe=%d, frame=%lld [msc=%lld], tv=%d.%06d\n",
++	     __FUNCTION__, type, (long)frame->draw->id, frame->pipe,
++	     (long long)swap->msc,
++	     (long long)draw_current_msc(frame->draw, frame->crtc, swap->msc),
++	     swap->tv_sec, swap->tv_usec));
++
++	DRI2SwapComplete(frame->client, frame->draw,
++			 draw_current_msc(frame->draw, frame->crtc, swap->msc),
++			 swap->tv_sec, swap->tv_usec,
++			 type, frame->event_complete, frame->event_data);
+ }
+ 
+-static void
+-sna_dri2_event_free(struct sna_dri2_event *info)
++static void fake_swap_complete(struct sna *sna, ClientPtr client,
++			       DrawablePtr draw, xf86CrtcPtr crtc,
++			       int type, DRI2SwapEventPtr func, void *data)
+ {
+-	DrawablePtr draw = info->draw;
++	const struct ust_msc *swap;
+ 
+-	DBG(("%s(draw?=%d)\n", __FUNCTION__, draw != NULL));
+-	if (draw && draw->type == DRAWABLE_WINDOW)
+-		sna_dri2_remove_event((WindowPtr)draw, info);
++	assert(draw);
+ 
+-	_sna_dri2_destroy_buffer(info->sna, info->front);
+-	_sna_dri2_destroy_buffer(info->sna, info->back);
++	if (crtc == NULL)
++		crtc = sna_primary_crtc(sna);
+ 
+-	while (!list_is_empty(&info->cache)) {
+-		struct dri_bo *c;
++	swap = sna_crtc_last_swap(crtc);
++	DBG(("%s(type=%d): draw=%ld, pipe=%d, frame=%lld [msc %lld], tv=%d.%06d\n",
++	     __FUNCTION__, type, (long)draw->id, crtc ? sna_crtc_pipe(crtc) : -1,
++	     (long long)swap->msc,
++	     (long long)draw_current_msc(draw, crtc, swap->msc),
++	     swap->tv_sec, swap->tv_usec));
+ 
+-		c = list_first_entry(&info->cache, struct dri_bo, link);
+-		list_del(&c->link);
++	DRI2SwapComplete(client, draw,
++			 draw_current_msc(draw, crtc, swap->msc),
++			 swap->tv_sec, swap->tv_usec,
++			 type, func, data);
++}
+ 
+-		DBG(("%s: releasing cached handle=%d\n", __FUNCTION__, c->bo ? c->bo->handle : 0));
+-		if (c->bo)
+-			kgem_bo_destroy(&info->sna->kgem, c->bo);
++static void
++sna_dri2_remove_event(struct sna_dri2_event *info)
++{
++	WindowPtr win = (WindowPtr)info->draw;
++	struct dri2_window *priv;
+ 
+-		free(c);
++	assert(win->drawable.type == DRAWABLE_WINDOW);
++	DBG(("%s: remove[%p] from window %ld, active? %d\n",
++	     __FUNCTION__, info, (long)win->drawable.id, info->draw != NULL));
++	assert(!info->signal);
++
++	priv = dri2_window(win);
++	assert(priv);
++	assert(priv->chain != NULL);
++	assert(info->chained);
++	info->chained = false;
++
++	if (priv->chain != info) {
++		struct sna_dri2_event *chain = priv->chain;
++		while (chain->chain != info) {
++			assert(chain->chained);
++			chain = chain->chain;
++		}
++		assert(chain != info);
++		assert(info->chain != chain);
++		chain->chain = info->chain;
++		return;
++	}
++
++	priv->chain = info->chain;
++	if (priv->chain == NULL) {
++		struct dri_bo *c, *tmp;
++
++		c = list_entry(priv->cache.next->next, struct dri_bo, link);
++		list_for_each_entry_safe_from(c, tmp, &priv->cache, link) {
++			list_del(&c->link);
++
++			DBG(("%s: releasing cached handle=%d\n", __FUNCTION__, c->bo ? c->bo->handle : 0));
++			assert(c->bo);
++			kgem_bo_destroy(&info->sna->kgem, c->bo);
++			free(c);
++		}
+ 	}
++}
++
++static void
++sna_dri2_event_free(struct sna_dri2_event *info)
++{
++	DBG(("%s(draw?=%d)\n", __FUNCTION__, info->draw != NULL));
++	assert(!info->queued);
++	assert(!info->signal);
++	assert(info->pending.bo == NULL);
++
++	if (info->sna->dri2.flip_pending == info)
++		info->sna->dri2.flip_pending = NULL;
++	assert(info->sna->dri2.flip_pending != info);
++	if (info->chained)
++		sna_dri2_remove_event(info);
++
++	assert((info->front == NULL && info->back == NULL) || info->front != info->back);
++	_sna_dri2_destroy_buffer(info->sna, info->draw, info->front);
++	_sna_dri2_destroy_buffer(info->sna, info->draw, info->back);
+ 
+ 	if (info->bo) {
+ 		DBG(("%s: releasing batch handle=%d\n", __FUNCTION__, info->bo->handle));
+@@ -1331,15 +1739,26 @@ sna_dri2_client_gone(CallbackListPtr *list, void *closure, void *data)
+ 
+ 		event = list_first_entry(&priv->events, struct sna_dri2_event, link);
+ 		assert(event->client == client);
++		list_del(&event->link);
++		event->signal = false;
+ 
+-		if (event->queued) {
+-			if (event->draw)
+-				sna_dri2_remove_event((WindowPtr)event->draw,
+-						      event);
+-			event->client = NULL;
+-			event->draw = NULL;
+-			list_del(&event->link);
+-		} else
++		if (event->pending.bo) {
++			assert(event->pending.bo->active_scanout > 0);
++			event->pending.bo->active_scanout--;
++
++			kgem_bo_destroy(&sna->kgem, event->pending.bo);
++			event->pending.bo = NULL;
++		}
++
++		if (event->chained)
++			sna_dri2_remove_event(event);
++
++		event->client = NULL;
++		event->draw = NULL;
++		event->keepalive = 1;
++		assert(!event->signal);
++
++		if (!event->queued)
+ 			sna_dri2_event_free(event);
+ 	}
+ 
+@@ -1365,11 +1784,15 @@ static bool add_event_to_client(struct sna_dri2_event *info, struct sna *sna, Cl
+ }
+ 
+ static struct sna_dri2_event *
+-sna_dri2_add_event(struct sna *sna, DrawablePtr draw, ClientPtr client)
++sna_dri2_add_event(struct sna *sna,
++		   DrawablePtr draw,
++		   ClientPtr client,
++		   xf86CrtcPtr crtc)
+ {
+ 	struct dri2_window *priv;
+ 	struct sna_dri2_event *info, *chain;
+ 
++	assert(draw != NULL);
+ 	assert(draw->type == DRAWABLE_WINDOW);
+ 	DBG(("%s: adding event to window %ld)\n",
+ 	     __FUNCTION__, (long)draw->id));
+@@ -1382,11 +1805,11 @@ sna_dri2_add_event(struct sna *sna, DrawablePtr draw, ClientPtr client)
+ 	if (info == NULL)
+ 		return NULL;
+ 
+-	list_init(&info->cache);
+ 	info->sna = sna;
+ 	info->draw = draw;
+-	info->crtc = priv->crtc;
+-	info->pipe = sna_crtc_to_pipe(priv->crtc);
++	info->crtc = crtc;
++	info->pipe = sna_crtc_pipe(crtc);
++	info->keepalive = 1;
+ 
+ 	if (!add_event_to_client(info, sna, client)) {
+ 		free(info);
+@@ -1394,6 +1817,7 @@ sna_dri2_add_event(struct sna *sna, DrawablePtr draw, ClientPtr client)
+ 	}
+ 
+ 	assert(priv->chain != info);
++	info->chained = true;
+ 
+ 	if (priv->chain == NULL) {
+ 		priv->chain = info;
+@@ -1409,6 +1833,66 @@ sna_dri2_add_event(struct sna *sna, DrawablePtr draw, ClientPtr client)
+ 	return info;
+ }
+ 
++static void decouple_window(WindowPtr win,
++			    struct dri2_window *priv,
++			    struct sna *sna,
++			    bool signal)
++{
++	if (priv->front) {
++		DBG(("%s: decouple private front\n", __FUNCTION__));
++		assert(priv->crtc);
++		sna_shadow_unset_crtc(sna, priv->crtc);
++
++		_sna_dri2_destroy_buffer(sna, NULL, priv->front);
++		priv->front = NULL;
++	}
++
++	if (priv->chain) {
++		struct sna_dri2_event *info, *chain;
++
++		DBG(("%s: freeing chain\n", __FUNCTION__));
++
++		chain = priv->chain;
++		while ((info = chain)) {
++			DBG(("%s: freeing event, pending signal? %d, pending swap? handle=%d\n",
++			     __FUNCTION__, info->signal,
++			     info->pending.bo ? info->pending.bo->handle : 0));
++			assert(info->draw == &win->drawable);
++
++			if (info->pending.bo) {
++				if (signal) {
++					bool was_signalling = info->signal;
++					info->signal = true;
++					frame_swap_complete(info, DRI2_EXCHANGE_COMPLETE);
++					info->signal = was_signalling;
++				}
++				assert(info->pending.bo->active_scanout > 0);
++				info->pending.bo->active_scanout--;
++
++				kgem_bo_destroy(&sna->kgem, info->pending.bo);
++				info->pending.bo = NULL;
++			}
++
++			if (info->signal && signal)
++				frame_swap_complete(info, DRI2_EXCHANGE_COMPLETE);
++			info->signal = false;
++			info->draw = NULL;
++			info->keepalive = 1;
++			assert(!info->signal);
++			list_del(&info->link);
++
++			chain = info->chain;
++			info->chain = NULL;
++			info->chained = false;
++
++			if (!info->queued)
++				sna_dri2_event_free(info);
++		}
++
++		priv->chain = NULL;
++	}
++}
++
+ void sna_dri2_decouple_window(WindowPtr win)
+ {
+ 	struct dri2_window *priv;
+@@ -1418,50 +1902,34 @@ void sna_dri2_decouple_window(WindowPtr win)
+ 		return;
+ 
+ 	DBG(("%s: window=%ld\n", __FUNCTION__, win->drawable.id));
++	decouple_window(win, priv, to_sna_from_drawable(&win->drawable), true);
+ 
+-	if (priv->front) {
+-		struct sna *sna = to_sna_from_drawable(&win->drawable);
+-		assert(priv->crtc);
+-		sna_shadow_unset_crtc(sna, priv->crtc);
+-		_sna_dri2_destroy_buffer(sna, priv->front);
+-		priv->front = NULL;
+-	}
++	priv->scanout = -1;
+ }
+ 
+ void sna_dri2_destroy_window(WindowPtr win)
+ {
+ 	struct dri2_window *priv;
++	struct sna *sna;
+ 
+ 	priv = dri2_window(win);
+ 	if (priv == NULL)
+ 		return;
+ 
+ 	DBG(("%s: window=%ld\n", __FUNCTION__, win->drawable.id));
++	sna = to_sna_from_drawable(&win->drawable);
++	decouple_window(win, priv, sna, false);
+ 
+-	if (priv->front) {
+-		struct sna *sna = to_sna_from_drawable(&win->drawable);
+-		assert(priv->crtc);
+-		sna_shadow_unset_crtc(sna, priv->crtc);
+-		_sna_dri2_destroy_buffer(sna, priv->front);
+-	}
+-
+-	if (priv->chain) {
+-		struct sna_dri2_event *info, *chain;
+-
+-		DBG(("%s: freeing chain\n", __FUNCTION__));
+-
+-		chain = priv->chain;
+-		while ((info = chain)) {
+-			info->draw = NULL;
+-			info->client = NULL;
+-			list_del(&info->link);
++	while (!list_is_empty(&priv->cache)) {
++		struct dri_bo *c;
+ 
+-			chain = info->chain;
+-			info->chain = NULL;
++		c = list_first_entry(&priv->cache, struct dri_bo, link);
++		list_del(&c->link);
+ 
+-			if (!info->queued)
+-				sna_dri2_event_free(info);
+-		}
++		DBG(("%s: releasing cached handle=%d\n", __FUNCTION__, c->bo ? c->bo->handle : 0));
++		assert(c->bo);
++		kgem_bo_destroy(&sna->kgem, c->bo);
++		free(c);
+ 	}
+ 
+ 	free(priv);
+@@ -1479,19 +1947,30 @@ sna_dri2_flip(struct sna_dri2_event *info)
+ {
+ 	struct kgem_bo *bo = get_private(info->back)->bo;
+ 	struct kgem_bo *tmp_bo;
+-	uint32_t tmp_name;
++	uint32_t tmp_name, tmp_flags;
+ 	int tmp_pitch;
+ 
+ 	DBG(("%s(type=%d)\n", __FUNCTION__, info->type));
+ 
+ 	assert(sna_pixmap_get_buffer(info->sna->front) == info->front);
+ 	assert(get_drawable_pixmap(info->draw)->drawable.height * bo->pitch <= kgem_bo_size(bo));
++	assert(get_private(info->front)->size == get_private(info->back)->size);
+ 	assert(bo->refcnt);
+ 
++	if (info->sna->mode.flip_active) {
++		DBG(("%s: %d flips still active, aborting\n",
++		     __FUNCTION__, info->sna->mode.flip_active));
++		return false;
++	}
++
++	assert(!info->queued);
+ 	if (!sna_page_flip(info->sna, bo, sna_dri2_flip_handler,
+ 			   info->type == FLIP_ASYNC ? NULL : info))
+ 		return false;
+ 
++	DBG(("%s: queued flip=%p\n", __FUNCTION__, info->type == FLIP_ASYNC ? NULL : info));
++	assert(info->signal || info->type != FLIP_THROTTLE);
++
+ 	assert(info->sna->dri2.flip_pending == NULL ||
+ 	       info->sna->dri2.flip_pending == info);
+ 	if (info->type != FLIP_ASYNC)
+@@ -1505,13 +1984,21 @@ sna_dri2_flip(struct sna_dri2_event *info)
+ 	tmp_bo = get_private(info->front)->bo;
+ 	tmp_name = info->front->name;
+ 	tmp_pitch = info->front->pitch;
++	tmp_flags = info->front->flags;
++
++	assert(tmp_bo->active_scanout > 0);
++	tmp_bo->active_scanout--;
+ 
+ 	set_bo(info->sna->front, bo);
+ 
++	info->front->flags = info->back->flags;
+ 	info->front->name = info->back->name;
+ 	info->front->pitch = info->back->pitch;
+ 	get_private(info->front)->bo = bo;
++	bo->active_scanout++;
++	assert(bo->active_scanout <= bo->refcnt);
+ 
++	info->back->flags = tmp_flags;
+ 	info->back->name = tmp_name;
+ 	info->back->pitch = tmp_pitch;
+ 	get_private(info->back)->bo = tmp_bo;
+@@ -1521,6 +2008,7 @@ sna_dri2_flip(struct sna_dri2_event *info)
+ 	assert(get_private(info->back)->bo->refcnt);
+ 	assert(get_private(info->front)->bo != get_private(info->back)->bo);
+ 
++	info->keepalive = KEEPALIVE;
+ 	info->queued = true;
+ 	return true;
+ }
+@@ -1549,15 +2037,16 @@ can_flip(struct sna * sna,
+ 	}
+ 
+ 	assert(sna->scrn->vtSema);
++	assert(!sna->mode.hidden);
+ 
+ 	if ((sna->flags & (SNA_HAS_FLIP | SNA_HAS_ASYNC_FLIP)) == 0) {
+ 		DBG(("%s: no, pageflips disabled\n", __FUNCTION__));
+ 		return false;
+ 	}
+ 
+-	if (front->format != back->format) {
++	if (front->cpp != back->cpp) {
+ 		DBG(("%s: no, format mismatch, front = %d, back = %d\n",
+-		     __FUNCTION__, front->format, back->format));
++		     __FUNCTION__, front->cpp, back->cpp));
+ 		return false;
+ 	}
+ 
+@@ -1567,7 +2056,7 @@ can_flip(struct sna * sna,
+ 	}
+ 
+ 	if (!sna_crtc_is_on(crtc)) {
+-		DBG(("%s: ref-pipe=%d is disabled\n", __FUNCTION__, sna_crtc_to_pipe(crtc)));
++		DBG(("%s: ref-pipe=%d is disabled\n", __FUNCTION__, sna_crtc_pipe(crtc)));
+ 		return false;
+ 	}
+ 
+@@ -1581,7 +2070,7 @@ can_flip(struct sna * sna,
+ 	if (sna_pixmap_get_buffer(pixmap) != front) {
+ 		DBG(("%s: no, DRI2 drawable is no longer attached (old name=%d, new name=%d) to pixmap=%ld\n",
+ 		     __FUNCTION__, front->name,
+-		     sna_pixmap_get_buffer(pixmap) ? ((DRI2BufferPtr)sna_pixmap_get_buffer(pixmap))->name : 0,
++		     sna_pixmap_get_buffer(pixmap) ? sna_pixmap_get_buffer(pixmap)->name : 0,
+ 		     pixmap->drawable.serialNumber));
+ 		return false;
+ 	}
+@@ -1661,7 +2150,6 @@ can_flip(struct sna * sna,
+ 	}
+ 
+ 	DBG(("%s: yes, pixmap=%ld\n", __FUNCTION__, pixmap->drawable.serialNumber));
+-	assert(dri2_window(win)->front == NULL);
+ 	return true;
+ }
+ 
+@@ -1680,9 +2168,9 @@ can_xchg(struct sna *sna,
+ 	if (draw->type == DRAWABLE_PIXMAP)
+ 		return false;
+ 
+-	if (front->format != back->format) {
++	if (front->cpp != back->cpp) {
+ 		DBG(("%s: no, format mismatch, front = %d, back = %d\n",
+-		     __FUNCTION__, front->format, back->format));
++		     __FUNCTION__, front->cpp, back->cpp));
+ 		return false;
+ 	}
+ 
+@@ -1714,6 +2202,8 @@ can_xchg(struct sna *sna,
+ 		return false;
+ 	}
+ 
++	DBG(("%s: back size=%x, front size=%x\n",
++	     __FUNCTION__, get_private(back)->size, get_private(front)->size));
+ 	if (get_private(back)->size != get_private(front)->size) {
+ 		DBG(("%s: no, back buffer %dx%d does not match front buffer %dx%d\n",
+ 		     __FUNCTION__,
+@@ -1766,9 +2256,9 @@ overlaps_other_crtc(struct sna *sna, xf86CrtcPtr desired)
+ static bool
+ can_xchg_crtc(struct sna *sna,
+ 	      DrawablePtr draw,
++	      xf86CrtcPtr crtc,
+ 	      DRI2BufferPtr front,
+-	      DRI2BufferPtr back,
+-	      xf86CrtcPtr crtc)
++	      DRI2BufferPtr back)
+ {
+ 	WindowPtr win = (WindowPtr)draw;
+ 	PixmapPtr pixmap;
+@@ -1785,9 +2275,9 @@ can_xchg_crtc(struct sna *sna,
+ 	if (draw->type == DRAWABLE_PIXMAP)
+ 		return false;
+ 
+-	if (front->format != back->format) {
++	if (front->cpp != back->cpp) {
+ 		DBG(("%s: no, format mismatch, front = %d, back = %d\n",
+-		     __FUNCTION__, front->format, back->format));
++		     __FUNCTION__, front->cpp, back->cpp));
+ 		return false;
+ 	}
+ 
+@@ -1866,20 +2356,21 @@ sna_dri2_xchg(DrawablePtr draw, DRI2BufferPtr front, DRI2BufferPtr back)
+ 
+ 	back_bo = get_private(back)->bo;
+ 	front_bo = get_private(front)->bo;
+-	assert(front_bo != back_bo);
+ 
+-	DBG(("%s: win=%ld, exchange front=%d/%d and back=%d/%d, pixmap=%ld %dx%d\n",
++	DBG(("%s: win=%ld, exchange front=%d/%d,ref=%d and back=%d/%d,ref=%d, pixmap=%ld %dx%d\n",
+ 	     __FUNCTION__, win->drawable.id,
+-	     front_bo->handle, front->name,
+-	     back_bo->handle, back->name,
++	     front_bo->handle, front->name, get_private(front)->refcnt,
++	     back_bo->handle, back->name, get_private(back)->refcnt,
+ 	     pixmap->drawable.serialNumber,
+ 	     pixmap->drawable.width,
+ 	     pixmap->drawable.height));
+ 
+-	DBG(("%s: back_bo pitch=%d, size=%d, ref=%d, active_scanout?=%d\n",
+-	     __FUNCTION__, back_bo->pitch, kgem_bo_size(back_bo), back_bo->refcnt, back_bo->active_scanout));
+-	DBG(("%s: front_bo pitch=%d, size=%d, ref=%d, active_scanout?=%d\n",
+-	     __FUNCTION__, front_bo->pitch, kgem_bo_size(front_bo), front_bo->refcnt, front_bo->active_scanout));
++	DBG(("%s: back_bo handle=%d, pitch=%d, size=%d, ref=%d, active_scanout?=%d\n",
++	     __FUNCTION__, back_bo->handle, back_bo->pitch, kgem_bo_size(back_bo), back_bo->refcnt, back_bo->active_scanout));
++	DBG(("%s: front_bo handle=%d, pitch=%d, size=%d, ref=%d, active_scanout?=%d\n",
++	     __FUNCTION__, front_bo->handle, front_bo->pitch, kgem_bo_size(front_bo), front_bo->refcnt, front_bo->active_scanout));
++
++	assert(front_bo != back_bo);
+ 	assert(front_bo->refcnt);
+ 	assert(back_bo->refcnt);
+ 
+@@ -1894,6 +2385,11 @@ sna_dri2_xchg(DrawablePtr draw, DRI2BufferPtr front, DRI2BufferPtr back)
+ 	get_private(back)->bo = front_bo;
+ 	mark_stale(back);
+ 
++	assert(front_bo->active_scanout > 0);
++	front_bo->active_scanout--;
++	back_bo->active_scanout++;
++	assert(back_bo->active_scanout <= back_bo->refcnt);
++
+ 	tmp = front->name;
+ 	front->name = back->name;
+ 	back->name = tmp;
+@@ -1902,17 +2398,23 @@ sna_dri2_xchg(DrawablePtr draw, DRI2BufferPtr front, DRI2BufferPtr back)
+ 	front->pitch = back->pitch;
+ 	back->pitch = tmp;
+ 
++	tmp = front->flags;
++	front->flags = back->flags;
++	back->flags = tmp;
++
+ 	assert(front_bo->refcnt);
+ 	assert(back_bo->refcnt);
+ 
++	assert(front_bo->pitch == get_private(front)->bo->pitch);
++	assert(back_bo->pitch == get_private(back)->bo->pitch);
++
+ 	assert(get_private(front)->bo == sna_pixmap(pixmap)->gpu_bo);
+ }
+ 
+ static void sna_dri2_xchg_crtc(struct sna *sna, DrawablePtr draw, xf86CrtcPtr crtc, DRI2BufferPtr front, DRI2BufferPtr back)
+ {
+ 	WindowPtr win = (WindowPtr)draw;
+-	DRI2Buffer2Ptr tmp;
+-	struct kgem_bo *bo;
++	struct dri2_window *priv = dri2_window(win);
+ 
+ 	DBG(("%s: exchange front=%d/%d and back=%d/%d, win id=%lu, pixmap=%ld %dx%d\n",
+ 	     __FUNCTION__,
+@@ -1922,162 +2424,130 @@ static void sna_dri2_xchg_crtc(struct sna *sna, DrawablePtr draw, xf86CrtcPtr cr
+ 	     get_window_pixmap(win)->drawable.serialNumber,
+ 	     get_window_pixmap(win)->drawable.width,
+ 	     get_window_pixmap(win)->drawable.height));
++	assert(can_xchg_crtc(sna, draw, crtc, front, back));
+ 
+-	DamageRegionAppend(&win->drawable, &win->clipList);
++	if (APPLY_DAMAGE) {
++		DBG(("%s: marking drawable as damaged\n", __FUNCTION__));
++		sna->ignore_copy_area = sna->flags & SNA_TEAR_FREE;
++		DamageRegionAppend(&win->drawable, &win->clipList);
++	}
+ 	sna_shadow_set_crtc(sna, crtc, get_private(back)->bo);
+-	DamageRegionProcessPending(&win->drawable);
++	if (APPLY_DAMAGE) {
++		sna->ignore_copy_area = false;
++		DamageRegionProcessPending(&win->drawable);
++	}
+ 
+-	assert(dri2_window(win)->front == NULL);
++	if (priv->front == NULL) {
++		DRI2Buffer2Ptr tmp;
+ 
+-	tmp = calloc(1, sizeof(*tmp) + sizeof(struct sna_dri2_private));
+-	if (tmp == NULL) {
+-		back->attachment = -1;
+-		if (get_private(back)->proxy == NULL) {
+-			get_private(back)->pixmap = get_window_pixmap(win);
+-			get_private(back)->proxy = sna_dri2_reference_buffer(sna_pixmap_get_buffer(get_private(back)->pixmap));
++		tmp = calloc(1, sizeof(*tmp) + sizeof(struct sna_dri2_private));
++		if (tmp == NULL) {
++			sna_shadow_unset_crtc(sna, crtc);
++			return;
+ 		}
+-		dri2_window(win)->front = sna_dri2_reference_buffer(back);
+-		return;
+-	}
+ 
+-	*tmp = *back;
+-	tmp->attachment = DRI2BufferFrontLeft;
+-	tmp->driverPrivate = tmp + 1;
+-	get_private(tmp)->refcnt = 1;
+-	get_private(tmp)->bo = get_private(back)->bo;
+-	get_private(tmp)->size = get_private(back)->size;
+-	get_private(tmp)->pixmap = get_window_pixmap(win);
+-	get_private(tmp)->proxy = sna_dri2_reference_buffer(sna_pixmap_get_buffer(get_private(tmp)->pixmap));
+-	dri2_window(win)->front = tmp;
+-
+-	DBG(("%s: allocating new backbuffer\n", __FUNCTION__));
+-	back->name = 0;
+-	bo = kgem_create_2d(&sna->kgem,
+-			    draw->width, draw->height, draw->bitsPerPixel,
+-			    get_private(back)->bo->tiling,
+-			    CREATE_SCANOUT);
+-	if (bo != NULL) {
+-		get_private(back)->bo = bo;
+-		back->pitch = bo->pitch;
+-		back->name = kgem_bo_flink(&sna->kgem, bo);
+-	}
+-	if (back->name == 0) {
+-		if (bo != NULL)
+-			kgem_bo_destroy(&sna->kgem, bo);
+-		get_private(back)->bo = NULL;
+-		back->attachment = -1;
++		tmp->attachment = DRI2BufferFrontLeft;
++		tmp->driverPrivate = tmp + 1;
++		tmp->cpp = back->cpp;
++		tmp->format = back->format;
++
++		get_private(tmp)->refcnt = 1;
++		get_private(tmp)->bo = kgem_create_2d(&sna->kgem,
++						      draw->width, draw->height, draw->bitsPerPixel,
++						      get_private(back)->bo->tiling,
++						      CREATE_SCANOUT | CREATE_EXACT);
++		if (get_private(tmp)->bo != NULL) {
++			tmp->pitch = get_private(tmp)->bo->pitch;
++			tmp->name = kgem_bo_flink(&sna->kgem, get_private(tmp)->bo);
++		}
++		if (tmp->name == 0) {
++			if (get_private(tmp)->bo != NULL)
++				kgem_bo_destroy(&sna->kgem, get_private(tmp)->bo);
++			sna_shadow_unset_crtc(sna, crtc);
++			return;
++		}
++		get_private(tmp)->size = get_private(back)->size;
++		get_private(tmp)->pixmap = get_private(front)->pixmap;
++		get_private(tmp)->proxy = sna_dri2_reference_buffer(front);
++		get_private(tmp)->bo->active_scanout++;
++
++		priv->front = front = tmp;
+ 	}
+-}
++	assert(front == priv->front);
+ 
+-static void frame_swap_complete(struct sna_dri2_event *frame, int type)
+-{
+-	const struct ust_msc *swap;
++	{
++		struct kgem_bo *front_bo = get_private(front)->bo;
++		struct kgem_bo *back_bo = get_private(back)->bo;
++		unsigned tmp;
+ 
+-	if (frame->draw == NULL)
+-		return;
++		assert(front_bo->refcnt);
++		assert(back_bo->refcnt);
+ 
+-	assert(frame->client);
++		get_private(back)->bo = front_bo;
++		get_private(front)->bo = back_bo;
++		mark_stale(back);
+ 
+-	swap = sna_crtc_last_swap(frame->crtc);
+-	DBG(("%s(type=%d): draw=%ld, pipe=%d, frame=%lld [msc=%lld], tv=%d.%06d\n",
+-	     __FUNCTION__, type, (long)frame->draw, frame->pipe,
+-	     (long long)swap->msc,
+-	     (long long)draw_current_msc(frame->draw, frame->crtc, swap->msc),
+-	     swap->tv_sec, swap->tv_usec));
++		assert(front_bo->active_scanout > 0);
++		front_bo->active_scanout--;
++		back_bo->active_scanout++;
++		assert(back_bo->active_scanout <= back_bo->refcnt);
+ 
+-	DRI2SwapComplete(frame->client, frame->draw,
+-			 draw_current_msc(frame->draw, frame->crtc, swap->msc),
+-			 swap->tv_sec, swap->tv_usec,
+-			 type, frame->event_complete, frame->event_data);
+-}
++		tmp = front->name;
++		front->name = back->name;
++		back->name = tmp;
+ 
+-static void fake_swap_complete(struct sna *sna, ClientPtr client,
+-			       DrawablePtr draw, xf86CrtcPtr crtc,
+-			       int type, DRI2SwapEventPtr func, void *data)
+-{
+-	const struct ust_msc *swap;
+-
+-	swap = sna_crtc_last_swap(crtc);
+-	DBG(("%s(type=%d): draw=%ld, pipe=%d, frame=%lld [msc %lld], tv=%d.%06d\n",
+-	     __FUNCTION__, type, (long)draw->id, crtc ? sna_crtc_to_pipe(crtc) : -1,
+-	     (long long)swap->msc,
+-	     (long long)draw_current_msc(draw, crtc, swap->msc),
+-	     swap->tv_sec, swap->tv_usec));
++		tmp = front->pitch;
++		front->pitch = back->pitch;
++		back->pitch = tmp;
+ 
+-	DRI2SwapComplete(client, draw,
+-			 draw_current_msc(draw, crtc, swap->msc),
+-			 swap->tv_sec, swap->tv_usec,
+-			 type, func, data);
++		tmp = front->flags;
++		front->flags = back->flags;
++		back->flags = tmp;
++	}
+ }
+ 
+ static void chain_swap(struct sna_dri2_event *chain)
+ {
+-	union drm_wait_vblank vbl;
++	DBG(("%s: draw=%ld, queued?=%d, type=%d\n",
++	     __FUNCTION__, (long)chain->draw->id, chain->queued, chain->type));
++
++	if (chain->queued) /* too early! */
++		return;
+ 
+ 	if (chain->draw == NULL) {
+ 		sna_dri2_event_free(chain);
+ 		return;
+ 	}
+ 
+-	if (chain->queued) /* too early! */
+-		return;
+-
+ 	assert(chain == dri2_chain(chain->draw));
+-	DBG(("%s: chaining draw=%ld, type=%d\n",
+-	     __FUNCTION__, (long)chain->draw->id, chain->type));
+-	chain->queued = true;
++	assert(chain->signal);
+ 
+ 	switch (chain->type) {
+-	case SWAP_THROTTLE:
++	case SWAP_COMPLETE:
+ 		DBG(("%s: emitting chained vsync'ed blit\n", __FUNCTION__));
+-		if (chain->sna->mode.shadow &&
+-		    !chain->sna->mode.shadow_damage) {
+-			/* recursed from wait_for_shadow(), simply requeue */
+-			DBG(("%s -- recursed from wait_for_shadow(), requeuing\n", __FUNCTION__));
+-			VG_CLEAR(vbl);
+-			vbl.request.type =
+-				DRM_VBLANK_RELATIVE |
+-				DRM_VBLANK_EVENT;
+-			vbl.request.sequence = 1;
+-			vbl.request.signal = (uintptr_t)chain;
+-
+-			if (!sna_wait_vblank(chain->sna, &vbl, chain->pipe))
+-				return;
+-
+-			DBG(("%s -- requeue failed, errno=%d\n", __FUNCTION__, errno));
+-		}
+-
+ 		if (can_xchg(chain->sna, chain->draw, chain->front, chain->back)) {
+ 			sna_dri2_xchg(chain->draw, chain->front, chain->back);
+-		} else if (can_xchg_crtc(chain->sna, chain->draw, chain->front, chain->back, chain->crtc)) {
+-			sna_dri2_xchg_crtc(chain->sna, chain->draw, chain->crtc, chain->front, chain->back);
++		} else if (can_xchg_crtc(chain->sna, chain->draw, chain->crtc,
++					 chain->front, chain->back)) {
++			sna_dri2_xchg_crtc(chain->sna, chain->draw, chain->crtc,
++					   chain->front, chain->back);
+ 		} else {
+-			assert(chain->queued);
+-			chain->bo = __sna_dri2_copy_region(chain->sna, chain->draw, NULL,
+-							   chain->back, chain->front,
+-							   true);
++			__sna_dri2_copy_event(chain, chain->sync | DRI2_BO);
+ 		}
++		assert(get_private(chain->back)->bo != get_private(chain->front)->bo);
+ 	case SWAP:
+ 		break;
+ 	default:
+ 		return;
+ 	}
+ 
+-	VG_CLEAR(vbl);
+-	vbl.request.type =
+-		DRM_VBLANK_RELATIVE |
+-		DRM_VBLANK_EVENT;
+-	vbl.request.sequence = 1;
+-	vbl.request.signal = (uintptr_t)chain;
+-	if (sna_wait_vblank(chain->sna, &vbl, chain->pipe)) {
++	if ((chain->type == SWAP_COMPLETE &&
++	     !swap_limit(chain->draw, 2 + !chain->sync) &&
++	     !chain->sync) ||
++	    !sna_next_vblank(chain)) {
+ 		DBG(("%s: vblank wait failed, unblocking client\n", __FUNCTION__));
+ 		frame_swap_complete(chain, DRI2_BLIT_COMPLETE);
+ 		sna_dri2_event_free(chain);
+-	} else {
+-		if (chain->type == SWAP_THROTTLE && !swap_limit(chain->draw, 2)) {
+-			DBG(("%s: fake triple buffering, unblocking client\n", __FUNCTION__));
+-			frame_swap_complete(chain, DRI2_BLIT_COMPLETE);
+-		}
+ 	}
+ }
+ 
+@@ -2086,40 +2556,27 @@ static inline bool rq_is_busy(struct kgem *kgem, struct kgem_bo *bo)
+ 	if (bo == NULL)
+ 		return false;
+ 
+-	DBG(("%s: handle=%d, domain: %d exec? %d, rq? %d\n", __FUNCTION__,
+-	     bo->handle, bo->domain, bo->exec != NULL, bo->rq != NULL));
+-	assert(bo->refcnt);
+-
+-	if (bo->exec)
+-		return true;
+-
+-	if (bo->rq == NULL)
+-		return false;
+-
+-	return __kgem_busy(kgem, bo->handle);
++	return __kgem_bo_is_busy(kgem, bo);
+ }
+ 
+-static bool sna_dri2_blit_complete(struct sna *sna,
+-				   struct sna_dri2_event *info)
++static bool sna_dri2_blit_complete(struct sna_dri2_event *info)
+ {
+-	if (rq_is_busy(&sna->kgem, info->bo)) {
+-		union drm_wait_vblank vbl;
++	if (!info->bo)
++		return true;
+ 
++	if (__kgem_bo_is_busy(&info->sna->kgem, info->bo)) {
+ 		DBG(("%s: vsync'ed blit is still busy, postponing\n",
+ 		     __FUNCTION__));
+-
+-		VG_CLEAR(vbl);
+-		vbl.request.type =
+-			DRM_VBLANK_RELATIVE |
+-			DRM_VBLANK_EVENT;
+-		vbl.request.sequence = 1;
+-		vbl.request.signal = (uintptr_t)info;
+-		assert(info->queued);
+-		if (!sna_wait_vblank(sna, &vbl, info->pipe))
++		if (sna_next_vblank(info))
+ 			return false;
++
++		kgem_bo_sync__gtt(&info->sna->kgem, info->bo);
+ 	}
+ 
+ 	DBG(("%s: blit finished\n", __FUNCTION__));
++	kgem_bo_destroy(&info->sna->kgem, info->bo);
++	info->bo = NULL;
++
+ 	return true;
+ }
+ 
+@@ -2128,11 +2585,12 @@ void sna_dri2_vblank_handler(struct drm_event_vblank *event)
+ 	struct sna_dri2_event *info = (void *)(uintptr_t)event->user_data;
+ 	struct sna *sna = info->sna;
+ 	DrawablePtr draw;
+-	union drm_wait_vblank vbl;
+ 	uint64_t msc;
+ 
+-	DBG(("%s(type=%d, sequence=%d)\n", __FUNCTION__, info->type, event->sequence));
++	DBG(("%s(type=%d, sequence=%d, draw=%ld)\n", __FUNCTION__, info->type, event->sequence, info->draw ? info->draw->serialNumber : 0));
+ 	assert(info->queued);
++	info->queued = false;
++
+ 	msc = sna_crtc_record_event(info->crtc, event);
+ 
+ 	draw = info->draw;
+@@ -2141,68 +2599,120 @@ void sna_dri2_vblank_handler(struct drm_event_vblank *event)
+ 		goto done;
+ 	}
+ 
++	assert((info->front == NULL && info->back == NULL) || info->front != info->back);
+ 	switch (info->type) {
+ 	case FLIP:
+ 		/* If we can still flip... */
++		assert(info->signal);
+ 		if (can_flip(sna, draw, info->front, info->back, info->crtc) &&
+ 		    sna_dri2_flip(info))
+ 			return;
+ 
+ 		/* else fall through to blit */
+ 	case SWAP:
+-		assert(info->queued);
+-		if (sna->mode.shadow && !sna->mode.shadow_damage) {
+-			/* recursed from wait_for_shadow(), simply requeue */
+-			DBG(("%s -- recursed from wait_for_shadow(), requeuing\n", __FUNCTION__));
+-
+-		} else if (can_xchg(info->sna, draw, info->front, info->back)) {
++		assert(info->signal);
++		if (can_xchg(info->sna, draw, info->front, info->back)) {
+ 			sna_dri2_xchg(draw, info->front, info->back);
+-			info->type = SWAP_WAIT;
+-		} else if (can_xchg_crtc(sna, draw, info->front, info->back, info->crtc)) {
+-			sna_dri2_xchg_crtc(sna, draw, info->crtc, info->front, info->back);
+-			info->type = SWAP_WAIT;
++			info->type = SWAP_COMPLETE;
++		} else if (can_xchg_crtc(sna, draw, info->crtc,
++					 info->front, info->back)) {
++			sna_dri2_xchg_crtc(sna, draw, info->crtc,
++					   info->front, info->back);
++			info->type = SWAP_COMPLETE;
+ 		}  else {
+-			assert(info->queued);
+-			info->bo = __sna_dri2_copy_region(sna, draw, NULL,
+-							  info->back, info->front, true);
+-			info->type = SWAP_WAIT;
++			__sna_dri2_copy_event(info, DRI2_BO | DRI2_SYNC);
++			info->type = SWAP_COMPLETE;
+ 		}
+ 
+-		VG_CLEAR(vbl);
+-		vbl.request.type =
+-			DRM_VBLANK_RELATIVE |
+-			DRM_VBLANK_EVENT;
+-		vbl.request.sequence = 1;
+-		vbl.request.signal = (uintptr_t)info;
+-
+-		assert(info->queued);
+-		if (!sna_wait_vblank(sna, &vbl, info->pipe))
++		if (sna_next_vblank(info))
+ 			return;
+ 
+ 		DBG(("%s -- requeue failed, errno=%d\n", __FUNCTION__, errno));
++		assert(info->pending.bo == NULL);
++		assert(info->keepalive == 1);
+ 		/* fall through to SwapComplete */
+-	case SWAP_WAIT:
+-		if (!sna_dri2_blit_complete(sna, info))
+-			return;
+-
+-		DBG(("%s: swap complete, unblocking client (frame=%d, tv=%d.%06d)\n", __FUNCTION__,
+-		     event->sequence, event->tv_sec, event->tv_usec));
+-		frame_swap_complete(info, DRI2_BLIT_COMPLETE);
+-		break;
+-
+-	case SWAP_THROTTLE:
++	case SWAP_COMPLETE:
+ 		DBG(("%s: %d complete, frame=%d tv=%d.%06d\n",
+ 		     __FUNCTION__, info->type,
+ 		     event->sequence, event->tv_sec, event->tv_usec));
+ 
+-		if (xorg_can_triple_buffer()) {
+-			if (!sna_dri2_blit_complete(sna, info))
++		if (info->signal) {
++			if (!sna_dri2_blit_complete(info))
+ 				return;
+ 
+ 			DBG(("%s: triple buffer swap complete, unblocking client (frame=%d, tv=%d.%06d)\n", __FUNCTION__,
+ 			     event->sequence, event->tv_sec, event->tv_usec));
+ 			frame_swap_complete(info, DRI2_BLIT_COMPLETE);
+ 		}
++
++		if (info->pending.bo) {
++			struct copy current_back;
++
++			DBG(("%s: swapping back handle=%d [name=%d, active=%d] for pending handle=%d [name=%d, active=%d], front handle=%d [name=%d, active=%d]\n",
++			     __FUNCTION__,
++			     get_private(info->back)->bo->handle, info->back->name, get_private(info->back)->bo->active_scanout,
++			     info->pending.bo->handle, info->pending.name, info->pending.bo->active_scanout,
++			     get_private(info->front)->bo->handle, info->front->name, get_private(info->front)->bo->active_scanout));
++
++			assert(info->pending.bo->active_scanout > 0);
++			info->pending.bo->active_scanout--;
++
++			current_back.bo = get_private(info->back)->bo;
++			current_back.size = get_private(info->back)->size;
++			current_back.name = info->back->name;
++			current_back.flags = info->back->flags;
++
++			get_private(info->back)->bo = info->pending.bo;
++			get_private(info->back)->size = info->pending.size;
++			info->back->name = info->pending.name;
++			info->back->pitch = info->pending.bo->pitch;
++			info->back->flags = info->pending.flags;
++			info->pending.bo = NULL;
++
++			assert(get_private(info->back)->bo != get_private(info->front)->bo);
++
++			if (can_xchg(info->sna, info->draw, info->front, info->back))
++				sna_dri2_xchg(info->draw, info->front, info->back);
++			else if (can_xchg_crtc(info->sna, info->draw, info->crtc,
++						 info->front, info->back))
++				sna_dri2_xchg_crtc(info->sna, info->draw, info->crtc,
++						   info->front, info->back);
++			else
++				__sna_dri2_copy_event(info, info->sync | DRI2_BO);
++
++			sna_dri2_cache_bo(info->sna, info->draw,
++					  get_private(info->back)->bo,
++					  info->back->name,
++					  get_private(info->back)->size,
++					  info->back->flags);
++
++			get_private(info->back)->bo = current_back.bo;
++			get_private(info->back)->size = current_back.size;
++			info->back->name = current_back.name;
++			info->back->pitch = current_back.bo->pitch;
++			info->back->flags = current_back.flags;
++
++			DBG(("%s: restored current back handle=%d [name=%d, active=%d], active=%d], front handle=%d [name=%d, active=%d]\n",
++			     __FUNCTION__,
++			     get_private(info->back)->bo->handle, info->back->name, get_private(info->back)->bo->active_scanout,
++			     get_private(info->front)->bo->handle, info->front->name, get_private(info->front)->bo->active_scanout));
++
++			assert(info->draw);
++			assert(!info->signal);
++			info->keepalive++;
++			info->signal = true;
++		}
++
++		if (--info->keepalive) {
++			if (sna_next_vblank(info))
++				return;
++
++			if (info->signal) {
++				DBG(("%s: triple buffer swap complete, unblocking client (frame=%d, tv=%d.%06d)\n", __FUNCTION__,
++				     event->sequence, event->tv_sec, event->tv_usec));
++				frame_swap_complete(info, DRI2_BLIT_COMPLETE);
++			}
++		}
+ 		break;
+ 
+ 	case WAITMSC:
+@@ -2218,11 +2728,11 @@ void sna_dri2_vblank_handler(struct drm_event_vblank *event)
+ 	}
+ 
+ 	if (info->chain) {
++		DBG(("%s: continuing chain\n", __FUNCTION__));
+ 		assert(info->chain != info);
+ 		assert(info->draw == draw);
+-		sna_dri2_remove_event((WindowPtr)draw, info);
++		sna_dri2_remove_event(info);
+ 		chain_swap(info->chain);
+-		info->draw = NULL;
+ 	}
+ 
+ done:
+@@ -2230,101 +2740,148 @@ done:
+ 	DBG(("%s complete\n", __FUNCTION__));
+ }
+ 
+-static bool
++static void
+ sna_dri2_immediate_blit(struct sna *sna,
+ 			struct sna_dri2_event *info,
+-			bool sync, bool event)
++			bool sync)
+ {
+-	DrawablePtr draw = info->draw;
+-	bool ret = false;
++	struct sna_dri2_event *chain = dri2_chain(info->draw);
+ 
+ 	if (sna->flags & SNA_NO_WAIT)
+ 		sync = false;
+ 
+-	DBG(("%s: emitting immediate blit, throttling client, synced? %d, chained? %d, send-event? %d\n",
+-	     __FUNCTION__, sync, dri2_chain(draw) != info,
+-	     event));
++	DBG(("%s: emitting immediate blit, throttling client, synced? %d, chained? %d, pipe %d\n",
++	     __FUNCTION__, sync, chain != info, info->pipe));
++	assert(chain);
+ 
+-	info->type = SWAP_THROTTLE;
+-	if (!sync || dri2_chain(draw) == info) {
+-		DBG(("%s: no pending blit, starting chain\n",
+-		     __FUNCTION__));
++	info->type = SWAP_COMPLETE;
++	info->sync = sync;
++	info->keepalive = KEEPALIVE;
+ 
+-		info->queued = true;
+-		info->bo = __sna_dri2_copy_region(sna, draw, NULL,
+-						  info->back,
+-						  info->front,
+-						  sync);
+-		if (event) {
+-			if (sync) {
+-				union drm_wait_vblank vbl;
+-
+-				VG_CLEAR(vbl);
+-				vbl.request.type =
+-					DRM_VBLANK_RELATIVE |
+-					DRM_VBLANK_EVENT;
+-				vbl.request.sequence = 1;
+-				vbl.request.signal = (uintptr_t)info;
+-				ret = !sna_wait_vblank(sna, &vbl, info->pipe);
+-				if (ret)
+-					event = !swap_limit(draw, 2);
+-			}
+-			if (event) {
+-				DBG(("%s: fake triple buffering, unblocking client\n", __FUNCTION__));
+-				frame_swap_complete(info, DRI2_BLIT_COMPLETE);
+-			}
++	if (chain == info) {
++		DBG(("%s: no pending blit, starting chain\n", __FUNCTION__));
++
++		assert(info->front != info->back);
++		if (can_xchg(info->sna, info->draw, info->front, info->back)) {
++			sna_dri2_xchg(info->draw, info->front, info->back);
++		} else if (can_xchg_crtc(info->sna, info->draw, info->crtc,
++					 info->front, info->back)) {
++			sna_dri2_xchg_crtc(info->sna, info->draw, info->crtc,
++					   info->front, info->back);
++		} else
++			__sna_dri2_copy_event(info, sync | DRI2_BO);
++
++		assert(info->signal);
++
++		if ((!swap_limit(info->draw, 2 + !sync) && !sync) ||
++		    !sna_next_vblank(info)) {
++			DBG(("%s: fake triple buffering, unblocking client\n", __FUNCTION__));
++			frame_swap_complete(info, DRI2_BLIT_COMPLETE);
++			sna_dri2_event_free(info);
++		}
++		return;
++	}
++
++	DBG(("%s: current event front=%d [name=%d, active?=%d], back=%d [name=%d, active?=%d]\n", __FUNCTION__,
++	     get_private(chain->front)->bo->handle, chain->front->name, get_private(chain->front)->bo->active_scanout,
++	     get_private(chain->back)->bo->handle, chain->back->name, get_private(chain->back)->bo->active_scanout));
++
++	if (chain->type == SWAP_COMPLETE && chain->front == info->front) {
++		assert(chain->draw == info->draw);
++		assert(chain->client == info->client);
++		assert(chain->event_complete == info->event_complete);
++		assert(chain->event_data == info->event_data);
++		assert(chain->queued);
++
++		if ((!sync || !chain->sync) && chain->pending.bo) {
++			bool signal = chain->signal;
++
++			DBG(("%s: swap elision, unblocking client\n", __FUNCTION__));
++			assert(chain->draw);
++			chain->signal = true;
++			frame_swap_complete(chain, DRI2_EXCHANGE_COMPLETE);
++			chain->signal = signal;
++
++			assert(chain->pending.bo->active_scanout > 0);
++			chain->pending.bo->active_scanout--;
++
++			sna_dri2_cache_bo(chain->sna, chain->draw,
++					  chain->pending.bo,
++					  chain->pending.name,
++					  chain->pending.size,
++					  chain->pending.flags);
++			chain->pending.bo = NULL;
++		}
++
++		if (chain->pending.bo == NULL && swap_limit(info->draw, 2 + !sync)) {
++			DBG(("%s: setting handle=%d as pending blit (current event front=%d, back=%d)\n", __FUNCTION__,
++			     get_private(info->back)->bo->handle,
++			     get_private(chain->front)->bo->handle,
++			     get_private(chain->back)->bo->handle));
++			chain->pending.bo = ref(get_private(info->back)->bo);
++			chain->pending.size = get_private(info->back)->size;
++			chain->pending.name = info->back->name;
++			chain->pending.flags = info->back->flags;
++			chain->sync = sync;
++			info->signal = false; /* transfer signal to pending */
++
++			/* Prevent us from handing it back on next GetBuffers */
++			chain->pending.bo->active_scanout++;
++
++			sna_dri2_event_free(info);
++			return;
+ 		}
+-	} else {
+-		DBG(("%s: pending blit, chained\n", __FUNCTION__));
+-		ret = true;
+ 	}
+ 
+-	DBG(("%s: continue? %d\n", __FUNCTION__, ret));
+-	return ret;
++	DBG(("%s: pending blit, chained\n", __FUNCTION__));
+ }
+ 
+ static bool
+ sna_dri2_flip_continue(struct sna_dri2_event *info)
+ {
+-	DBG(("%s(mode=%d)\n", __FUNCTION__, info->mode));
++	struct kgem_bo *bo = get_private(info->front)->bo;
+ 
+-	if (info->mode > 0){
+-		struct kgem_bo *bo = get_private(info->front)->bo;
++	DBG(("%s(mode=%d)\n", __FUNCTION__, info->flip_continue));
++	assert(info->flip_continue > 0);
++	info->type = info->flip_continue;
++	info->flip_continue = 0;
+ 
+-		info->type = info->mode;
++	assert(!info->signal);
++	info->signal = info->type == FLIP_THROTTLE && info->draw;
+ 
+-		if (bo != sna_pixmap(info->sna->front)->gpu_bo)
+-			return false;
++	if (info->sna->mode.front_active == 0)
++		return false;
+ 
+-		if (!sna_page_flip(info->sna, bo, sna_dri2_flip_handler, info))
+-			return false;
++	if (bo != sna_pixmap(info->sna->front)->gpu_bo)
++		return false;
+ 
+-		assert(info->sna->dri2.flip_pending == NULL ||
+-		       info->sna->dri2.flip_pending == info);
+-		info->sna->dri2.flip_pending = info;
+-		assert(info->queued);
+-	} else {
+-		info->type = -info->mode;
++	assert(!info->queued);
++	if (!sna_page_flip(info->sna, bo, sna_dri2_flip_handler, info))
++		return false;
+ 
+-		if (!info->draw)
+-			return false;
++	DBG(("%s: queued flip=%p\n", __FUNCTION__, info));
++	assert(info->sna->dri2.flip_pending == NULL ||
++	       info->sna->dri2.flip_pending == info);
++	info->sna->dri2.flip_pending = info;
++	info->queued = true;
+ 
+-		if (!can_flip(info->sna, info->draw, info->front, info->back, info->crtc))
+-			return false;
++	return true;
++}
+ 
+-		assert(sna_pixmap_get_buffer(get_drawable_pixmap(info->draw)) == info->front);
+-		if (!sna_dri2_flip(info))
+-			return false;
++static bool
++sna_dri2_flip_keepalive(struct sna_dri2_event *info)
++{
++	DBG(("%s(keepalive?=%d)\n", __FUNCTION__, info->keepalive-1));
++	assert(info->keepalive > 0);
++	if (!--info->keepalive)
++		return false;
+ 
+-		if (!xorg_can_triple_buffer()) {
+-			sna_dri2_get_back(info->sna, info->draw, info->back, info);
+-			DBG(("%s: fake triple buffering, unblocking client\n", __FUNCTION__));
+-			frame_swap_complete(info, DRI2_FLIP_COMPLETE);
+-		}
+-	}
++	if (info->draw == NULL)
++		return false;
+ 
+-	info->mode = 0;
+-	return true;
++	DBG(("%s: marking next flip as complete\n", __FUNCTION__));
++	info->flip_continue = FLIP_COMPLETE;
++	return sna_dri2_flip_continue(info);
+ }
+ 
+ static void chain_flip(struct sna *sna)
+@@ -2332,8 +2889,8 @@ static void chain_flip(struct sna *sna)
+ 	struct sna_dri2_event *chain = sna->dri2.flip_pending;
+ 
+ 	assert(chain->type == FLIP);
+-	DBG(("%s: chaining type=%d, cancelled?=%d\n",
+-	     __FUNCTION__, chain->type, chain->draw == NULL));
++	DBG(("%s: chaining type=%d, cancelled?=%d window=%ld\n",
++	     __FUNCTION__, chain->type, chain->draw == NULL, chain->draw ? chain->draw->id : 0));
+ 
+ 	sna->dri2.flip_pending = NULL;
+ 	if (chain->draw == NULL) {
+@@ -2343,31 +2900,18 @@ static void chain_flip(struct sna *sna)
+ 
+ 	assert(chain == dri2_chain(chain->draw));
+ 	assert(!chain->queued);
+-	chain->queued = true;
+ 
+ 	if (can_flip(sna, chain->draw, chain->front, chain->back, chain->crtc) &&
+ 	    sna_dri2_flip(chain)) {
+ 		DBG(("%s: performing chained flip\n", __FUNCTION__));
+ 	} else {
+ 		DBG(("%s: emitting chained vsync'ed blit\n", __FUNCTION__));
+-		chain->bo = __sna_dri2_copy_region(sna, chain->draw, NULL,
+-						  chain->back, chain->front,
+-						  true);
++		__sna_dri2_copy_event(chain, DRI2_SYNC);
+ 
+ 		if (xorg_can_triple_buffer()) {
+-			union drm_wait_vblank vbl;
+-
+-			VG_CLEAR(vbl);
+-
+-			chain->type = SWAP_WAIT;
+-			vbl.request.type =
+-				DRM_VBLANK_RELATIVE |
+-				DRM_VBLANK_EVENT;
+-			vbl.request.sequence = 1;
+-			vbl.request.signal = (uintptr_t)chain;
+-
+-			assert(chain->queued);
+-			if (!sna_wait_vblank(sna, &vbl, chain->pipe))
++			chain->type = SWAP_COMPLETE;
++			assert(chain->signal);
++			if (sna_next_vblank(chain))
+ 				return;
+ 		}
+ 
+@@ -2381,8 +2925,10 @@ static void sna_dri2_flip_event(struct sna_dri2_event *flip)
+ {
+ 	struct sna *sna = flip->sna;
+ 
+-	DBG(("%s(pipe=%d, event=%d)\n", __FUNCTION__, flip->pipe, flip->type));
+-	assert(flip->queued);
++	DBG(("%s flip=%p (pipe=%d, event=%d, queued?=%d)\n", __FUNCTION__, flip, flip->pipe, flip->type, flip->queued));
++	if (!flip->queued) /* pageflip died whilst being queued */
++		return;
++	flip->queued = false;
+ 
+ 	if (sna->dri2.flip_pending == flip)
+ 		sna->dri2.flip_pending = NULL;
+@@ -2390,8 +2936,10 @@ static void sna_dri2_flip_event(struct sna_dri2_event *flip)
+ 	/* We assume our flips arrive in order, so we don't check the frame */
+ 	switch (flip->type) {
+ 	case FLIP:
+-		DBG(("%s: swap complete, unblocking client\n", __FUNCTION__));
+-		frame_swap_complete(flip, DRI2_FLIP_COMPLETE);
++		if (flip->signal) {
++			DBG(("%s: swap complete, unblocking client\n", __FUNCTION__));
++			frame_swap_complete(flip, DRI2_FLIP_COMPLETE);
++		}
+ 		sna_dri2_event_free(flip);
+ 
+ 		if (sna->dri2.flip_pending)
+@@ -2399,27 +2947,35 @@ static void sna_dri2_flip_event(struct sna_dri2_event *flip)
+ 		break;
+ 
+ 	case FLIP_THROTTLE:
+-		DBG(("%s: triple buffer swap complete, unblocking client\n", __FUNCTION__));
+-		frame_swap_complete(flip, DRI2_FLIP_COMPLETE);
++		if (flip->signal) {
++			DBG(("%s: triple buffer swap complete, unblocking client\n", __FUNCTION__));
++			frame_swap_complete(flip, DRI2_FLIP_COMPLETE);
++		}
+ 	case FLIP_COMPLETE:
++		assert(!flip->signal);
+ 		if (sna->dri2.flip_pending) {
++			DBG(("%s: pending flip\n", __FUNCTION__));
+ 			sna_dri2_event_free(flip);
+ 			chain_flip(sna);
+-		} else if (!flip->mode) {
++		} else if (!flip->flip_continue) {
+ 			DBG(("%s: flip chain complete\n", __FUNCTION__));
++			if (!sna_dri2_flip_keepalive(flip)) {
++				if (flip->chain) {
++					sna_dri2_remove_event(flip);
++					chain_swap(flip->chain);
++				}
+ 
+-			if (flip->chain) {
+-				sna_dri2_remove_event((WindowPtr)flip->draw,
+-						      flip);
+-				chain_swap(flip->chain);
+-				flip->draw = NULL;
++				sna_dri2_event_free(flip);
+ 			}
+-
+-			sna_dri2_event_free(flip);
+ 		} else if (!sna_dri2_flip_continue(flip)) {
+ 			DBG(("%s: no longer able to flip\n", __FUNCTION__));
+-			if (flip->draw == NULL || !sna_dri2_immediate_blit(sna, flip, false, flip->mode < 0))
+-				sna_dri2_event_free(flip);
++			if (flip->draw != NULL)
++				__sna_dri2_copy_event(flip, 0);
++			if (flip->signal) {
++				DBG(("%s: fake triple buffering, unblocking client\n", __FUNCTION__));
++				frame_swap_complete(flip, DRI2_BLIT_COMPLETE);
++			}
++			sna_dri2_event_free(flip);
+ 		}
+ 		break;
+ 
+@@ -2433,17 +2989,27 @@ static void sna_dri2_flip_event(struct sna_dri2_event *flip)
+ 	}
+ }
+ 
++static int
++sna_query_vblank(struct sna *sna, xf86CrtcPtr crtc, union drm_wait_vblank *vbl)
++{
++	VG_CLEAR(*vbl);
++	vbl->request.type =
++		_DRM_VBLANK_RELATIVE | pipe_select(sna_crtc_pipe(crtc));
++	vbl->request.sequence = 0;
++
++	return drmIoctl(sna->kgem.fd, DRM_IOCTL_WAIT_VBLANK, vbl);
++}
++
+ static uint64_t
+ get_current_msc(struct sna *sna, DrawablePtr draw, xf86CrtcPtr crtc)
+ {
+ 	union drm_wait_vblank vbl;
+-	uint64_t ret = -1;
++	uint64_t ret;
+ 
+-	VG_CLEAR(vbl);
+-	vbl.request.type = _DRM_VBLANK_RELATIVE;
+-	vbl.request.sequence = 0;
+-	if (sna_wait_vblank(sna, &vbl, sna_crtc_to_pipe(crtc)) == 0)
++	if (sna_query_vblank(sna, crtc, &vbl) == 0)
+ 		ret = sna_crtc_record_vblank(crtc, &vbl);
++	else
++		ret = sna_crtc_last_swap(crtc)->msc;
+ 
+ 	return draw_current_msc(draw, crtc, ret);
+ }
+@@ -2494,12 +3060,18 @@ static int use_triple_buffer(struct sna *sna, ClientPtr client, bool async)
+ }
+ 
+ static bool immediate_swap(struct sna *sna,
+-			   uint64_t target_msc,
+-			   uint64_t divisor,
+ 			   DrawablePtr draw,
+ 			   xf86CrtcPtr crtc,
++			   uint64_t *target_msc,
++			   uint64_t divisor,
++			   uint64_t remainder,
+ 			   uint64_t *current_msc)
+ {
++	/*
++	 * If divisor is zero, or current_msc is smaller than target_msc
++	 * we just need to make sure target_msc passes before initiating
++	 * the swap.
++	 */
+ 	if (divisor == 0) {
+ 		*current_msc = -1;
+ 
+@@ -2508,72 +3080,97 @@ static bool immediate_swap(struct sna *sna,
+ 			return true;
+ 		}
+ 
+-		if (target_msc)
++		if (*target_msc)
+ 			*current_msc = get_current_msc(sna, draw, crtc);
+ 
+ 		DBG(("%s: current_msc=%ld, target_msc=%ld -- %s\n",
+-		     __FUNCTION__, (long)*current_msc, (long)target_msc,
+-		     (*current_msc >= target_msc - 1) ? "yes" : "no"));
+-		return *current_msc >= target_msc - 1;
++		     __FUNCTION__, (long)*current_msc, (long)*target_msc,
++		     (*current_msc >= *target_msc - 1) ? "yes" : "no"));
++		return *current_msc >= *target_msc - 1;
+ 	}
+ 
+ 	DBG(("%s: explicit waits requests, divisor=%ld\n",
+ 	     __FUNCTION__, (long)divisor));
+ 	*current_msc = get_current_msc(sna, draw, crtc);
+-	return false;
++	if (*current_msc >= *target_msc) {
++		DBG(("%s: missed target, queueing event for next: current=%lld, target=%lld, divisor=%lld, remainder=%lld\n",
++		     __FUNCTION__,
++		     (long long)*current_msc,
++		     (long long)*target_msc,
++		     (long long)divisor,
++		     (long long)remainder));
++
++		*target_msc = *current_msc + remainder - *current_msc % divisor;
++		if (*target_msc <= *current_msc)
++			*target_msc += divisor;
++	}
++
++	DBG(("%s: target_msc=%lld, current_msc=%lld, immediate?=%d\n",
++	     __FUNCTION__, (long long)*target_msc, (long long)*current_msc,
++	     *current_msc >= *target_msc - 1));
++	return *current_msc >= *target_msc - 1;
+ }
+ 
+ static bool
+ sna_dri2_schedule_flip(ClientPtr client, DrawablePtr draw, xf86CrtcPtr crtc,
+ 		       DRI2BufferPtr front, DRI2BufferPtr back,
+-		       CARD64 *target_msc, CARD64 divisor, CARD64 remainder,
++		       bool immediate, CARD64 *target_msc, CARD64 current_msc,
+ 		       DRI2SwapEventPtr func, void *data)
+ {
+ 	struct sna *sna = to_sna_from_drawable(draw);
+ 	struct sna_dri2_event *info;
+-	uint64_t current_msc;
+-
+-	if (immediate_swap(sna, *target_msc, divisor, draw, crtc, &current_msc)) {
+-		int type;
+ 
++	if (immediate) {
++		bool signal = false;
+ 		info = sna->dri2.flip_pending;
+ 		DBG(("%s: performing immediate swap on pipe %d, pending? %d, mode: %d, continuation? %d\n",
+-		     __FUNCTION__, sna_crtc_to_pipe(crtc),
+-		     info != NULL, info ? info->mode : 0,
++		     __FUNCTION__, sna_crtc_pipe(crtc),
++		     info != NULL, info ? info->flip_continue : 0,
+ 		     info && info->draw == draw));
+ 
+ 		if (info && info->draw == draw) {
+ 			assert(info->type != FLIP);
+-			assert(info->front == front);
++			assert(info->queued);
++			assert(info->front != info->back);
++			if (info->front != front) {
++				assert(info->front != NULL);
++				_sna_dri2_destroy_buffer(sna, draw, info->front);
++				info->front = sna_dri2_reference_buffer(front);
++			}
+ 			if (info->back != back) {
+-				_sna_dri2_destroy_buffer(sna, info->back);
++				assert(info->back != NULL);
++				_sna_dri2_destroy_buffer(sna, draw, info->back);
+ 				info->back = sna_dri2_reference_buffer(back);
+ 			}
+-			if (info->mode || current_msc >= *target_msc) {
+-				DBG(("%s: executing xchg of pending flip\n",
+-				     __FUNCTION__));
+-				sna_dri2_xchg(draw, front, back);
+-				info->mode = type = FLIP_COMPLETE;
+-				goto new_back;
+-			} else {
++			assert(info->front != info->back);
++			DBG(("%s: executing xchg of pending flip: flip_continue=%d, keepalive=%d, chain?=%d\n", __FUNCTION__, info->flip_continue, info->keepalive, current_msc < *target_msc));
++			sna_dri2_xchg(draw, front, back);
++			info->keepalive = KEEPALIVE;
++			if (xorg_can_triple_buffer() &&
++			    current_msc < *target_msc) {
+ 				DBG(("%s: chaining flip\n", __FUNCTION__));
+-				type = FLIP_THROTTLE;
+-				if (xorg_can_triple_buffer())
+-					info->mode = -type;
+-				else
+-					info->mode = -FLIP_COMPLETE;
++				info->flip_continue = FLIP_THROTTLE;
+ 				goto out;
++			} else {
++				info->flip_continue = FLIP_COMPLETE;
++				signal = info->signal;
++				assert(info->draw);
++				info->signal = true;
++				goto new_back;
+ 			}
+ 		}
+ 
+-		info = sna_dri2_add_event(sna, draw, client);
++		info = sna_dri2_add_event(sna, draw, client, crtc);
+ 		if (info == NULL)
+ 			return false;
+ 
+ 		assert(info->crtc == crtc);
+ 		info->event_complete = func;
+ 		info->event_data = data;
++		assert(info->draw);
++		info->signal = true;
+ 
++		assert(front != back);
+ 		info->front = sna_dri2_reference_buffer(front);
+ 		info->back = sna_dri2_reference_buffer(back);
+ 
+@@ -2584,26 +3181,33 @@ sna_dri2_schedule_flip(ClientPtr client, DrawablePtr draw, xf86CrtcPtr crtc,
+ 			 */
+ 			DBG(("%s: queueing flip after pending completion\n",
+ 			     __FUNCTION__));
+-			info->type = type = FLIP;
++			info->type = FLIP;
+ 			sna->dri2.flip_pending = info;
+-			assert(info->queued);
+ 			current_msc++;
++		} else if (sna->mode.flip_active) {
++			DBG(("%s: %d outstanding flips from old client, queueing\n",
++			     __FUNCTION__, sna->mode.flip_active));
++			goto queue;
+ 		} else {
+-			info->type = type = use_triple_buffer(sna, client, *target_msc == 0);
++			info->type = use_triple_buffer(sna, client, *target_msc == 0);
+ 			if (!sna_dri2_flip(info)) {
+ 				DBG(("%s: flip failed, falling back\n", __FUNCTION__));
++				info->signal = false;
+ 				sna_dri2_event_free(info);
+ 				return false;
+ 			}
++			assert(get_private(info->front)->bo->active_scanout);
+ 		}
+ 
+-		swap_limit(draw, 1 + (type == FLIP_THROTTLE));
+-		if (type >= FLIP_COMPLETE) {
++		swap_limit(draw, 1 + (info->type == FLIP_THROTTLE));
++		if (info->type >= FLIP_COMPLETE) {
+ new_back:
+ 			if (!xorg_can_triple_buffer())
+-				sna_dri2_get_back(sna, draw, back, info);
++				sna_dri2_get_back(sna, draw, back);
+ 			DBG(("%s: fake triple buffering, unblocking client\n", __FUNCTION__));
+ 			frame_swap_complete(info, DRI2_EXCHANGE_COMPLETE);
++			assert(info->draw);
++			info->signal = signal;
+ 			if (info->type == FLIP_ASYNC)
+ 				sna_dri2_event_free(info);
+ 		}
+@@ -2613,57 +3217,34 @@ out:
+ 		return true;
+ 	}
+ 
+-	info = sna_dri2_add_event(sna, draw, client);
++queue:
++	if (KEEPALIVE > 1 && sna->dri2.flip_pending) {
++		info = sna->dri2.flip_pending;
++		info->keepalive = 1;
++	}
++
++	info = sna_dri2_add_event(sna, draw, client, crtc);
+ 	if (info == NULL)
+ 		return false;
+ 
+ 	assert(info->crtc == crtc);
+ 	info->event_complete = func;
+ 	info->event_data = data;
++	assert(info->draw);
++	info->signal = true;
+ 	info->type = FLIP;
+ 
++	assert(front != back);
+ 	info->front = sna_dri2_reference_buffer(front);
+ 	info->back = sna_dri2_reference_buffer(back);
+ 
+-	/*
+-	 * If divisor is zero, or current_msc is smaller than target_msc
+-	 * we just need to make sure target_msc passes before initiating
+-	 * the swap.
+-	 */
+-	if (divisor && current_msc >= *target_msc) {
+-		DBG(("%s: missed target, queueing event for next: current=%lld, target=%lld, divisor=%lld, remainder=%lld\n",
+-		     __FUNCTION__,
+-		     (long long)current_msc,
+-		     (long long)*target_msc,
+-		     (long long)divisor,
+-		     (long long)remainder));
+-
+-		*target_msc = current_msc + remainder - current_msc % divisor;
+-		if (*target_msc <= current_msc)
+-			*target_msc += divisor;
+-	}
+-
+-	if (*target_msc <= current_msc + 1) {
+-		if (!sna_dri2_flip(info)) {
+-			sna_dri2_event_free(info);
+-			return false;
+-		}
++	if (*target_msc <= current_msc + 1 && sna_dri2_flip(info)) {
+ 		*target_msc = current_msc + 1;
+ 	} else {
+-		union drm_wait_vblank vbl;
+-
+-		VG_CLEAR(vbl);
+-
+-		vbl.request.type =
+-			DRM_VBLANK_ABSOLUTE |
+-			DRM_VBLANK_EVENT;
+-
+ 		/* Account for 1 frame extra pageflip delay */
+-		vbl.reply.sequence = draw_target_seq(draw, *target_msc - 1);
+-		vbl.request.signal = (uintptr_t)info;
+-
+-		info->queued = true;
+-		if (sna_wait_vblank(sna, &vbl, info->pipe)) {
++		if (!sna_wait_vblank(info,
++				     draw_target_seq(draw, *target_msc - 1))) {
++			info->signal = false;
+ 			sna_dri2_event_free(info);
+ 			return false;
+ 		}
+@@ -2674,128 +3255,6 @@ out:
+ 	return true;
+ }
+ 
+-static bool
+-sna_dri2_schedule_xchg(ClientPtr client, DrawablePtr draw, xf86CrtcPtr crtc,
+-		       DRI2BufferPtr front, DRI2BufferPtr back,
+-		       CARD64 *target_msc, CARD64 divisor, CARD64 remainder,
+-		       DRI2SwapEventPtr func, void *data)
+-{
+-	struct sna *sna = to_sna_from_drawable(draw);
+-	uint64_t current_msc;
+-	bool sync, event;
+-
+-	if (!immediate_swap(sna, *target_msc, divisor, draw, crtc, &current_msc))
+-		return false;
+-
+-	sync = current_msc < *target_msc;
+-	event = dri2_chain(draw) == NULL;
+-	if (!sync || event) {
+-		DBG(("%s: performing immediate xchg on pipe %d\n",
+-		     __FUNCTION__, sna_crtc_to_pipe(crtc)));
+-		sna_dri2_xchg(draw, front, back);
+-	}
+-	if (sync) {
+-		struct sna_dri2_event *info;
+-
+-		info = sna_dri2_add_event(sna, draw, client);
+-		if (!info)
+-			goto complete;
+-
+-		info->event_complete = func;
+-		info->event_data = data;
+-
+-		info->front = sna_dri2_reference_buffer(front);
+-		info->back = sna_dri2_reference_buffer(back);
+-		info->type = SWAP_THROTTLE;
+-
+-		if (event) {
+-			union drm_wait_vblank vbl;
+-
+-			VG_CLEAR(vbl);
+-			vbl.request.type =
+-				DRM_VBLANK_RELATIVE |
+-				DRM_VBLANK_EVENT;
+-			vbl.request.sequence = 1;
+-			vbl.request.signal = (uintptr_t)info;
+-
+-			info->queued = true;
+-			if (sna_wait_vblank(sna, &vbl, info->pipe)) {
+-				sna_dri2_event_free(info);
+-				goto complete;
+-			}
+-
+-			swap_limit(draw, 2);
+-		}
+-	} else {
+-complete:
+-		fake_swap_complete(sna, client, draw, crtc, DRI2_EXCHANGE_COMPLETE, func, data);
+-	}
+-
+-	*target_msc = current_msc + 1;
+-	return true;
+-}
+-
+-static bool
+-sna_dri2_schedule_xchg_crtc(ClientPtr client, DrawablePtr draw, xf86CrtcPtr crtc,
+-			    DRI2BufferPtr front, DRI2BufferPtr back,
+-			    CARD64 *target_msc, CARD64 divisor, CARD64 remainder,
+-			    DRI2SwapEventPtr func, void *data)
+-{
+-	struct sna *sna = to_sna_from_drawable(draw);
+-	uint64_t current_msc;
+-	bool sync, event;
+-
+-	if (!immediate_swap(sna, *target_msc, divisor, draw, crtc, &current_msc))
+-		return false;
+-
+-	sync = current_msc < *target_msc;
+-	event = dri2_chain(draw) == NULL;
+-	if (!sync || event) {
+-		DBG(("%s: performing immediate xchg only on pipe %d\n",
+-		     __FUNCTION__, sna_crtc_to_pipe(crtc)));
+-		sna_dri2_xchg_crtc(sna, draw, crtc, front, back);
+-	}
+-	if (sync) {
+-		struct sna_dri2_event *info;
+-
+-		info = sna_dri2_add_event(sna, draw, client);
+-		if (!info)
+-			goto complete;
+-
+-		info->event_complete = func;
+-		info->event_data = data;
+-
+-		info->front = sna_dri2_reference_buffer(front);
+-		info->back = sna_dri2_reference_buffer(back);
+-		info->type = SWAP_THROTTLE;
+-
+-		if (event) {
+-			union drm_wait_vblank vbl;
+-
+-			VG_CLEAR(vbl);
+-			vbl.request.type =
+-				DRM_VBLANK_RELATIVE |
+-				DRM_VBLANK_EVENT;
+-			vbl.request.sequence = 1;
+-			vbl.request.signal = (uintptr_t)info;
+-
+-			info->queued = true;
+-			if (sna_wait_vblank(sna, &vbl, info->pipe)) {
+-				sna_dri2_event_free(info);
+-				goto complete;
+-			}
+-
+-			swap_limit(draw, 2);
+-		}
+-	} else {
+-complete:
+-		fake_swap_complete(sna, client, draw, crtc, DRI2_EXCHANGE_COMPLETE, func, data);
+-	}
+-
+-	*target_msc = current_msc + 1;
+-	return true;
+-}
+-
+ static bool has_pending_events(struct sna *sna)
+ {
+ 	struct pollfd pfd;
+@@ -2830,11 +3289,11 @@ sna_dri2_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
+ 		       CARD64 remainder, DRI2SwapEventPtr func, void *data)
+ {
+ 	struct sna *sna = to_sna_from_drawable(draw);
+-	union drm_wait_vblank vbl;
+ 	xf86CrtcPtr crtc = NULL;
+ 	struct sna_dri2_event *info = NULL;
+ 	int type = DRI2_EXCHANGE_COMPLETE;
+ 	CARD64 current_msc;
++	bool immediate;
+ 
+ 	DBG(("%s: draw=%lu %dx%d, pixmap=%ld %dx%d, back=%u (refs=%d/%d, flush=%d) , front=%u (refs=%d/%d, flush=%d)\n",
+ 	     __FUNCTION__,
+@@ -2860,6 +3319,7 @@ sna_dri2_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
+ 	assert(get_private(front)->refcnt);
+ 	assert(get_private(back)->refcnt);
+ 
++	assert(get_private(back)->bo != get_private(front)->bo);
+ 	assert(get_private(front)->bo->refcnt);
+ 	assert(get_private(back)->bo->refcnt);
+ 
+@@ -2876,17 +3336,17 @@ sna_dri2_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
+ 		goto skip;
+ 	}
+ 
+-	assert(sna_pixmap_from_drawable(draw)->flush);
+-
+ 	if (draw->type != DRAWABLE_PIXMAP) {
+ 		WindowPtr win = (WindowPtr)draw;
+ 		struct dri2_window *priv = dri2_window(win);
++
+ 		if (priv->front) {
+-			assert(front == priv->front);
+-			assert(get_private(priv->front)->refcnt > 1);
+-			get_private(priv->front)->refcnt--;
+-			priv->front = NULL;
++			front = priv->front;
++			assert(front->attachment == DRI2BufferFrontLeft);
++			assert(get_private(front)->refcnt);
++			assert(get_private(front)->pixmap == get_drawable_pixmap(draw));
+ 		}
++
+ 		if (win->clipList.extents.x2 <= win->clipList.extents.x1 ||
+ 		    win->clipList.extents.y2 <= win->clipList.extents.y1) {
+ 			DBG(("%s: window clipped (%d, %d), (%d, %d)\n",
+@@ -2899,6 +3359,10 @@ sna_dri2_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
+ 		}
+ 	}
+ 
++	DBG(("%s: using front handle=%d, active_scanout?=%d, flush?=%d\n", __FUNCTION__, get_private(front)->bo->handle, get_private(front)->bo->active_scanout, sna_pixmap_from_drawable(draw)->flush));
++	assert(get_private(front)->bo->active_scanout);
++	assert(sna_pixmap_from_drawable(draw)->flush);
++
+ 	/* Drawable not displayed... just complete the swap */
+ 	if ((sna->flags & SNA_NO_WAIT) == 0)
+ 		crtc = sna_dri2_get_crtc(draw);
+@@ -2914,109 +3378,112 @@ sna_dri2_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
+ 		sna_mode_wakeup(sna);
+ 	}
+ 
+-	if (can_xchg(sna, draw, front, back) &&
+-	    sna_dri2_schedule_xchg(client, draw, crtc, front, back,
++	immediate = immediate_swap(sna, draw, crtc,
+ 				   target_msc, divisor, remainder,
+-				   func, data))
+-		return TRUE;
+-
+-	if (can_xchg_crtc(sna, draw, front, back, crtc) &&
+-	    sna_dri2_schedule_xchg_crtc(client, draw, crtc, front, back,
+-					target_msc, divisor, remainder,
+-					func, data))
+-		return TRUE;
++				   &current_msc);
+ 
+ 	if (can_flip(sna, draw, front, back, crtc) &&
+ 	    sna_dri2_schedule_flip(client, draw, crtc, front, back,
+-				  target_msc, divisor, remainder,
++				  immediate, target_msc, current_msc,
+ 				  func, data))
+ 		return TRUE;
+ 
+-	VG_CLEAR(vbl);
+-
+-	info = sna_dri2_add_event(sna, draw, client);
++	info = sna_dri2_add_event(sna, draw, client, crtc);
+ 	if (!info)
+ 		goto blit;
+ 
+ 	assert(info->crtc == crtc);
+ 	info->event_complete = func;
+ 	info->event_data = data;
++	assert(info->draw);
++	info->signal = true;
+ 
++	assert(front != back);
+ 	info->front = sna_dri2_reference_buffer(front);
+ 	info->back = sna_dri2_reference_buffer(back);
+ 
+-	if (immediate_swap(sna, *target_msc, divisor, draw, crtc, &current_msc)) {
++	if (immediate) {
+ 		bool sync = current_msc < *target_msc;
+-		if (!sna_dri2_immediate_blit(sna, info, sync, true))
+-			sna_dri2_event_free(info);
++		sna_dri2_immediate_blit(sna, info, sync);
+ 		*target_msc = current_msc + sync;
++		DBG(("%s: reported target_msc=%llu\n",
++		     __FUNCTION__, *target_msc));
+ 		return TRUE;
+ 	}
+ 
+-	vbl.request.type =
+-		DRM_VBLANK_ABSOLUTE |
+-		DRM_VBLANK_EVENT;
+-	vbl.request.signal = (uintptr_t)info;
+-
+-	/*
+-	 * If divisor is zero, or current_msc is smaller than target_msc
+-	 * we just need to make sure target_msc passes before initiating
+-	 * the swap.
+-	 */
+ 	info->type = SWAP;
+-	info->queued = true;
+-	if (divisor && current_msc >= *target_msc) {
+-		DBG(("%s: missed target, queueing event for next: current=%lld, target=%lld, divisor=%lld, remainder=%lld\n",
+-		     __FUNCTION__,
+-		     (long long)current_msc,
+-		     (long long)*target_msc,
+-		     (long long)divisor,
+-		     (long long)remainder));
+-
+-		*target_msc = current_msc + remainder - current_msc % divisor;
+-		if (*target_msc <= current_msc)
+-			*target_msc += divisor;
+-	}
+-	vbl.request.sequence = draw_target_seq(draw, *target_msc - 1);
+ 	if (*target_msc <= current_msc + 1) {
+ 		DBG(("%s: performing blit before queueing\n", __FUNCTION__));
+-		assert(info->queued);
+-		info->bo = __sna_dri2_copy_region(sna, draw, NULL,
+-						  back, front,
+-						  true);
+-		info->type = SWAP_WAIT;
+-
+-		vbl.request.type =
+-			DRM_VBLANK_RELATIVE |
+-			DRM_VBLANK_EVENT;
+-		vbl.request.sequence = 1;
++		__sna_dri2_copy_event(info, DRI2_SYNC);
++		info->type = SWAP_COMPLETE;
++		if (!sna_next_vblank(info))
++			goto fake;
++
++		DBG(("%s: reported target_msc=%llu\n",
++		     __FUNCTION__, *target_msc));
+ 		*target_msc = current_msc + 1;
+-	}
++		swap_limit(draw, 2);
++	} else {
++		if (!sna_wait_vblank(info,
++				     draw_target_seq(draw, *target_msc - 1)))
++			goto blit;
+ 
+-	assert(info->queued);
+-	if (sna_wait_vblank(sna, &vbl, info->pipe))
+-		goto blit;
++		DBG(("%s: reported target_msc=%llu (in)\n",
++		     __FUNCTION__, *target_msc));
++		swap_limit(draw, 1);
++	}
+ 
+-	DBG(("%s: reported target_msc=%llu\n", __FUNCTION__, *target_msc));
+-	swap_limit(draw, 1 + (info->type == SWAP_WAIT));
+ 	return TRUE;
+ 
+ blit:
+ 	DBG(("%s -- blit\n", __FUNCTION__));
+-	if (info)
+-		sna_dri2_event_free(info);
+ 	if (can_xchg(sna, draw, front, back)) {
+ 		sna_dri2_xchg(draw, front, back);
+ 	} else {
+-		__sna_dri2_copy_region(sna, draw, NULL, back, front, false);
++		__sna_dri2_copy_region(sna, draw, NULL, back, front, 0);
++		front->flags = back->flags;
+ 		type = DRI2_BLIT_COMPLETE;
+ 	}
++	if (draw->type == DRAWABLE_PIXMAP)
++		goto fake;
+ skip:
+ 	DBG(("%s: unable to show frame, unblocking client\n", __FUNCTION__));
+-	if (crtc == NULL)
+-		crtc = sna_mode_first_crtc(sna);
+-	fake_swap_complete(sna, client, draw, crtc, type, func, data);
+-	*target_msc = 0; /* offscreen, so zero out target vblank count */
++	if (crtc == NULL && (sna->flags & SNA_NO_WAIT) == 0)
++		crtc = sna_primary_crtc(sna);
++	if (crtc && sna_crtc_is_on(crtc)) {
++		if (info == NULL)
++			info = sna_dri2_add_event(sna, draw, client, crtc);
++		if (info != dri2_chain(draw))
++			goto fake;
++
++		assert(info->crtc == crtc);
++
++		info->type = SWAP_COMPLETE;
++		info->event_complete = func;
++		info->event_data = data;
++		assert(info->draw);
++		info->signal = true;
++
++		if (info->front == NULL)
++			info->front = sna_dri2_reference_buffer(front);
++		if (info->back == NULL)
++			info->back = sna_dri2_reference_buffer(back);
++
++		if (!sna_next_vblank(info))
++			goto fake;
++
++		swap_limit(draw, 1);
++	} else {
++fake:
++		/* XXX Use a Timer to throttle the client? */
++		fake_swap_complete(sna, client, draw, crtc, type, func, data);
++		if (info) {
++			assert(info->draw);
++			info->signal = false;
++			sna_dri2_event_free(info);
++		}
++	}
++	DBG(("%s: reported target_msc=%llu (in)\n", __FUNCTION__, *target_msc));
+ 	return TRUE;
+ }
+ 
+@@ -3030,27 +3497,25 @@ sna_dri2_get_msc(DrawablePtr draw, CARD64 *ust, CARD64 *msc)
+ 	struct sna *sna = to_sna_from_drawable(draw);
+ 	xf86CrtcPtr crtc = sna_dri2_get_crtc(draw);
+ 	const struct ust_msc *swap;
++	union drm_wait_vblank vbl;
+ 
+ 	DBG(("%s(draw=%ld, pipe=%d)\n", __FUNCTION__, draw->id,
+-	     crtc ? sna_crtc_to_pipe(crtc) : -1));
++	     crtc ? sna_crtc_pipe(crtc) : -1));
+ 
+-	if (crtc != NULL) {
+-		union drm_wait_vblank vbl;
++	/* Drawable not displayed, make up a *monotonic* value */
++	if (crtc == NULL)
++		crtc = sna_primary_crtc(sna);
++	if (crtc == NULL)
++		return FALSE;
+ 
+-		VG_CLEAR(vbl);
+-		vbl.request.type = _DRM_VBLANK_RELATIVE;
+-		vbl.request.sequence = 0;
+-		if (sna_wait_vblank(sna, &vbl, sna_crtc_to_pipe(crtc)) == 0)
+-			sna_crtc_record_vblank(crtc, &vbl);
+-	} else
+-		/* Drawable not displayed, make up a *monotonic* value */
+-		crtc = sna_mode_first_crtc(sna);
++	if (sna_query_vblank(sna, crtc, &vbl) == 0)
++		sna_crtc_record_vblank(crtc, &vbl);
+ 
+ 	swap = sna_crtc_last_swap(crtc);
+ 	*msc = draw_current_msc(draw, crtc, swap->msc);
+ 	*ust = ust64(swap->tv_sec, swap->tv_usec);
+-	DBG(("%s: msc=%llu, ust=%llu\n", __FUNCTION__,
+-	     (long long)*msc, (long long)*ust));
++	DBG(("%s: msc=%llu [raw=%llu], ust=%llu\n", __FUNCTION__,
++	     (long long)*msc, swap->msc, (long long)*ust));
+ 	return TRUE;
+ }
+ 
+@@ -3068,32 +3533,22 @@ sna_dri2_schedule_wait_msc(ClientPtr client, DrawablePtr draw, CARD64 target_msc
+ 	struct sna_dri2_event *info = NULL;
+ 	xf86CrtcPtr crtc;
+ 	CARD64 current_msc;
+-	union drm_wait_vblank vbl;
+ 	const struct ust_msc *swap;
+-	int pipe;
+ 
+ 	crtc = sna_dri2_get_crtc(draw);
+ 	DBG(("%s(pipe=%d, target_msc=%llu, divisor=%llu, rem=%llu)\n",
+-	     __FUNCTION__, crtc ? sna_crtc_to_pipe(crtc) : -1,
++	     __FUNCTION__, crtc ? sna_crtc_pipe(crtc) : -1,
+ 	     (long long)target_msc,
+ 	     (long long)divisor,
+ 	     (long long)remainder));
+ 
+ 	/* Drawable not visible, return immediately */
+ 	if (crtc == NULL)
+-		goto out_complete;
+-
+-	pipe = sna_crtc_to_pipe(crtc);
+-
+-	VG_CLEAR(vbl);
+-
+-	/* Get current count */
+-	vbl.request.type = _DRM_VBLANK_RELATIVE;
+-	vbl.request.sequence = 0;
+-	if (sna_wait_vblank(sna, &vbl, pipe))
+-		goto out_complete;
++		crtc = sna_primary_crtc(sna);
++	if (crtc == NULL)
++		return FALSE;
+ 
+-	current_msc = draw_current_msc(draw, crtc, sna_crtc_record_vblank(crtc, &vbl));
++	current_msc = get_current_msc(sna, draw, crtc);
+ 
+ 	/* If target_msc already reached or passed, set it to
+ 	 * current_msc to ensure we return a reasonable value back
+@@ -3104,15 +3559,13 @@ sna_dri2_schedule_wait_msc(ClientPtr client, DrawablePtr draw, CARD64 target_msc
+ 	if (divisor == 0 && current_msc >= target_msc)
+ 		goto out_complete;
+ 
+-	info = sna_dri2_add_event(sna, draw, client);
++	info = sna_dri2_add_event(sna, draw, client, crtc);
+ 	if (!info)
+ 		goto out_complete;
+ 
+ 	assert(info->crtc == crtc);
+ 	info->type = WAITMSC;
+ 
+-	vbl.request.signal = (uintptr_t)info;
+-	vbl.request.type = DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT;
+ 	/*
+ 	 * If divisor is zero, or current_msc is smaller than target_msc,
+ 	 * we just need to make sure target_msc passes before waking up the
+@@ -3129,10 +3582,8 @@ sna_dri2_schedule_wait_msc(ClientPtr client, DrawablePtr draw, CARD64 target_msc
+ 		if (target_msc <= current_msc)
+ 			target_msc += divisor;
+ 	}
+-	vbl.request.sequence = draw_target_seq(draw, target_msc);
+ 
+-	info->queued = true;
+-	if (sna_wait_vblank(sna, &vbl, pipe))
++	if (!sna_wait_vblank(info, draw_target_seq(draw, target_msc)))
+ 		goto out_free_info;
+ 
+ 	DRI2BlockClient(client, draw);
+@@ -3141,8 +3592,6 @@ sna_dri2_schedule_wait_msc(ClientPtr client, DrawablePtr draw, CARD64 target_msc
+ out_free_info:
+ 	sna_dri2_event_free(info);
+ out_complete:
+-	if (crtc == NULL)
+-		crtc = sna_mode_first_crtc(sna);
+ 	swap = sna_crtc_last_swap(crtc);
+ 	DRI2WaitMSCComplete(client, draw,
+ 			    draw_current_msc(draw, crtc, swap->msc),
+@@ -3231,9 +3680,18 @@ static bool is_level(const char **str)
+ 	return false;
+ }
+ 
++static const char *options_get_dri(struct sna *sna)
++{
++#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,7,99,901,0)
++	return xf86GetOptValString(sna->Options, OPTION_DRI);
++#else
++	return NULL;
++#endif
++}
++
+ static const char *dri_driver_name(struct sna *sna)
+ {
+-	const char *s = xf86GetOptValString(sna->Options, OPTION_DRI);
++	const char *s = options_get_dri(sna);
+ 
+ 	if (is_level(&s)) {
+ 		if (sna->kgem.gen < 030)
+@@ -3259,7 +3717,7 @@ bool sna_dri2_open(struct sna *sna, ScreenPtr screen)
+ 
+ 	if (wedged(sna)) {
+ 		xf86DrvMsg(sna->scrn->scrnIndex, X_WARNING,
+-			   "loading DRI2 whilst the GPU is wedged.\n");
++			   "loading DRI2 whilst acceleration is disabled.\n");
+ 	}
+ 
+ 	if (xf86LoaderCheckSymbol("DRI2Version"))
+@@ -3274,7 +3732,7 @@ bool sna_dri2_open(struct sna *sna, ScreenPtr screen)
+ 	memset(&info, '\0', sizeof(info));
+ 	info.fd = sna->kgem.fd;
+ 	info.driverName = dri_driver_name(sna);
+-	info.deviceName = intel_get_client_name(sna->dev);
++	info.deviceName = intel_get_master_name(sna->dev);
+ 
+ 	DBG(("%s: loading dri driver '%s' [gen=%d] for device '%s'\n",
+ 	     __FUNCTION__, info.driverName, sna->kgem.gen, info.deviceName));
+@@ -3299,11 +3757,12 @@ bool sna_dri2_open(struct sna *sna, ScreenPtr screen)
+ 	info.numDrivers = 2;
+ 	info.driverNames = driverNames;
+ 	driverNames[0] = info.driverName;
+-	driverNames[1] = info.driverName;
++	driverNames[1] = "va_gl";
+ #endif
+ 
+ #if DRI2INFOREC_VERSION >= 6
+ 	if (xorg_can_triple_buffer()) {
++		DBG(("%s: enabling Xorg triple buffering\n", __FUNCTION__));
+ 		info.version = 6;
+ 		info.SwapLimitValidate = sna_dri2_swap_limit_validate;
+ 		info.ReuseBufferNotify = sna_dri2_reuse_buffer;
+@@ -3311,8 +3770,10 @@ bool sna_dri2_open(struct sna *sna, ScreenPtr screen)
+ #endif
+ 
+ #if USE_ASYNC_SWAP
++	DBG(("%s: enabled async swap and buffer age\n", __FUNCTION__));
+ 	info.version = 10;
+ 	info.scheduleSwap0 = 1;
++	info.bufferAge = 1;
+ #endif
+ 
+ 	return DRI2ScreenInit(screen, &info);
+diff --git a/src/sna/sna_dri3.c b/src/sna/sna_dri3.c
+index f586e242..ce4970ae 100644
+--- a/src/sna/sna_dri3.c
++++ b/src/sna/sna_dri3.c
+@@ -55,11 +55,14 @@ static inline void mark_dri3_pixmap(struct sna *sna, struct sna_pixmap *priv, st
+ 	if (bo->exec)
+ 		sna->kgem.flush = 1;
+ 	if (bo == priv->gpu_bo)
+-		priv->flush |= 3;
++		priv->flush |= FLUSH_READ | FLUSH_WRITE;
+ 	else
+ 		priv->shm = true;
+ 
+-	sna_accel_watch_flush(sna, 1);
++	sna_watch_flush(sna, 1);
++
++	kgem_bo_submit(&sna->kgem, bo);
++	kgem_bo_unclean(&sna->kgem, bo);
+ }
+ 
+ static void sna_sync_flush(struct sna *sna, struct sna_pixmap *priv)
+@@ -270,6 +273,8 @@ static PixmapPtr sna_dri3_pixmap_from_fd(ScreenPtr screen,
+ 		priv->ptr = MAKE_STATIC_PTR(pixmap->devPrivate.ptr);
+ 	} else {
+ 		assert(priv->gpu_bo == bo);
++		priv->create = kgem_can_create_2d(&sna->kgem,
++						  width, height, depth);
+ 		priv->pinned |= PIN_DRI3;
+ 	}
+ 	list_add(&priv->cow_list, &sna->dri3.pixmaps);
+@@ -325,6 +330,15 @@ static int sna_dri3_fd_from_pixmap(ScreenPtr screen,
+ 		return -1;
+ 	}
+ 
++	if (bo->tiling && !sna->kgem.can_fence) {
++		if (!sna_pixmap_change_tiling(pixmap, I915_TILING_NONE)) {
++			DBG(("%s: unable to discard GPU tiling (%d) for DRI3 protocol\n",
++			     __FUNCTION__, bo->tiling));
++			return -1;
++		}
++		bo = priv->gpu_bo;
++	}
++
+ 	fd = kgem_bo_export_to_prime(&sna->kgem, bo);
+ 	if (fd == -1) {
+ 		DBG(("%s: exporting handle=%d to fd failed\n", __FUNCTION__, bo->handle));
+diff --git a/src/sna/sna_driver.c b/src/sna/sna_driver.c
+index 8a3599c7..1b4015de 100644
+--- a/src/sna/sna_driver.c
++++ b/src/sna/sna_driver.c
+@@ -57,6 +57,13 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include <mi.h>
+ #include <micmap.h>
+ 
++#if defined(HAVE_X11_EXTENSIONS_DPMSCONST_H)
++#include <X11/extensions/dpmsconst.h>
++#else
++#define DPMSModeOn 0
++#define DPMSModeOff 3
++#endif
++
+ #include <sys/ioctl.h>
+ #include <sys/fcntl.h>
+ #include <sys/poll.h>
+@@ -69,6 +76,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ #if HAVE_DOT_GIT
+ #include "git_version.h"
++#else
++#define git_version "not compiled from git"
+ #endif
+ 
+ #ifdef TEARFREE
+@@ -185,12 +194,12 @@ sna_set_fallback_mode(ScrnInfoPtr scrn)
+ 
+ 	xf86DisableUnusedFunctions(scrn);
+ #ifdef RANDR_12_INTERFACE
+-	if (get_root_window(scrn->pScreen))
+-		xf86RandR12TellChanged(scrn->pScreen);
++	if (get_root_window(xf86ScrnToScreen(scrn)))
++		xf86RandR12TellChanged(xf86ScrnToScreen(scrn));
+ #endif
+ }
+ 
+-static Bool sna_set_desired_mode(struct sna *sna)
++static void sna_set_desired_mode(struct sna *sna)
+ {
+ 	ScrnInfoPtr scrn = sna->scrn;
+ 
+@@ -203,7 +212,6 @@ static Bool sna_set_desired_mode(struct sna *sna)
+ 	}
+ 
+ 	sna_mode_check(sna);
+-	return TRUE;
+ }
+ 
+ /**
+@@ -222,7 +230,7 @@ static Bool sna_create_screen_resources(ScreenPtr screen)
+ 	     screen->width, screen->height, screen->rootDepth));
+ 
+ 	assert(sna->scrn == xf86ScreenToScrn(screen));
+-	assert(sna->scrn->pScreen == screen);
++	assert(to_screen_from_sna(sna) == screen);
+ 
+ 	/* free the data used during miInitScreen */
+ 	free(screen->devPrivate);
+@@ -273,33 +281,89 @@ static Bool sna_create_screen_resources(ScreenPtr screen)
+ 		if (serverGeneration == 1 && (sna->flags & SNA_IS_HOSTED) == 0)
+ 			sna_copy_fbcon(sna);
+ 
+-		(void)sna_set_desired_mode(sna);
++		sna_set_desired_mode(sna);
+ 	}
+ 
+ 	return TRUE;
+ }
+ 
+-static Bool sna_save_screen(ScreenPtr screen, int mode)
++static void sna_dpms_set(ScrnInfoPtr scrn, int mode, int flags)
+ {
+-	ScrnInfoPtr scrn = xf86ScreenToScrn(screen);
++	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(scrn);
++	struct sna *sna = to_sna(scrn);
++	bool changed = false;
++	int i;
+ 
+-	DBG(("%s(mode=%d)\n", __FUNCTION__, mode));
++	DBG(("%s(mode=%d, flags=%d), vtSema=%d => off?=%d\n",
++	     __FUNCTION__, mode, flags, scrn->vtSema, mode!=DPMSModeOn));
+ 	if (!scrn->vtSema)
+-		return FALSE;
++		return;
+ 
+-	xf86SaveScreen(screen, mode);
+-	sna_crtc_config_notify(screen);
+-	return TRUE;
++	/* Opencoded version of xf86DPMSSet().
++	 *
++	 * The principal difference is to skip calling crtc->dpms() when
++	 * turning off the display. This (on recent enough kernels at
++	 * least) should be equivalent in power consumption, but require
++	 * less work (hence quicker and less likely to fail) when switching
++	 * back on.
++	 */
++	if (mode != DPMSModeOn) {
++		if (sna->mode.hidden == 0 && !(sna->flags & SNA_NO_DPMS)) {
++			DBG(("%s: hiding %d outputs\n",
++			     __FUNCTION__, config->num_output));
++			for (i = 0; i < config->num_output; i++) {
++				xf86OutputPtr output = config->output[i];
++				if (output->crtc != NULL)
++					output->funcs->dpms(output, mode);
++			}
++			sna->mode.hidden = sna->mode.front_active + 1;
++			sna->mode.front_active = 0;
++			changed = true;
++		}
++	} else {
++		/* Re-enable CRTC that have been forced off via other means */
++		if (sna->mode.hidden != 0) {
++			DBG(("%s: unhiding %d crtc, %d outputs\n",
++			     __FUNCTION__, config->num_crtc, config->num_output));
++			sna->mode.front_active = sna->mode.hidden - 1;
++			sna->mode.hidden = 0;
++			for (i = 0; i < config->num_crtc; i++) {
++				xf86CrtcPtr crtc = config->crtc[i];
++				if (crtc->enabled)
++					crtc->funcs->dpms(crtc, mode);
++			}
++
++			for (i = 0; i < config->num_output; i++) {
++				xf86OutputPtr output = config->output[i];
++				if (output->crtc != NULL)
++					output->funcs->dpms(output, mode);
++			}
++			changed = true;
++		}
++	}
++
++	DBG(("%s: hiding outputs? %d, front active? %d, changed? %d\n",
++	     __FUNCTION__, sna->mode.hidden, sna->mode.front_active, changed));
++
++	if (changed)
++		sna_crtc_config_notify(xf86ScrnToScreen(scrn));
+ }
+ 
+-static void sna_dpms_set(ScrnInfoPtr scrn, int mode, int flags)
++static Bool sna_save_screen(ScreenPtr screen, int mode)
+ {
+-	DBG(("%s(mode=%d, flags=%d)\n", __FUNCTION__, mode));
+-	if (!scrn->vtSema)
+-		return;
++	ScrnInfoPtr scrn = xf86ScreenToScrn(screen);
++
++	DBG(("%s(mode=%d [unblank=%d])\n",
++	     __FUNCTION__, mode, xf86IsUnblank(mode)));
+ 
+-	xf86DPMSSet(scrn, mode, flags);
+-	sna_crtc_config_notify(xf86ScrnToScreen(scrn));
++	/* We have to unroll xf86SaveScreen() here as it is called
++	 * by DPMSSet() nullifying our special handling crtc->dpms()
++	 * in sna_dpms_set().
++	 */
++	sna_dpms_set(scrn,
++		     xf86IsUnblank(mode) ? DPMSModeOn : DPMSModeOff,
++		     0);
++	return TRUE;
+ }
+ 
+ static void sna_selftest(void)
+@@ -330,107 +394,6 @@ static void sna_setup_capabilities(ScrnInfoPtr scrn, int fd)
+ #endif
+ }
+ 
+-static int
+-namecmp(const char *s1, const char *s2)
+-{
+-	char c1, c2;
+-
+-	if (!s1 || *s1 == 0) {
+-		if (!s2 || *s2 == 0)
+-			return 0;
+-		else
+-			return 1;
+-	}
+-
+-	while (*s1 == '_' || *s1 == ' ' || *s1 == '\t')
+-		s1++;
+-
+-	while (*s2 == '_' || *s2 == ' ' || *s2 == '\t')
+-		s2++;
+-
+-	c1 = isupper(*s1) ? tolower(*s1) : *s1;
+-	c2 = isupper(*s2) ? tolower(*s2) : *s2;
+-	while (c1 == c2) {
+-		if (c1 == '\0')
+-			return 0;
+-
+-		s1++;
+-		while (*s1 == '_' || *s1 == ' ' || *s1 == '\t')
+-			s1++;
+-
+-		s2++;
+-		while (*s2 == '_' || *s2 == ' ' || *s2 == '\t')
+-			s2++;
+-
+-		c1 = isupper(*s1) ? tolower(*s1) : *s1;
+-		c2 = isupper(*s2) ? tolower(*s2) : *s2;
+-	}
+-
+-	return c1 - c2;
+-}
+-
+-static Bool sna_option_cast_to_bool(struct sna *sna, int id, Bool val)
+-{
+-	const char *str = xf86GetOptValString(sna->Options, id);
+-
+-	if (str == NULL)
+-		return val;
+-
+-	if (*str == '\0')
+-		return TRUE;
+-
+-	if (namecmp(str, "1") == 0)
+-		return TRUE;
+-	if (namecmp(str, "on") == 0)
+-		return TRUE;
+-	if (namecmp(str, "true") == 0)
+-		return TRUE;
+-	if (namecmp(str, "yes") == 0)
+-		return TRUE;
+-
+-	if (namecmp(str, "0") == 0)
+-		return FALSE;
+-	if (namecmp(str, "off") == 0)
+-		return FALSE;
+-	if (namecmp(str, "false") == 0)
+-		return FALSE;
+-	if (namecmp(str, "no") == 0)
+-		return FALSE;
+-
+-	return val;
+-}
+-
+-static unsigned sna_option_cast_to_unsigned(struct sna *sna, int id, unsigned val)
+-{
+-	const char *str = xf86GetOptValString(sna->Options, id);
+-	unsigned v;
+-
+-	if (str == NULL || *str == '\0')
+-		return val;
+-
+-	if (namecmp(str, "on") == 0)
+-		return val;
+-	if (namecmp(str, "true") == 0)
+-		return val;
+-	if (namecmp(str, "yes") == 0)
+-		return val;
+-
+-	if (namecmp(str, "0") == 0)
+-		return 0;
+-	if (namecmp(str, "off") == 0)
+-		return 0;
+-	if (namecmp(str, "false") == 0)
+-		return 0;
+-	if (namecmp(str, "no") == 0)
+-		return 0;
+-
+-	v = atoi(str);
+-	if (v)
+-		return v;
+-
+-	return val;
+-}
+-
+ static Bool fb_supports_depth(int fd, int depth)
+ {
+ 	struct drm_i915_gem_create create;
+@@ -470,16 +433,24 @@ static void setup_dri(struct sna *sna)
+ 	unsigned level;
+ 
+ 	sna->dri2.available = false;
++	sna->dri2.enable = false;
+ 	sna->dri3.available = false;
++	sna->dri3.enable = false;
++	sna->dri3.override = false;
+ 
+-	level = sna_option_cast_to_unsigned(sna, OPTION_DRI, ~0);
++	level = intel_option_cast_to_unsigned(sna->Options, OPTION_DRI, DEFAULT_DRI_LEVEL);
+ #if HAVE_DRI3
+-	if (level >= 3)
+-		sna->dri3.available = !!xf86LoadSubModule(sna->scrn, "dri3");
++	sna->dri3.available = !!xf86LoadSubModule(sna->scrn, "dri3");
++	sna->dri3.override =
++		!sna->dri3.available ||
++		xf86IsOptionSet(sna->Options, OPTION_DRI);
++	if (level >= 3 && sna->kgem.gen >= 040)
++		sna->dri3.enable = sna->dri3.available;
+ #endif
+ #if HAVE_DRI2
++	sna->dri2.available = !!xf86LoadSubModule(sna->scrn, "dri2");
+ 	if (level >= 2)
+-		sna->dri2.available = !!xf86LoadSubModule(sna->scrn, "dri2");
++		sna->dri2.enable = sna->dri2.available;
+ #endif
+ }
+ 
+@@ -498,13 +469,13 @@ static bool enable_tear_free(struct sna *sna)
+ 	return ENABLE_TEAR_FREE;
+ }
+ 
+-static void setup_tear_free(struct sna *sna)
++static bool setup_tear_free(struct sna *sna)
+ {
+ 	MessageType from;
+ 	Bool enable;
+ 
+ 	if (sna->flags & SNA_LINEAR_FB)
+-		return;
++		return false;
+ 
+ 	if ((sna->flags & SNA_HAS_FLIP) == 0) {
+ 		from = X_PROBED;
+@@ -518,11 +489,12 @@ static void setup_tear_free(struct sna *sna)
+ 		from = X_CONFIG;
+ 
+ 	if (enable)
+-		sna->flags |= SNA_TEAR_FREE;
++		sna->flags |= SNA_WANT_TEAR_FREE | SNA_TEAR_FREE;
+ 
+ done:
+ 	xf86DrvMsg(sna->scrn->scrnIndex, from, "TearFree %sabled\n",
+ 		   sna->flags & SNA_TEAR_FREE ? "en" : "dis");
++	return sna->flags & SNA_TEAR_FREE;
+ }
+ 
+ /**
+@@ -612,8 +584,10 @@ static Bool sna_pre_init(ScrnInfoPtr scrn, int probe)
+ 	}
+ 
+ 	intel_detect_chipset(scrn, sna->dev);
+-	xf86DrvMsg(scrn->scrnIndex, X_PROBED, "CPU: %s\n",
+-		   sna_cpu_features_to_string(sna->cpu_features, buf));
++	xf86DrvMsg(scrn->scrnIndex, X_PROBED,
++		   "CPU: %s; using a maximum of %d threads\n",
++		   sna_cpu_features_to_string(sna->cpu_features, buf),
++		   sna_use_threads(64*1024, 64*1024, 1));
+ 
+ 	if (!xf86SetDepthBpp(scrn, 24, 0, 0,
+ 			     Support32bppFb |
+@@ -651,18 +625,11 @@ static Bool sna_pre_init(ScrnInfoPtr scrn, int probe)
+ 	kgem_init(&sna->kgem, fd,
+ 		  xf86GetPciInfoForEntity(pEnt->index),
+ 		  sna->info->gen);
+-	if (xf86ReturnOptValBool(sna->Options, OPTION_ACCEL_DISABLE, FALSE) ||
+-	    !sna_option_cast_to_bool(sna, OPTION_ACCEL_METHOD, TRUE)) {
+-		xf86DrvMsg(sna->scrn->scrnIndex, X_CONFIG,
+-			   "Disabling hardware acceleration.\n");
+-		sna->kgem.wedged = true;
+-	}
+ 
+ 	if (xf86ReturnOptValBool(sna->Options, OPTION_TILING_FB, FALSE))
+ 		sna->flags |= SNA_LINEAR_FB;
+-
+-	if (xf86ReturnOptValBool(sna->Options, OPTION_DELETE_DP12, FALSE))
+-		sna->flags |= SNA_REMOVE_OUTPUTS;
++	if (!sna->kgem.can_fence)
++		sna->flags |= SNA_LINEAR_FB;
+ 
+ 	if (!xf86ReturnOptValBool(sna->Options, OPTION_SWAPBUFFERS_WAIT, TRUE))
+ 		sna->flags |= SNA_NO_WAIT;
+@@ -695,7 +662,8 @@ static Bool sna_pre_init(ScrnInfoPtr scrn, int probe)
+ 	}
+ 	scrn->currentMode = scrn->modes;
+ 
+-	setup_tear_free(sna);
++	if (!setup_tear_free(sna) && sna_mode_wants_tear_free(sna))
++		sna->kgem.needs_dirtyfb = sna->kgem.has_dirtyfb;
+ 
+ 	xf86SetGamma(scrn, zeros);
+ 	xf86SetDpi(scrn, 0, 0);
+@@ -721,11 +689,13 @@ cleanup:
+ 	return FALSE;
+ }
+ 
++#if !HAVE_NOTIFY_FD
+ static bool has_shadow(struct sna *sna)
+ {
+-	if (!sna->mode.shadow_damage)
++	if (!sna->mode.shadow_enabled)
+ 		return false;
+ 
++	assert(sna->mode.shadow_damage);
+ 	if (RegionNil(DamageRegion(sna->mode.shadow_damage)))
+ 		return false;
+ 
+@@ -748,7 +718,7 @@ sna_block_handler(BLOCKHANDLER_ARGS_DECL)
+ 	sna->BlockHandler(BLOCKHANDLER_ARGS);
+ 
+ 	if (*tv == NULL || ((*tv)->tv_usec | (*tv)->tv_sec) || has_shadow(sna))
+-		sna_accel_block_handler(sna, tv);
++		sna_accel_block(sna, tv);
+ }
+ 
+ static void
+@@ -770,52 +740,102 @@ sna_wakeup_handler(WAKEUPHANDLER_ARGS_DECL)
+ 
+ 	sna->WakeupHandler(WAKEUPHANDLER_ARGS);
+ 
+-	sna_accel_wakeup_handler(sna);
+-
+ 	if (FD_ISSET(sna->kgem.fd, (fd_set*)read_mask)) {
+ 		sna_mode_wakeup(sna);
+ 		/* Clear the flag so that subsequent ZaphodHeads don't block  */
+ 		FD_CLR(sna->kgem.fd, (fd_set*)read_mask);
+ 	}
+ }
++#else
++static void
++sna_block_handler(void *data, void *_timeout) /* block handler registered with data = struct sna; wraps sna_accel_block() */
++{
++	struct sna *sna = data;
++	int *timeout = _timeout; /* handled as milliseconds by the conversions below */
++	struct timeval tv, *tvp;
++
++	DBG(("%s (timeout=%d)\n", __FUNCTION__, *timeout));
++	if (*timeout == 0) /* a zero timeout skips sna_accel_block() entirely */
++		return;
++
++	if (*timeout < 0) { /* negative timeout is passed on as a NULL timeval */
++		tvp = NULL;
++	} else {
++		tv.tv_sec = *timeout / 1000;
++		tv.tv_usec = (*timeout % 1000) * 1000;
++		tvp = &tv;
++	}
++
++	sna_accel_block(sna, &tvp); /* receives &tvp, so it may change the pointer or the timeval */
++	if (tvp) /* write back whatever timeval is left, converted to milliseconds */
++		*timeout = tvp->tv_sec * 1000 + tvp->tv_usec / 1000;
++}
++#endif
+ 
+ #if HAVE_UDEV
++#include <sys/stat.h>
++
+ static void
+ sna_handle_uevents(int fd, void *closure)
+ {
+ 	struct sna *sna = closure;
+-	struct udev_device *dev;
+-	const char *str;
+ 	struct stat s;
+-	dev_t udev_devnum;
++	struct pollfd pfd;
++	bool hotplug = false; /* set when any drained event reports a hotplug for our device */
+ 
+ 	DBG(("%s\n", __FUNCTION__));
+ 
+-	dev = udev_monitor_receive_device(sna->uevent_monitor);
+-	if (!dev)
+-		return;
++	pfd.fd = udev_monitor_get_fd(sna->uevent_monitor); /* polls the monitor's own fd; the fd argument is not used */
++	pfd.events = POLLIN;
++
++	if (fstat(sna->kgem.fd, &s)) /* on failure s is zeroed before the st_rdev comparison below */
++		memset(&s, 0, sizeof(s));
++
++	while (poll(&pfd, 1, 0) > 0) { /* zero timeout: drain every pending uevent without blocking */
++		struct udev_device *dev;
++		dev_t devnum;
++
++		dev = udev_monitor_receive_device(sna->uevent_monitor);
++		if (dev == NULL)
++			break;
++
++		devnum = udev_device_get_devnum(dev);
++		if (memcmp(&s.st_rdev, &devnum, sizeof(dev_t)) == 0) { /* only events whose devnum equals our DRM fd's st_rdev */
++			const char *str;
++
++			str = udev_device_get_property_value(dev, "HOTPLUG");
++			if (str && atoi(str) == 1) {
++				str = udev_device_get_property_value(dev, "CONNECTOR");
++				if (str) { /* connector id supplied: let the mode code decide if it is a hotplug */
++					hotplug |= sna_mode_find_hotplug_connector(sna, atoi(str));
++				} else { /* no CONNECTOR property: flag SNA_REPROBE and treat as hotplug */
++					sna->flags |= SNA_REPROBE;
++					hotplug = true;
++				}
++			}
++		}
+ 
+-	udev_devnum = udev_device_get_devnum(dev);
+-	if (fstat(sna->kgem.fd, &s) || memcmp(&s.st_rdev, &udev_devnum, sizeof (dev_t))) {
+ 		udev_device_unref(dev);
+-		return;
+ 	}
+ 
+-	str = udev_device_get_property_value(dev, "HOTPLUG");
+-	if (str && atoi(str) == 1) {
+-		ScrnInfoPtr scrn = sna->scrn;
+-
+-		DBG(("%s: hotplug event (vtSema?=%d)\n", __FUNCTION__, scrn->vtSema));
++	if (hotplug) { /* act once after the loop, however many events were drained */
++		DBG(("%s: hotplug event (vtSema?=%d)\n",
++		     __FUNCTION__, sna->scrn->vtSema));
+ 
+-		if (scrn->vtSema) {
+-			sna_mode_discover(sna);
+-			sna_mode_check(sna);
+-			RRGetInfo(xf86ScrnToScreen(scrn), TRUE);
+-		} else
++		if (sna->scrn->vtSema) /* rediscover now only while vtSema is set */
++			sna_mode_discover(sna, true);
++		else /* otherwise only record SNA_REPROBE */
+ 			sna->flags |= SNA_REPROBE;
+ 	}
++}
+ 
+-	udev_device_unref(dev);
++static bool has_randr(void) /* true when the RandR private key (rrPrivKey) is registered / non-zero */
++{
++#if HAS_DIXREGISTERPRIVATEKEY
++	return dixPrivateKeyRegistered(rrPrivKey); /* ask dix whether the key is registered */
++#else
++	return *rrPrivKey; /* otherwise read the key's value directly */
++#endif
+ }
+ 
+ static void
+@@ -833,7 +853,7 @@ sna_uevent_init(struct sna *sna)
+ 	/* RandR will be disabled if Xinerama is active, and so generating
+ 	 * RR hotplug events is then verboten.
+ 	 */
+-	if (!dixPrivateKeyRegistered(rrPrivKey))
++	if (!has_randr())
+ 		goto out;
+ 
+ 	u = NULL;
+@@ -861,7 +881,8 @@ sna_uevent_init(struct sna *sna)
+ 
+ 	sna->uevent_monitor = mon;
+ out:
+-	xf86DrvMsg(sna->scrn->scrnIndex, from, "display hotplug detection %s\n",
++	xf86DrvMsg(sna->scrn->scrnIndex, from,
++		   "Display hotplug detection %s\n",
+ 		   sna->uevent_monitor ? "enabled" : "disabled");
+ 	return;
+ 
+@@ -874,17 +895,10 @@ err_dev:
+ 
+ static bool sna_uevent_poll(struct sna *sna)
+ {
+-	struct pollfd pfd;
+-
+ 	if (sna->uevent_monitor == NULL)
+ 		return false;
+ 
+-	pfd.fd = udev_monitor_get_fd(sna->uevent_monitor);
+-	pfd.events = POLLIN;
+-
+-	while (poll(&pfd, 1, 0) > 0)
+-		sna_handle_uevents(pfd.fd, sna);
+-
++	sna_handle_uevents(udev_monitor_get_fd(sna->uevent_monitor), sna);
+ 	return true;
+ }
+ 
+@@ -918,8 +932,10 @@ sna_randr_getinfo(ScreenPtr screen, Rotation *rotations)
+ {
+ 	struct sna *sna = to_sna_from_screen(screen);
+ 
++	DBG(("%s()\n", __FUNCTION__));
++
+ 	if (!sna_uevent_poll(sna))
+-		sna_mode_discover(sna);
++		sna_mode_discover(sna, false);
+ 
+ 	return sna->mode.rrGetInfo(screen, rotations);
+ }
+@@ -931,8 +947,8 @@ static void sna_leave_vt(VT_FUNC_ARGS_DECL)
+ 
+ 	DBG(("%s\n", __FUNCTION__));
+ 
+-	sna_accel_leave(sna);
+ 	sna_mode_reset(sna);
++	sna_accel_leave(sna);
+ 
+ 	if (intel_put_master(sna->dev))
+ 		xf86DrvMsg(scrn->scrnIndex, X_WARNING,
+@@ -948,6 +964,12 @@ static Bool sna_early_close_screen(CLOSE_SCREEN_ARGS_DECL)
+ 
+ 	/* XXX Note that we will leak kernel resources if !vtSema */
+ 
++#if HAVE_NOTIFY_FD
++	RemoveBlockAndWakeupHandlers(sna_block_handler,
++				     (ServerWakeupHandlerProcPtr)NoopDDA,
++				     sna);
++#endif
++
+ 	sna_uevent_fini(sna);
+ 	sna_mode_close(sna);
+ 
+@@ -1047,12 +1069,13 @@ static void sna_dri_init(struct sna *sna, ScreenPtr screen)
+ {
+ 	char str[128] = "";
+ 
+-	if (sna->dri2.available)
++	if (sna->dri2.enable)
+ 		sna->dri2.open = sna_dri2_open(sna, screen);
+ 	if (sna->dri2.open)
+ 		strcat(str, "DRI2 ");
+ 
+-	if (sna->dri3.available)
++	/* Load DRI3 in case DRI2 doesn't work, e.g. vgaarb */
++	if (sna->dri3.enable || (!sna->dri2.open && !sna->dri3.override))
+ 		sna->dri3.open = sna_dri3_open(sna, screen);
+ 	if (sna->dri3.open)
+ 		strcat(str, "DRI3 ");
+@@ -1098,7 +1121,8 @@ sna_screen_init(SCREEN_INIT_ARGS_DECL)
+ 	DBG(("%s\n", __FUNCTION__));
+ 
+ 	assert(sna->scrn == scrn);
+-	assert(scrn->pScreen == NULL); /* set afterwards */
++	assert(to_screen_from_sna(sna) == NULL || /* set afterwards */
++	       to_screen_from_sna(sna) == screen);
+ 
+ 	assert(sna->freed_pixmap == NULL);
+ 
+@@ -1166,11 +1190,17 @@ sna_screen_init(SCREEN_INIT_ARGS_DECL)
+ 	 * later memory should be bound when allocating, e.g rotate_mem */
+ 	scrn->vtSema = TRUE;
+ 
++#if !HAVE_NOTIFY_FD
+ 	sna->BlockHandler = screen->BlockHandler;
+ 	screen->BlockHandler = sna_block_handler;
+ 
+ 	sna->WakeupHandler = screen->WakeupHandler;
+ 	screen->WakeupHandler = sna_wakeup_handler;
++#else
++	RegisterBlockAndWakeupHandlers(sna_block_handler,
++				       (ServerWakeupHandlerProcPtr)NoopDDA,
++				       sna);
++#endif
+ 
+ 	screen->SaveScreen = sna_save_screen;
+ 	screen->CreateScreenResources = sna_create_screen_resources;
+@@ -1190,6 +1220,8 @@ sna_screen_init(SCREEN_INIT_ARGS_DECL)
+ 				 CMAP_PALETTED_TRUECOLOR))
+ 		return FALSE;
+ 
++	if (!xf86CheckBoolOption(scrn->options, "dpms", TRUE))
++		sna->flags |= SNA_NO_DPMS;
+ 	xf86DPMSInit(screen, sna_dpms_set, 0);
+ 
+ 	sna_uevent_init(sna);
+@@ -1244,20 +1276,15 @@ static Bool sna_enter_vt(VT_FUNC_ARGS_DECL)
+ 	if (intel_get_master(sna->dev))
+ 		return FALSE;
+ 
++	sna_accel_enter(sna);
++
+ 	if (sna->flags & SNA_REPROBE) {
+-		DBG(("%s: reporting deferred hotplug event\n",
+-		     __FUNCTION__));
+-		sna_mode_discover(sna);
+-		RRGetInfo(xf86ScrnToScreen(scrn), TRUE);
+-		sna->flags &= ~SNA_REPROBE;
++		DBG(("%s: reporting deferred hotplug event\n", __FUNCTION__));
++		sna_mode_discover(sna, true);
+ 	}
+ 
+-	if (!sna_set_desired_mode(sna)) {
+-		intel_put_master(sna->dev);
+-		return FALSE;
+-	}
++	sna_set_desired_mode(sna);
+ 
+-	sna_accel_enter(sna);
+ 	return TRUE;
+ }
+ 
+@@ -1379,6 +1406,9 @@ static void describe_sna(ScrnInfoPtr scrn)
+ 	xf86DrvMsg(scrn->scrnIndex, X_INFO,
+ 		   "SNA compiled: %s\n", BUILDER_DESCRIPTION);
+ #endif
++#if HAS_DEBUG_FULL
++	ErrorF("SNA compiled with full debug logging; expect to run slowly\n");
++#endif
+ #if !NDEBUG
+ 	xf86DrvMsg(scrn->scrnIndex, X_INFO,
+ 		   "SNA compiled with assertions enabled\n");
+@@ -1400,6 +1430,7 @@ static void describe_sna(ScrnInfoPtr scrn)
+ 		   "SNA compiled for use with valgrind\n");
+ 	VALGRIND_PRINTF("SNA compiled for use with valgrind\n");
+ #endif
++	DBG(("xf86-video-intel version: %s\n", git_version));
+ 	DBG(("pixman version: %s\n", pixman_version_string()));
+ }
+ 
+diff --git a/src/sna/sna_glyphs.c b/src/sna/sna_glyphs.c
+index a5dfb06b..6ee40336 100644
+--- a/src/sna/sna_glyphs.c
++++ b/src/sna/sna_glyphs.c
+@@ -74,7 +74,7 @@
+ #define NO_GLYPHS_VIA_MASK 0
+ #define FORCE_SMALL_MASK 0 /* -1 = never, 1 = always */
+ #define NO_GLYPHS_SLOW 0
+-#define NO_DISCARD_MASK 0
++#define DISCARD_MASK 0 /* -1 = never, 1 = always */
+ 
+ #define CACHE_PICTURE_SIZE 1024
+ #define GLYPH_MIN_SIZE 8
+@@ -185,7 +185,7 @@ void sna_glyphs_close(struct sna *sna)
+  */
+ bool sna_glyphs_create(struct sna *sna)
+ {
+-	ScreenPtr screen = sna->scrn->pScreen;
++	ScreenPtr screen = to_screen_from_sna(sna);
+ 	pixman_color_t white = { 0xffff, 0xffff, 0xffff, 0xffff };
+ 	unsigned int formats[] = {
+ 		PIXMAN_a8,
+@@ -1094,6 +1094,9 @@ sna_glyph_get_image(GlyphPtr g, ScreenPtr s)
+ 
+ static inline bool use_small_mask(struct sna *sna, int16_t width, int16_t height, int depth)
+ {
++	if (depth < 8)
++		return true;
++
+ 	if (FORCE_SMALL_MASK)
+ 		return FORCE_SMALL_MASK > 0;
+ 
+@@ -1156,12 +1159,6 @@ glyphs_via_mask(struct sna *sna,
+ 	src_x += box.x1 - list->xOff;
+ 	src_y += box.y1 - list->yOff;
+ 
+-	if (format->depth < 8) {
+-		format = PictureMatchFormat(screen, 8, PICT_a8);
+-		if (!format)
+-			return false;
+-	}
+-
+ 	component_alpha = NeedsComponent(format->format);
+ 	if (use_small_mask(sna, width, height, format->depth)) {
+ 		pixman_image_t *mask_image;
+@@ -1179,7 +1176,7 @@ use_small_mask:
+ 			return false;
+ 
+ 		mask_image =
+-			pixman_image_create_bits(format->depth << 24 | format->format,
++			pixman_image_create_bits(pixmap->drawable.bitsPerPixel << 24 | format->format,
+ 						 width, height,
+ 						 pixmap->devPrivate.ptr,
+ 						 pixmap->devKind);
+@@ -1386,10 +1383,11 @@ next_image:
+ 					DBG(("%s: atlas format=%08x, mask format=%08x\n",
+ 					     __FUNCTION__,
+ 					     (int)p->atlas->format,
+-					     (int)(format->depth << 24 | format->format)));
++					     (int)mask->format));
+ 
+ 					memset(&tmp, 0, sizeof(tmp));
+-					if (p->atlas->format == (format->depth << 24 | format->format)) {
++					if (p->atlas->format == mask->format ||
++					    alphaless(p->atlas->format) == mask->format) {
+ 						ok = sna->render.composite(sna, PictOpAdd,
+ 									   p->atlas, NULL, mask,
+ 									   0, 0, 0, 0, 0, 0,
+@@ -1561,6 +1559,9 @@ skip_glyph:
+ 		}
+ 	}
+ 
++	assert(format);
++	DBG(("%s: format=%08d, depth=%d\n",
++	     __FUNCTION__, format->format, format->depth));
+ out:
+ 	if (list_extents != stack_extents)
+ 		free(list_extents);
+@@ -1573,24 +1574,34 @@ static bool can_discard_mask(uint8_t op, PicturePtr src, PictFormatPtr mask,
+ 	PictFormatPtr g;
+ 	uint32_t color;
+ 
+-	if (NO_DISCARD_MASK)
+-		return false;
++	if (DISCARD_MASK)
++		return DISCARD_MASK > 0;
+ 
+ 	DBG(("%s: nlist=%d, mask=%08x, depth %d, op=%d (bounded? %d)\n",
+ 	     __FUNCTION__, nlist,
+ 	     mask ? (unsigned)mask->format : 0, mask ? mask->depth : 0,
+ 	     op, op_is_bounded(op)));
+ 
+-	if (nlist == 1 && list->len == 1)
+-		return true;
++	if (nlist == 1 && list->len == 1) {
++		if (mask == list->format)
++			return true;
++
++		g = list->format;
++		goto skip;
++	}
+ 
+-	if (!op_is_bounded(op))
++	if (!op_is_bounded(op)) {
++		DBG(("%s: unbounded op, not discarding\n", __FUNCTION__));
+ 		return false;
++	}
+ 
+ 	/* No glyphs overlap and we are not performing a mask conversion. */
+ 	g = glyphs_format(nlist, list, glyphs);
+-	if (mask == g)
++	if (mask == g) {
++		DBG(("%s: mask matches glyphs format, no conversion, so discard mask\n",
++		     __FUNCTION__));
+ 		return true;
++	}
+ 
+ 	DBG(("%s: preferred mask format %08x, depth %d\n",
+ 	     __FUNCTION__, g ? (unsigned)g->format : 0,  g ? g->depth : 0));
+@@ -1605,18 +1616,41 @@ static bool can_discard_mask(uint8_t op, PicturePtr src, PictFormatPtr mask,
+ 
+ 			list++;
+ 		}
++
++		if (!sna_picture_is_solid(src, &color))
++			return false;
++
++		return color >> 24 == 0xff;
+ 	} else {
+-		if (PICT_FORMAT_A(mask->format) >= PICT_FORMAT_A(g->format))
++skip:
++		if (mask->format == g->format)
+ 			return true;
+ 
+-		if (g->depth != 1)
+-			return false;
+-	}
++		if (mask->format == alphaless(g->format))
++			return true;
++
++		if (PICT_FORMAT_TYPE(g->format) == PICT_TYPE_A &&
++		    PICT_FORMAT_TYPE(mask->format) != PICT_TYPE_A)
++			return true;
+ 
+-	if (!sna_picture_is_solid(src, &color))
+ 		return false;
++	}
++}
+ 
+-	return color >> 24 == 0xff;
++static uint32_t pixman_format(PictFormatPtr short_format) /* builds a code with a rounded-up bpp in bits 24+ OR'ed with ->format */
++{
++	uint32_t bpp;
++
++	bpp = short_format->depth; /* start from the depth, then round up to 1, 8, 16 or 32 */
++	if (bpp <= 1)
++		bpp = 1;
++	else if (bpp <= 8)
++		bpp = 8;
++	else if (bpp <= 16)
++		bpp = 16;
++	else
++		bpp = 32;
++	return bpp << 24 | short_format->format; /* assumes ->format leaves bits 24+ clear — TODO confirm */
+ }
+ 
+ static void
+@@ -1756,7 +1790,7 @@ next:
+ 		if (sigtrap_get() == 0) {
+ 			if (mask_format) {
+ 				pixman_composite_glyphs(op, src_image, dst_image,
+-							mask_format->format | (mask_format->depth << 24),
++							pixman_format(mask_format),
+ 							src_x + src_dx + region.extents.x1 - dst_x,
+ 							src_y + src_dy + region.extents.y1 - dst_y,
+ 							region.extents.x1, region.extents.y1,
+@@ -1815,10 +1849,10 @@ out:
+ 			     x, y,
+ 			     mask_format->depth,
+ 			     (long)mask_format->format,
+-			     (long)(mask_format->depth << 24 | mask_format->format),
++			     (long)pixman_format(mask_format),
+ 			     NeedsComponent(mask_format->format)));
+ 			mask_image =
+-				pixman_image_create_bits(mask_format->depth << 24 | mask_format->format,
++				pixman_image_create_bits(pixman_format(mask_format),
+ 							 region.extents.x2 - region.extents.x1,
+ 							 region.extents.y2 - region.extents.y1,
+ 							 NULL, 0);
+@@ -2086,12 +2120,6 @@ glyphs_via_image(struct sna *sna,
+ 	src_x += box.x1 - list->xOff;
+ 	src_y += box.y1 - list->yOff;
+ 
+-	if (format->depth < 8) {
+-		format = PictureMatchFormat(screen, 8, PICT_a8);
+-		if (!format)
+-			return false;
+-	}
+-
+ 	DBG(("%s: small mask [format=%lx, depth=%d, size=%d], rendering glyphs to upload buffer\n",
+ 	     __FUNCTION__, (unsigned long)format->format,
+ 	     format->depth, (uint32_t)width*height*format->depth));
+@@ -2104,7 +2132,7 @@ glyphs_via_image(struct sna *sna,
+ 		return false;
+ 
+ 	mask_image =
+-		pixman_image_create_bits(format->depth << 24 | format->format,
++		pixman_image_create_bits(pixmap->drawable.bitsPerPixel << 24 | format->format,
+ 					 width, height,
+ 					 pixmap->devPrivate.ptr,
+ 					 pixmap->devKind);
+diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
+index d6aa1294..d32bd583 100644
+--- a/src/sna/sna_io.c
++++ b/src/sna/sna_io.c
+@@ -105,8 +105,10 @@ read_boxes_inplace__cpu(struct kgem *kgem,
+ 	if (!download_inplace__cpu(kgem, dst, bo, box, n))
+ 		return false;
+ 
++	if (bo->tiling == I915_TILING_Y)
++		return false;
++
+ 	assert(kgem_bo_can_map__cpu(kgem, bo, false));
+-	assert(bo->tiling != I915_TILING_Y);
+ 
+ 	src = kgem_bo_map__cpu(kgem, bo);
+ 	if (src == NULL)
+@@ -281,6 +283,9 @@ fallback:
+ 		if (box[n].y2 > extents.y2)
+ 			extents.y2 = box[n].y2;
+ 	}
++	if (!can_blt && sna->render.max_3d_size == 0)
++		goto fallback;
++
+ 	if (kgem_bo_can_map(kgem, src_bo)) {
+ 		/* Is it worth detiling? */
+ 		if ((extents.y2 - extents.y1 - 1) * src_bo->pitch < 4096)
+@@ -477,6 +482,7 @@ fallback:
+ 			goto fallback;
+ 		_kgem_set_mode(kgem, KGEM_BLT);
+ 	}
++	kgem_bcs_set_tiling(&sna->kgem, src_bo, NULL);
+ 
+ 	tmp_nbox = nbox;
+ 	tmp_box = box;
+@@ -539,6 +545,7 @@ fallback:
+ 				break;
+ 
+ 			_kgem_set_mode(kgem, KGEM_BLT);
++			kgem_bcs_set_tiling(&sna->kgem, src_bo, NULL);
+ 			tmp_box += nbox_this_time;
+ 		} while (1);
+ 	} else {
+@@ -597,6 +604,7 @@ fallback:
+ 				break;
+ 
+ 			_kgem_set_mode(kgem, KGEM_BLT);
++			kgem_bcs_set_tiling(&sna->kgem, src_bo, NULL);
+ 			tmp_box += nbox_this_time;
+ 		} while (1);
+ 	}
+@@ -666,8 +674,10 @@ write_boxes_inplace__tiled(struct kgem *kgem,
+ {
+ 	uint8_t *dst;
+ 
++	if (bo->tiling == I915_TILING_Y)
++		return false;
++
+ 	assert(kgem->has_wc_mmap || kgem_bo_can_map__cpu(kgem, bo, true));
+-	assert(bo->tiling != I915_TILING_Y);
+ 
+ 	if (kgem_bo_can_map__cpu(kgem, bo, true)) {
+ 		dst = kgem_bo_map__cpu(kgem, bo);
+@@ -778,6 +788,15 @@ static bool __upload_inplace(struct kgem *kgem,
+ 	if (FORCE_INPLACE)
+ 		return FORCE_INPLACE > 0;
+ 
++	if (bo->exec)
++		return false;
++
++	if (bo->flush)
++		return true;
++
++	if (kgem_bo_can_map__cpu(kgem, bo, true))
++		return true;
++
+ 	/* If we are writing through the GTT, check first if we might be
+ 	 * able to almagamate a series of small writes into a single
+ 	 * operation.
+@@ -849,6 +868,8 @@ bool sna_write_boxes(struct sna *sna, PixmapPtr dst,
+ 		if (box[n].y2 > extents.y2)
+ 			extents.y2 = box[n].y2;
+ 	}
++	if (!can_blt && sna->render.max_3d_size == 0)
++		goto fallback;
+ 
+ 	/* Try to avoid switching rings... */
+ 	if (!can_blt || kgem->ring == KGEM_RENDER ||
+@@ -1038,6 +1059,7 @@ tile:
+ 			goto fallback;
+ 		_kgem_set_mode(kgem, KGEM_BLT);
+ 	}
++	kgem_bcs_set_tiling(&sna->kgem, NULL, dst_bo);
+ 
+ 	if (kgem->gen >= 0100) {
+ 		cmd |= 8;
+@@ -1129,6 +1151,7 @@ tile:
+ 			if (nbox) {
+ 				_kgem_submit(kgem);
+ 				_kgem_set_mode(kgem, KGEM_BLT);
++				kgem_bcs_set_tiling(&sna->kgem, NULL, dst_bo);
+ 			}
+ 
+ 			kgem_bo_destroy(kgem, src_bo);
+@@ -1224,6 +1247,7 @@ tile:
+ 			if (nbox) {
+ 				_kgem_submit(kgem);
+ 				_kgem_set_mode(kgem, KGEM_BLT);
++				kgem_bcs_set_tiling(&sna->kgem, NULL, dst_bo);
+ 			}
+ 
+ 			kgem_bo_destroy(kgem, src_bo);
+@@ -1541,6 +1565,7 @@ tile:
+ 			goto fallback;
+ 		_kgem_set_mode(kgem, KGEM_BLT);
+ 	}
++	kgem_bcs_set_tiling(&sna->kgem, NULL, dst_bo);
+ 
+ 	if (sna->kgem.gen >= 0100) {
+ 		cmd |= 8;
+@@ -1636,6 +1661,7 @@ tile:
+ 			if (nbox) {
+ 				_kgem_submit(kgem);
+ 				_kgem_set_mode(kgem, KGEM_BLT);
++				kgem_bcs_set_tiling(&sna->kgem, NULL, dst_bo);
+ 			}
+ 
+ 			kgem_bo_destroy(kgem, src_bo);
+@@ -1732,6 +1758,7 @@ tile:
+ 			if (nbox) {
+ 				_kgem_submit(kgem);
+ 				_kgem_set_mode(kgem, KGEM_BLT);
++				kgem_bcs_set_tiling(&sna->kgem, NULL, dst_bo);
+ 			}
+ 
+ 			kgem_bo_destroy(kgem, src_bo);
+diff --git a/src/sna/sna_present.c b/src/sna/sna_present.c
+index 6dd6fe88..2796d972 100644
+--- a/src/sna/sna_present.c
++++ b/src/sna/sna_present.c
+@@ -27,6 +27,7 @@
+ #include <sys/types.h>
+ #include <fcntl.h>
+ #include <unistd.h>
++#include <sys/poll.h>
+ #include <errno.h>
+ #include <xf86drm.h>
+ 
+@@ -38,21 +39,73 @@
+ static present_screen_info_rec present_info;
+ 
+ struct sna_present_event {
+-	uint64_t event_id;
+ 	xf86CrtcPtr crtc;
++	struct sna *sna; /* driver instance; used for vblank ioctls and by info_free() */
++	struct list link; /* list linkage; unlinked with list_del() in vblank_complete() */
++	uint64_t *event_id; /* array of n_event_id ids; free()d separately when n_event_id > 1 */
++	uint64_t target_msc; /* vblank sequence number at which the event should complete */
++	int n_event_id; /* number of entries in event_id */
++	bool queued; /* set once a DRM vblank event has been requested for this info */
+ };
+ 
++static void sna_present_unflip(ScreenPtr screen, uint64_t event_id);
++static bool sna_present_queue(struct sna_present_event *info,
++			      uint64_t last_msc);
++
+ static inline struct sna_present_event *
+ to_present_event(uintptr_t  data)
+ {
+ 	return (struct sna_present_event *)(data & ~3);
+ }
+ 
++static struct sna_present_event *info_alloc(struct sna *sna) /* returns an uninitialised event record, or NULL if malloc fails */
++{
++	struct sna_present_event *info;
++
++	info = sna->present.freed_info; /* single-entry cache filled by info_free() */
++	if (info) {
++		sna->present.freed_info = NULL;
++		return info;
++	}
++
++	return malloc(sizeof(struct sna_present_event) + sizeof(uint64_t)); /* extra uint64_t: presumably inline storage for one event id — confirm at callers */
++}
++
++static void info_free(struct sna_present_event *info) /* parks info in the one-slot cache instead of freeing it at once */
++{
++	struct sna *sna = info->sna;
++
++	if (sna->present.freed_info) /* slot already occupied: release the older record */
++		free(sna->present.freed_info);
++
++	sna->present.freed_info = info; /* info itself stays allocated for reuse by info_alloc() */
++}
++
++static inline bool msc_before(uint64_t msc, uint64_t target) /* true when msc is behind target */
++{
++	return (int64_t)(msc - target) < 0; /* sign of the unsigned difference, so the test tolerates counter wrap */
++}
++
+ #define MARK_PRESENT(x) ((void *)((uintptr_t)(x) | 2))
+ 
+-static int pipe_from_crtc(RRCrtcPtr crtc)
++static inline xf86CrtcPtr unmask_crtc(xf86CrtcPtr crtc) /* strips the bit 0 tag set by mark_crtc() */
++{
++	return (xf86CrtcPtr)((uintptr_t)crtc & ~1);
++}
++
++static inline xf86CrtcPtr mark_crtc(xf86CrtcPtr crtc) /* tags the pointer by setting bit 0; tested by has_vblank() */
++{
++	return (xf86CrtcPtr)((uintptr_t)crtc | 1); /* result is not dereferenceable until unmask_crtc() */
++}
++
++static inline bool has_vblank(xf86CrtcPtr crtc) /* true when the pointer carries the bit 0 tag from mark_crtc() */
++{
++	return (uintptr_t)crtc & 1;
++}
++
++static inline int pipe_from_crtc(RRCrtcPtr crtc) /* pipe index for a RandR crtc, or -1 when crtc is NULL */
+ {
+-	return crtc ? sna_crtc_to_pipe(crtc->devPrivate) : -1;
++	return crtc ? sna_crtc_pipe(crtc->devPrivate) : -1; /* devPrivate is passed where the xf86 crtc is expected */
+ }
+ 
+ static uint32_t pipe_select(int pipe)
+@@ -74,6 +127,215 @@ static inline int sna_wait_vblank(struct sna *sna, union drm_wait_vblank *vbl, i
+ 	return drmIoctl(sna->kgem.fd, DRM_IOCTL_WAIT_VBLANK, vbl);
+ }
+ 
++static uint64_t gettime_ust64(void) /* current time as a 64-bit microsecond stamp */
++{
++	struct timespec tv;
++
++	if (clock_gettime(CLOCK_MONOTONIC, &tv)) /* non-zero return means failure: use the server's clock instead */
++		return GetTimeInMicros();
++
++	return ust64(tv.tv_sec, tv.tv_nsec / 1000); /* nanoseconds reduced to microseconds */
++}
++
++static void vblank_complete(struct sna_present_event *info, /* notifies every event id in info, then recycles info */
++			    uint64_t ust, uint64_t msc) /* ust/msc: timestamp and vblank count reported to Present */
++{
++	int n;
++
++	if (msc_before(msc, info->target_msc)) { /* fired before the target: try to queue again */
++		DBG(("%s: event=%lld too early, now %lld, expected %lld\n",
++		     __FUNCTION__,
++		     (long long)info->event_id[0],
++		     (long long)msc, (long long)info->target_msc));
++		if (sna_present_queue(info, msc)) /* requeued: info stays live, nothing is notified yet */
++			return;
++	}
++
++	DBG(("%s: %d events complete\n", __FUNCTION__, info->n_event_id));
++	for (n = 0; n < info->n_event_id; n++) {
++		DBG(("%s: pipe=%d tv=%d.%06d msc=%lld (target=%lld), event=%lld complete%s\n", __FUNCTION__,
++		     sna_crtc_pipe(info->crtc),
++		     (int)(ust / 1000000), (int)(ust % 1000000),
++		     (long long)msc, (long long)info->target_msc,
++		     (long long)info->event_id[n],
++		     info->target_msc && msc == (uint32_t)info->target_msc ? "" : ": MISS"));
++		present_event_notify(info->event_id[n], ust, msc);
++	}
++	if (info->n_event_id > 1) /* a single id is not freed separately */
++		free(info->event_id);
++	list_del(&info->link);
++	info_free(info); /* parks info for reuse; do not touch it afterwards */
++}
++
++static uint32_t msc_to_delay(xf86CrtcPtr crtc, uint64_t target) /* estimated wait until vblank `target`; logged and used as milliseconds */
++{
++	const DisplayModeRec *mode = &crtc->desiredMode;
++	const struct ust_msc *swap = sna_crtc_last_swap(crtc);
++	int64_t delay, subframe;
++
++	assert(mode->Clock); /* used as a divisor below */
++
++	delay = target - swap->msc; /* frames remaining since the last recorded swap */
++	assert(delay >= 0);
++	if (delay > 1) { /* try to use the hw vblank for the last frame */
++		delay--;
++		subframe = 0;
++	} else {
++		subframe = gettime_ust64() - swap_ust(swap); /* microseconds elapsed since the last swap */
++		subframe += 500; /* round to nearest when dividing by 1000 */
++		subframe /= 1000;
++	}
++	delay *= mode->VTotal * mode->HTotal / mode->Clock; /* integer frame period; presumably ms with Clock in kHz — confirm */
++	if (subframe < delay)
++		delay -= subframe;
++	else
++		delay = 0;
++
++	DBG(("%s: sleep %d frames, %llu ms\n", __FUNCTION__,
++	     (int)(target - swap->msc), (long long)delay));
++	assert(delay >= 0);
++	return MIN(delay, INT32_MAX); /* clamp before narrowing to uint32_t */
++}
++
++static CARD32 sna_fake_vblank_handler(OsTimerPtr timer, CARD32 now, void *data) /* timer callback armed by sna_fake_vblank(); data is the sna_present_event */
++{
++	struct sna_present_event *info = data;
++	union drm_wait_vblank vbl;
++	uint64_t msc, ust;
++
++	DBG(("%s(event=%lldx%d, now=%d)\n", __FUNCTION__, (long long)info->event_id[0], info->n_event_id, now));
++
++	VG_CLEAR(vbl);
++	vbl.request.type = DRM_VBLANK_RELATIVE; /* relative wait of 0: the reply supplies the current ust/msc */
++	vbl.request.sequence = 0;
++	if (sna_wait_vblank(info->sna, &vbl, sna_crtc_pipe(info->crtc)) == 0) {
++		ust = ust64(vbl.reply.tval_sec, vbl.reply.tval_usec);
++		msc = sna_crtc_record_vblank(info->crtc, &vbl);
++		DBG(("%s: event=%lld, target msc=%lld, now %lld\n",
++		     __FUNCTION__, (long long)info->event_id[0], (long long)info->target_msc, (long long)msc));
++		if (msc_before(msc, info->target_msc)) { /* timer fired before the target vblank */
++			int delta = info->target_msc - msc;
++			uint32_t delay;
++
++			DBG(("%s: too early, requeuing delta=%d\n", __FUNCTION__, delta));
++			assert(info->target_msc - msc < 1ull<<31);
++			if (delta <= 2) { /* close to the target: request a real DRM vblank event */
++				vbl.request.type = DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT;
++				vbl.request.sequence = info->target_msc;
++				vbl.request.signal = (uintptr_t)MARK_PRESENT(info);
++				if (sna_wait_vblank(info->sna, &vbl, sna_crtc_pipe(info->crtc)) == 0) {
++					DBG(("%s: scheduled new vblank event for %lld\n", __FUNCTION__, (long long)info->target_msc));
++					info->queued = true;
++					if (delta == 1) { /* same marking as in sna_present_queue() */
++						sna_crtc_set_vblank(info->crtc);
++						info->crtc = mark_crtc(info->crtc);
++					}
++					free(timer); /* timer is done; info is left live, presumably for the DRM event handler — confirm */
++					return 0;
++				}
++			}
++
++			delay = msc_to_delay(info->crtc, info->target_msc);
++			if (delay) {
++				DBG(("%s: requeueing timer for %dms delay\n", __FUNCTION__, delay));
++				return delay; /* non-zero return: timer is kept (not freed) and requeued per the DBG above */
++			}
++
++			/* As a last resort use a blocking wait.
++			 * Less than a millisecond for (hopefully) a rare case.
++			 */
++			DBG(("%s: blocking wait!\n", __FUNCTION__));
++			vbl.request.type = DRM_VBLANK_ABSOLUTE;
++			vbl.request.sequence = info->target_msc;
++			if (sna_wait_vblank(info->sna, &vbl, sna_crtc_pipe(info->crtc)) == 0) {
++				ust = ust64(vbl.reply.tval_sec, vbl.reply.tval_usec);
++				msc = sna_crtc_record_vblank(info->crtc, &vbl);
++			} else {
++				DBG(("%s: blocking wait failed, fudging\n",
++				     __FUNCTION__));
++				goto fixup;
++			}
++		}
++	} else {
++fixup: /* no usable vblank reply: report target_msc stamped with the current time */
++		ust = gettime_ust64();
++		msc = info->target_msc;
++		DBG(("%s: event=%lld, CRTC OFF, target msc=%lld, was %lld (off)\n",
++		     __FUNCTION__, (long long)info->event_id[0], (long long)info->target_msc, (long long)sna_crtc_last_swap(info->crtc)->msc));
++	}
++
++	vblank_complete(info, ust, msc); /* notifies Present, or requeues info if still early */
++	free(timer);
++	return 0;
++}
++
++static bool sna_fake_vblank(struct sna_present_event *info) /* completes info now, or arms a timer to do it; false if TimerSet() returns NULL */
++{
++	const struct ust_msc *swap = sna_crtc_last_swap(info->crtc);
++	uint32_t delay;
++
++	if (msc_before(swap->msc, info->target_msc)) /* target still ahead of the last recorded swap */
++		delay = msc_to_delay(info->crtc, info->target_msc);
++	else
++		delay = 0;
++
++	DBG(("%s(event=%lldx%d, target_msc=%lld, msc=%lld, delay=%ums)\n",
++	     __FUNCTION__, (long long)info->event_id[0], info->n_event_id,
++	     (long long)info->target_msc, (long long)swap->msc, delay));
++	if (delay == 0) { /* nothing to wait for: complete synchronously */
++		uint64_t ust, msc;
++
++		if (msc_before(swap->msc, info->target_msc)) {
++			/* Fixup and pretend it completed immediately */
++			msc = info->target_msc;
++			ust = gettime_ust64();
++		} else { /* target already reached: report the last swap's own msc/ust */
++			msc = swap->msc;
++			ust = swap_ust(swap);
++		}
++
++		vblank_complete(info, ust, msc);
++		return true;
++	}
++
++	return TimerSet(NULL, 0, delay, sna_fake_vblank_handler, info); /* timer pointer converted to bool */
++}
++
++static bool sna_present_queue(struct sna_present_event *info, /* arranges completion of info at info->target_msc */
++			      uint64_t last_msc) /* most recent known vblank count; must not be past the target */
++{
++	union drm_wait_vblank vbl;
++	int delta = info->target_msc - last_msc; /* frames still to wait */
++
++	DBG(("%s: target msc=%llu, seq=%u (last_msc=%llu), delta=%d\n",
++	     __FUNCTION__,
++	     (long long)info->target_msc,
++	     (unsigned)info->target_msc,
++	     (long long)last_msc,
++	     delta));
++	assert(info->target_msc - last_msc < 1ull<<31);
++	assert(delta >= 0);
++
++	VG_CLEAR(vbl);
++	vbl.request.type = DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT;
++	vbl.request.sequence = info->target_msc;
++	vbl.request.signal = (uintptr_t)MARK_PRESENT(info); /* info pointer tagged with bit 1 by MARK_PRESENT */
++	if (delta > 2 || /* more than two frames away: skip the ioctl and use the timer path */
++	    sna_wait_vblank(info->sna, &vbl, sna_crtc_pipe(info->crtc))) {
++		DBG(("%s: vblank enqueue failed, faking delta=%d\n", __FUNCTION__, delta));
++		if (!sna_fake_vblank(info)) /* false only when the fallback could not be set up either */
++			return false;
++	} else {
++		info->queued = true;
++		if (delta == 1) { /* next frame: flag the crtc and tag the stored pointer (see has_vblank()) */
++			sna_crtc_set_vblank(info->crtc);
++			info->crtc = mark_crtc(info->crtc);
++		}
++	}
++
++	return true;
++}
++
+ static RRCrtcPtr
+ sna_present_get_crtc(WindowPtr window)
+ {
+@@ -81,7 +343,10 @@ sna_present_get_crtc(WindowPtr window)
+ 	BoxRec box;
+ 	xf86CrtcPtr crtc;
+ 
+-	DBG(("%s\n", __FUNCTION__));
++	DBG(("%s: window=%ld (pixmap=%ld), box=(%d, %d)x(%d, %d)\n",
++	     __FUNCTION__, window->drawable.id, get_window_pixmap(window)->drawable.serialNumber,
++	     window->drawable.x, window->drawable.y,
++	     window->drawable.width, window->drawable.height));
+ 
+ 	box.x1 = window->drawable.x;
+ 	box.y1 = window->drawable.y;
+@@ -99,26 +364,59 @@ static int
+ sna_present_get_ust_msc(RRCrtcPtr crtc, CARD64 *ust, CARD64 *msc)
+ {
+ 	struct sna *sna = to_sna_from_screen(crtc->pScreen);
+-	int pipe = pipe_from_crtc(crtc);
+ 	union drm_wait_vblank vbl;
+ 
+-	DBG(("%s(pipe=%d)\n", __FUNCTION__, pipe));
++	DBG(("%s(pipe=%d)\n", __FUNCTION__, sna_crtc_pipe(crtc->devPrivate)));
++	if (sna_crtc_has_vblank(crtc->devPrivate)) {
++		DBG(("%s: vblank active, reusing last swap msc/ust\n",
++		     __FUNCTION__));
++		goto last;
++	}
+ 
+ 	VG_CLEAR(vbl);
+ 	vbl.request.type = DRM_VBLANK_RELATIVE;
+ 	vbl.request.sequence = 0;
+-	if (sna_wait_vblank(sna, &vbl, pipe) == 0) {
++	if (sna_wait_vblank(sna, &vbl, sna_crtc_pipe(crtc->devPrivate)) == 0) {
++		struct sna_present_event *info;
++
+ 		*ust = ust64(vbl.reply.tval_sec, vbl.reply.tval_usec);
+ 		*msc = sna_crtc_record_vblank(crtc->devPrivate, &vbl);
++
++		info = info_alloc(sna);
++		if (info) {
++			info->crtc = crtc->devPrivate;
++			info->sna = sna;
++			info->target_msc = *msc + 1;
++			info->event_id = (uint64_t *)(info + 1);
++			info->n_event_id = 0;
++
++			vbl.request.type =
++				DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT;
++			vbl.request.sequence = info->target_msc;
++			vbl.request.signal = (uintptr_t)MARK_PRESENT(info);
++
++			if (sna_wait_vblank(info->sna, &vbl,
++					    sna_crtc_pipe(info->crtc)) == 0) {
++				list_add(&info->link,
++					 &sna->present.vblank_queue);
++				info->queued = true;
++				sna_crtc_set_vblank(info->crtc);
++				info->crtc = mark_crtc(info->crtc);
++			} else
++				info_free(info);
++		}
+ 	} else {
+-		const struct ust_msc *swap = sna_crtc_last_swap(crtc->devPrivate);
+-		*ust = ust64(swap->tv_sec, swap->tv_usec);
++		const struct ust_msc *swap;
++last:
++		swap = sna_crtc_last_swap(crtc->devPrivate);
++		*ust = swap_ust(swap);
+ 		*msc = swap->msc;
+ 	}
+ 
+-	DBG(("%s: pipe=%d, tv=%d.%06d msc=%lld\n", __FUNCTION__, pipe,
++	DBG(("%s: pipe=%d, tv=%d.%06d seq=%d msc=%lld\n", __FUNCTION__,
++	     sna_crtc_pipe(crtc->devPrivate),
+ 	     (int)(*ust / 1000000), (int)(*ust % 1000000),
+-	     (long long)*msc));
++	     vbl.reply.sequence, (long long)*msc));
+ 
+ 	return Success;
+ }
+@@ -127,43 +425,106 @@ void
+ sna_present_vblank_handler(struct drm_event_vblank *event)
+ {
+ 	struct sna_present_event *info = to_present_event(event->user_data);
++	uint64_t msc;
+ 
+-	DBG(("%s: pipe=%d tv=%d.%06d msc=%d, event %lld complete\n", __FUNCTION__,
+-	     sna_crtc_to_pipe(info->crtc),
+-	     event->tv_sec, event->tv_usec, event->sequence,
+-	     (long long)info->event_id));
+-	present_event_notify(info->event_id,
+-			     ust64(event->tv_sec, event->tv_usec),
+-			     sna_crtc_record_event(info->crtc, event));
+-	free(info);
++	if (!info->queued) {
++		DBG(("%s: arrived unexpectedly early (not queued)\n", __FUNCTION__));
++		assert(!has_vblank(info->crtc));
++		return;
++	}
++
++	if (has_vblank(info->crtc)) {
++		DBG(("%s: clearing immediate flag\n", __FUNCTION__));
++		info->crtc = unmask_crtc(info->crtc);
++		sna_crtc_clear_vblank(info->crtc);
++	}
++
++	msc = sna_crtc_record_event(info->crtc, event);
++
++	if (info->sna->mode.shadow_wait) {
++		DBG(("%s: recursed from TearFree\n", __FUNCTION__));
++		if (TimerSet(NULL, 0, 1, sna_fake_vblank_handler, info))
++			return;
++	}
++
++	vblank_complete(info, ust64(event->tv_sec, event->tv_usec), msc);
+ }
+ 
+ static int
+ sna_present_queue_vblank(RRCrtcPtr crtc, uint64_t event_id, uint64_t msc)
+ {
+ 	struct sna *sna = to_sna_from_screen(crtc->pScreen);
+-	struct sna_present_event *event;
+-	union drm_wait_vblank vbl;
+-
+-	DBG(("%s(pipe=%d, event=%lld, msc=%lld)\n",
+-	     __FUNCTION__, pipe_from_crtc(crtc),
+-	     (long long)event_id, (long long)msc));
++	struct sna_present_event *info, *tmp;
++	const struct ust_msc *swap;
+ 
+-	event = malloc(sizeof(struct sna_present_event));
+-	if (event == NULL)
++	if (!sna_crtc_is_on(crtc->devPrivate))
+ 		return BadAlloc;
+ 
+-	event->event_id = event_id;
+-	event->crtc = crtc->devPrivate;
++	swap = sna_crtc_last_swap(crtc->devPrivate);
++	DBG(("%s(pipe=%d, event=%lld, msc=%lld, last swap=%lld)\n",
++	     __FUNCTION__, sna_crtc_pipe(crtc->devPrivate),
++	     (long long)event_id, (long long)msc, (long long)swap->msc));
+ 
+-	VG_CLEAR(vbl);
+-	vbl.request.type = DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT;
+-	vbl.request.sequence = msc;
+-	vbl.request.signal = (uintptr_t)MARK_PRESENT(event);
+-	if (sna_wait_vblank(sna, &vbl, sna_crtc_to_pipe(event->crtc))) {
+-		DBG(("%s: vblank enqueue failed\n", __FUNCTION__));
+-		free(event);
+-		return BadMatch;
++	if (warn_unless((int64_t)(msc - swap->msc) >= 0)) {
++		DBG(("%s: pipe=%d tv=%d.%06d msc=%lld (target=%lld), event=%lld complete\n", __FUNCTION__,
++		     sna_crtc_pipe(crtc->devPrivate),
++		     swap->tv_sec, swap->tv_usec,
++		     (long long)swap->msc, (long long)msc,
++		     (long long)event_id));
++		present_event_notify(event_id, swap_ust(swap), swap->msc);
++		return Success;
++	}
++	if (warn_unless(msc - swap->msc < 1ull<<31))
++		return BadValue;
++
++	list_for_each_entry(tmp, &sna->present.vblank_queue, link) {
++		if (tmp->target_msc == msc &&
++		    unmask_crtc(tmp->crtc) == crtc->devPrivate) {
++			uint64_t *events = tmp->event_id;
++
++			if (tmp->n_event_id &&
++			    is_power_of_two(tmp->n_event_id)) {
++				events = malloc(2*sizeof(uint64_t)*tmp->n_event_id);
++				if (events == NULL)
++					return BadAlloc;
++
++				memcpy(events,
++				       tmp->event_id,
++				       tmp->n_event_id*sizeof(uint64_t));
++				if (tmp->n_event_id != 1)
++					free(tmp->event_id);
++				tmp->event_id = events;
++			}
++
++			DBG(("%s: appending event=%lld to vblank %lld x %d\n",
++			     __FUNCTION__, (long long)event_id, (long long)msc, tmp->n_event_id+1));
++			events[tmp->n_event_id++] = event_id;
++			return Success;
++		}
++		if ((int64_t)(tmp->target_msc - msc) > 0) {
++			DBG(("%s: previous target_msc=%lld invalid for coalescing\n",
++			     __FUNCTION__, (long long)tmp->target_msc));
++			break;
++		}
++	}
++
++	info = info_alloc(sna);
++	if (info == NULL)
++		return BadAlloc;
++
++	info->crtc = crtc->devPrivate;
++	info->sna = sna;
++	info->target_msc = msc;
++	info->event_id = (uint64_t *)(info + 1);
++	info->event_id[0] = event_id;
++	info->n_event_id = 1;
++	list_add_tail(&info->link, &tmp->link);
++	info->queued = false;
++
++	if (!sna_present_queue(info, swap->msc)) {
++		list_del(&info->link);
++		info_free(info);
++		return BadAlloc;
+ 	}
+ 
+ 	return Success;
+@@ -180,14 +541,6 @@ sna_present_abort_vblank(RRCrtcPtr crtc, uint64_t event_id, uint64_t msc)
+ static void
+ sna_present_flush(WindowPtr window)
+ {
+-	PixmapPtr pixmap = get_window_pixmap(window);
+-	struct sna_pixmap *priv;
+-
+-	DBG(("%s(pixmap=%ld)\n", __FUNCTION__, pixmap->drawable.serialNumber));
+-
+-	priv = sna_pixmap_move_to_gpu(pixmap, MOVE_READ | MOVE_ASYNC_HINT | __MOVE_FORCE);
+-	if (priv && priv->gpu_bo)
+-		kgem_scanout_flush(&to_sna_from_pixmap(pixmap)->kgem, priv->gpu_bo);
+ }
+ 
+ static bool
+@@ -201,8 +554,13 @@ check_flip__crtc(struct sna *sna,
+ 
+ 	assert(sna->scrn->vtSema);
+ 
+-	if (sna->mode.shadow_active) {
+-		DBG(("%s: shadow buffer active\n", __FUNCTION__));
++	if (!sna->mode.front_active) {
++		DBG(("%s: DPMS off, no flips\n", __FUNCTION__));
++		return FALSE;
++	}
++
++	if (sna->mode.rr_active) {
++		DBG(("%s: RandR transformation active\n", __FUNCTION__));
+ 		return false;
+ 	}
+ 
+@@ -224,6 +582,11 @@ sna_present_check_flip(RRCrtcPtr crtc,
+ 	     pixmap->drawable.serialNumber,
+ 	     sync_flip));
+ 
++	if (!sna->scrn->vtSema) {
++		DBG(("%s: VT switched away, no flips\n", __FUNCTION__));
++		return FALSE;
++	}
++
+ 	if (sna->flags & SNA_NO_FLIP) {
+ 		DBG(("%s: flips not suported\n", __FUNCTION__));
+ 		return FALSE;
+@@ -231,7 +594,7 @@ sna_present_check_flip(RRCrtcPtr crtc,
+ 
+ 	if (sync_flip) {
+ 		if ((sna->flags & SNA_HAS_FLIP) == 0) {
+-			DBG(("%s: async flips not suported\n", __FUNCTION__));
++			DBG(("%s: sync flips not suported\n", __FUNCTION__));
+ 			return FALSE;
+ 		}
+ 	} else {
+@@ -257,24 +620,39 @@ sna_present_check_flip(RRCrtcPtr crtc,
+ 		return FALSE;
+ 	}
+ 
+-	return TRUE;
+-}
+-
+-static uint64_t gettime_ust64(void)
+-{
+-	struct timespec tv;
++	if (flip->pinned) {
++		assert(flip->gpu_bo);
++		if (sna->flags & SNA_LINEAR_FB) {
++			if (flip->gpu_bo->tiling != I915_TILING_NONE) {
++				DBG(("%s: pined bo, tilng=%d needs NONE\n",
++				     __FUNCTION__, flip->gpu_bo->tiling));
++				return FALSE;
++			}
++		} else {
++			if (!sna->kgem.can_scanout_y &&
++			    flip->gpu_bo->tiling == I915_TILING_Y) {
++				DBG(("%s: pined bo, tilng=%d and can't scanout Y\n",
++				     __FUNCTION__, flip->gpu_bo->tiling));
++				return FALSE;
++			}
++		}
+ 
+-	if (clock_gettime(CLOCK_MONOTONIC, &tv))
+-		return 0;
++		if (flip->gpu_bo->pitch & 63) {
++			DBG(("%s: pined bo, bad pitch=%d\n",
++			     __FUNCTION__, flip->gpu_bo->pitch));
++			return FALSE;
++		}
++	}
+ 
+-	return ust64(tv.tv_sec, tv.tv_nsec / 1000);
++	return TRUE;
+ }
+ 
+ static Bool
+-page_flip__async(RRCrtcPtr crtc,
+-		 uint64_t event_id,
+-		 uint64_t target_msc,
+-		 struct kgem_bo *bo)
++flip__async(struct sna *sna,
++	    RRCrtcPtr crtc,
++	    uint64_t event_id,
++	    uint64_t target_msc,
++	    struct kgem_bo *bo)
+ {
+ 	DBG(("%s(pipe=%d, event=%lld, handle=%d)\n",
+ 	     __FUNCTION__,
+@@ -282,17 +660,17 @@ page_flip__async(RRCrtcPtr crtc,
+ 	     (long long)event_id,
+ 	     bo->handle));
+ 
+-	if (!sna_page_flip(to_sna_from_screen(crtc->pScreen), bo, NULL, NULL)) {
++	if (!sna_page_flip(sna, bo, NULL, NULL)) {
+ 		DBG(("%s: async pageflip failed\n", __FUNCTION__));
+ 		present_info.capabilities &= ~PresentCapabilityAsync;
+ 		return FALSE;
+ 	}
+ 
+-	DBG(("%s: pipe=%d tv=%d.%06d msc=%d, event %lld complete\n", __FUNCTION__,
++	DBG(("%s: pipe=%d tv=%ld.%06d msc=%lld (target=%lld), event=%lld complete\n", __FUNCTION__,
+ 	     pipe_from_crtc(crtc),
+-	     gettime_ust64() / 1000000, gettime_ust64() % 1000000,
+-	     sna_crtc_last_swap(crtc->devPrivate)->msc,
+-	     (long long)event_id));
++	     (long)(gettime_ust64() / 1000000), (int)(gettime_ust64() % 1000000),
++	     crtc ? (long long)sna_crtc_last_swap(crtc->devPrivate)->msc : 0LL,
++	     (long long)target_msc, (long long)event_id));
+ 	present_event_notify(event_id, gettime_ust64(), target_msc);
+ 	return TRUE;
+ }
+@@ -303,7 +681,12 @@ present_flip_handler(struct drm_event_vblank *event, void *data)
+ 	struct sna_present_event *info = data;
+ 	struct ust_msc swap;
+ 
+-	DBG(("%s(sequence=%d)\n", __FUNCTION__, event->sequence));
++	DBG(("%s(sequence=%d): event=%lld\n", __FUNCTION__, event->sequence, (long long)info->event_id[0]));
++	assert(info->n_event_id == 1);
++	if (!info->queued) {
++		DBG(("%s: arrived unexpectedly early (not queued)\n", __FUNCTION__));
++		return;
++	}
+ 
+ 	if (info->crtc == NULL) {
+ 		swap.tv_sec = event->tv_sec;
+@@ -312,22 +695,33 @@ present_flip_handler(struct drm_event_vblank *event, void *data)
+ 	} else
+ 		swap = *sna_crtc_last_swap(info->crtc);
+ 
+-	DBG(("%s: pipe=%d, tv=%d.%06d msc %lld, event %lld complete\n", __FUNCTION__,
+-	     info->crtc ? sna_crtc_to_pipe(info->crtc) : -1,
++	DBG(("%s: pipe=%d, tv=%d.%06d msc=%lld (target %lld), event=%lld complete%s\n", __FUNCTION__,
++	     info->crtc ? sna_crtc_pipe(info->crtc) : -1,
+ 	     swap.tv_sec, swap.tv_usec, (long long)swap.msc,
+-	     (long long)info->event_id));
+-	present_event_notify(info->event_id, ust64(swap.tv_sec, swap.tv_usec), swap.msc);
+-	free(info);
++	     (long long)info->target_msc,
++	     (long long)info->event_id[0],
++	     info->target_msc && info->target_msc == swap.msc ? "" : ": MISS"));
++	present_event_notify(info->event_id[0], swap_ust(&swap), swap.msc);
++	if (info->crtc)
++		sna_crtc_clear_vblank(info->crtc);
++
++	if (info->sna->present.unflip) {
++		DBG(("%s: executing queued unflip (event=%lld)\n", __FUNCTION__, (long long)info->sna->present.unflip));
++		sna_present_unflip(xf86ScrnToScreen(info->sna->scrn),
++				   info->sna->present.unflip);
++		info->sna->present.unflip = 0;
++	}
++	info_free(info);
+ }
+ 
+ static Bool
+-page_flip(ScreenPtr screen,
+-	  RRCrtcPtr crtc,
+-	  uint64_t event_id,
+-	  struct kgem_bo *bo)
++flip(struct sna *sna,
++     RRCrtcPtr crtc,
++     uint64_t event_id,
++     uint64_t target_msc,
++     struct kgem_bo *bo)
+ {
+-	struct sna *sna = to_sna_from_screen(screen);
+-	struct sna_present_event *event;
++	struct sna_present_event *info;
+ 
+ 	DBG(("%s(pipe=%d, event=%lld, handle=%d)\n",
+ 	     __FUNCTION__,
+@@ -335,18 +729,27 @@ page_flip(ScreenPtr screen,
+ 	     (long long)event_id,
+ 	     bo->handle));
+ 
+-	event = malloc(sizeof(struct sna_present_event));
+-	if (event == NULL)
++	info = info_alloc(sna);
++	if (info == NULL)
+ 		return FALSE;
+ 
+-	event->event_id = event_id;
+-	event->crtc = crtc ? crtc->devPrivate : NULL;
+-	if (!sna_page_flip(sna, bo, present_flip_handler, event)) {
++	info->crtc = crtc ? crtc->devPrivate : NULL;
++	info->sna = sna;
++	info->event_id = (uint64_t *)(info + 1);
++	info->event_id[0] = event_id;
++	info->n_event_id = 1;
++	info->target_msc = target_msc;
++	info->queued = false;
++
++	if (!sna_page_flip(sna, bo, present_flip_handler, info)) {
+ 		DBG(("%s: pageflip failed\n", __FUNCTION__));
+-		free(event);
++		info_free(info);
+ 		return FALSE;
+ 	}
+ 
++	info->queued = true;
++	if (info->crtc)
++		sna_crtc_set_vblank(info->crtc);
+ 	return TRUE;
+ }
+ 
+@@ -358,12 +761,48 @@ get_flip_bo(PixmapPtr pixmap)
+ 
+ 	DBG(("%s(pixmap=%ld)\n", __FUNCTION__, pixmap->drawable.serialNumber));
+ 
+-	priv = sna_pixmap_move_to_gpu(pixmap, MOVE_READ | __MOVE_FORCE);
++	priv = sna_pixmap_move_to_gpu(pixmap, MOVE_READ | __MOVE_SCANOUT | __MOVE_FORCE);
+ 	if (priv == NULL) {
+ 		DBG(("%s: cannot force pixmap to the GPU\n", __FUNCTION__));
+ 		return NULL;
+ 	}
+ 
++	if (priv->gpu_bo->scanout)
++		return priv->gpu_bo;
++
++	if (sna->kgem.has_llc && !wedged(sna) && !priv->pinned) {
++		struct kgem_bo *bo;
++		uint32_t tiling;
++
++		tiling = I915_TILING_NONE;
++		if ((sna->flags & SNA_LINEAR_FB) == 0)
++			tiling = I915_TILING_X;
++
++		bo = kgem_create_2d(&sna->kgem,
++				    pixmap->drawable.width,
++				    pixmap->drawable.height,
++				    pixmap->drawable.bitsPerPixel,
++				    tiling, CREATE_SCANOUT | CREATE_CACHED);
++		if (bo) {
++			BoxRec box;
++
++			box.x1 = box.y1 = 0;
++			box.x2 = pixmap->drawable.width;
++			box.y2 = pixmap->drawable.height;
++
++			if (sna->render.copy_boxes(sna, GXcopy,
++						   &pixmap->drawable, priv->gpu_bo, 0, 0,
++						   &pixmap->drawable, bo, 0, 0,
++						   &box, 1, 0)) {
++				sna_pixmap_unmap(pixmap, priv);
++				kgem_bo_destroy(&sna->kgem, priv->gpu_bo);
++
++				priv->gpu_bo = bo;
++			} else
++				kgem_bo_destroy(&sna->kgem, bo);
++		}
++	}
++
+ 	if (sna->flags & SNA_LINEAR_FB &&
+ 	    priv->gpu_bo->tiling &&
+ 	    !sna_pixmap_change_tiling(pixmap, I915_TILING_NONE)) {
+@@ -372,12 +811,17 @@ get_flip_bo(PixmapPtr pixmap)
+ 	}
+ 
+ 	if (priv->gpu_bo->tiling == I915_TILING_Y &&
++	    !sna->kgem.can_scanout_y &&
+ 	    !sna_pixmap_change_tiling(pixmap, I915_TILING_X)) {
+ 		DBG(("%s: invalid Y-tiling, cannot convert\n", __FUNCTION__));
+ 		return NULL;
+ 	}
+ 
+-	priv->pinned |= PIN_SCANOUT;
++	if (priv->gpu_bo->pitch & 63) {
++		DBG(("%s: invalid pitch, no conversion\n", __FUNCTION__));
++		return NULL;
++	}
++
+ 	return priv->gpu_bo;
+ }
+ 
+@@ -388,6 +832,7 @@ sna_present_flip(RRCrtcPtr crtc,
+ 		 PixmapPtr pixmap,
+ 		 Bool sync_flip)
+ {
++	struct sna *sna = to_sna_from_pixmap(pixmap);
+ 	struct kgem_bo *bo;
+ 
+ 	DBG(("%s(pipe=%d, event=%lld, msc=%lld, pixmap=%ld, sync?=%d)\n",
+@@ -397,11 +842,32 @@ sna_present_flip(RRCrtcPtr crtc,
+ 	     (long long)target_msc,
+ 	     pixmap->drawable.serialNumber, sync_flip));
+ 
+-	if (!check_flip__crtc(to_sna_from_pixmap(pixmap), crtc)) {
++	if (!check_flip__crtc(sna, crtc)) {
+ 		DBG(("%s: flip invalid for CRTC\n", __FUNCTION__));
+ 		return FALSE;
+ 	}
+ 
++	assert(sna->present.unflip == 0);
++
++	if (sna->flags & SNA_TEAR_FREE) {
++		DBG(("%s: disabling TearFree (was %s) in favour of Present flips\n",
++		     __FUNCTION__, sna->mode.shadow_enabled ? "enabled" : "disabled"));
++		sna->mode.shadow_enabled = false;
++	}
++	assert(!sna->mode.shadow_enabled);
++
++	if (sna->mode.flip_active) {
++		struct pollfd pfd;
++
++		DBG(("%s: flips still pending, stalling\n", __FUNCTION__));
++		pfd.fd = sna->kgem.fd;
++		pfd.events = POLLIN;
++		while (poll(&pfd, 1, 0) == 1)
++			sna_mode_wakeup(sna);
++		if (sna->mode.flip_active)
++			return FALSE;
++	}
++
+ 	bo = get_flip_bo(pixmap);
+ 	if (bo == NULL) {
+ 		DBG(("%s: flip invalid bo\n", __FUNCTION__));
+@@ -409,9 +875,9 @@ sna_present_flip(RRCrtcPtr crtc,
+ 	}
+ 
+ 	if (sync_flip)
+-		return page_flip(crtc->pScreen, crtc, event_id, bo);
++		return flip(sna, crtc, event_id, target_msc, bo);
+ 	else
+-		return page_flip__async(crtc, event_id, target_msc, bo);
++		return flip__async(sna, crtc, event_id, target_msc, bo);
+ }
+ 
+ static void
+@@ -421,29 +887,70 @@ sna_present_unflip(ScreenPtr screen, uint64_t event_id)
+ 	struct kgem_bo *bo;
+ 
+ 	DBG(("%s(event=%lld)\n", __FUNCTION__, (long long)event_id));
+-	if (sna->mode.front_active == 0 || sna->mode.shadow_active) {
++	if (sna->mode.front_active == 0 || sna->mode.rr_active) {
+ 		const struct ust_msc *swap;
+ 
+ 		DBG(("%s: no CRTC active, perform no-op flip\n", __FUNCTION__));
+ 
+ notify:
+-		swap = sna_crtc_last_swap(sna_mode_first_crtc(sna));
+-		DBG(("%s: pipe=%d, tv=%d.%06d msc %lld, event %lld complete\n", __FUNCTION__,
++		swap = sna_crtc_last_swap(sna_primary_crtc(sna));
++		DBG(("%s: pipe=%d, tv=%d.%06d msc=%lld, event=%lld complete\n", __FUNCTION__,
+ 		     -1,
+ 		     swap->tv_sec, swap->tv_usec, (long long)swap->msc,
+ 		     (long long)event_id));
+-		present_event_notify(event_id,
+-				     ust64(swap->tv_sec, swap->tv_usec),
+-				     swap->msc);
++		present_event_notify(event_id, swap_ust(swap), swap->msc);
++		return;
++	}
++
++	assert(!sna->mode.shadow_enabled);
++	if (sna->mode.flip_active) {
++		DBG(("%s: %d outstanding flips, queueing unflip\n", __FUNCTION__, sna->mode.flip_active));
++		assert(sna->present.unflip == 0);
++		sna->present.unflip = event_id;
+ 		return;
+ 	}
+ 
++	if (sna->flags & SNA_TEAR_FREE) {
++		DBG(("%s: %s TearFree after Present flips\n",
++		     __FUNCTION__, sna->mode.shadow_damage != NULL ? "enabling" : "disabling"));
++		sna->mode.shadow_enabled = sna->mode.shadow_damage != NULL;
++	}
++
+ 	bo = get_flip_bo(screen->GetScreenPixmap(screen));
+-	if (bo == NULL || !page_flip(screen, NULL, event_id, bo)) {
++	if (bo == NULL) {
++reset_mode:
+ 		DBG(("%s: failed, trying to restore original mode\n", __FUNCTION__));
+ 		xf86SetDesiredModes(sna->scrn);
+ 		goto notify;
+ 	}
++
++	/* Are we unflipping after a failure that left our ScreenP in place? */
++	if (!sna_needs_page_flip(sna, bo))
++		goto notify;
++
++	assert(sna_pixmap(screen->GetScreenPixmap(screen))->pinned & PIN_SCANOUT);
++
++	if (sna->flags & SNA_HAS_ASYNC_FLIP) {
++		DBG(("%s: trying async flip restore\n", __FUNCTION__));
++		if (flip__async(sna, NULL, event_id, 0, bo))
++			return;
++	}
++
++	if (!flip(sna, NULL, event_id, 0, bo))
++		goto reset_mode;
++}
++
++void sna_present_cancel_flip(struct sna *sna)
++{
++	if (sna->present.unflip) {
++		const struct ust_msc *swap;
++
++		swap = sna_crtc_last_swap(sna_primary_crtc(sna));
++		present_event_notify(sna->present.unflip,
++				     swap_ust(swap), swap->msc);
++
++		sna->present.unflip = 0;
++	}
+ }
+ 
+ static present_screen_info_rec present_info = {
+@@ -463,10 +970,13 @@ static present_screen_info_rec present_info = {
+ 
+ bool sna_present_open(struct sna *sna, ScreenPtr screen)
+ {
++	DBG(("%s(num_crtc=%d)\n", __FUNCTION__, sna->mode.num_real_crtc));
++
+ 	if (sna->mode.num_real_crtc == 0)
+ 		return false;
+ 
+ 	sna_present_update(sna);
++	list_init(&sna->present.vblank_queue);
+ 
+ 	return present_screen_init(screen, &present_info);
+ }
+diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
+index 3fbb9ecb..3e935d57 100644
+--- a/src/sna/sna_render.c
++++ b/src/sna/sna_render.c
+@@ -54,7 +54,7 @@ sna_format_for_depth(int depth)
+ {
+ 	switch (depth) {
+ 	case 1: return PICT_a1;
+-	case 4: return PICT_a4;
++	case 4: return PICT_x4a4;
+ 	case 8: return PICT_a8;
+ 	case 15: return PICT_x1r5g5b5;
+ 	case 16: return PICT_r5g6b5;
+@@ -272,18 +272,6 @@ no_render_context_switch(struct kgem *kgem,
+ }
+ 
+ static void
+-no_render_retire(struct kgem *kgem)
+-{
+-	(void)kgem;
+-}
+-
+-static void
+-no_render_expire(struct kgem *kgem)
+-{
+-	(void)kgem;
+-}
+-
+-static void
+ no_render_fini(struct sna *sna)
+ {
+ 	(void)sna;
+@@ -316,8 +304,6 @@ const char *no_render_init(struct sna *sna)
+ 	render->fini = no_render_fini;
+ 
+ 	sna->kgem.context_switch = no_render_context_switch;
+-	sna->kgem.retire = no_render_retire;
+-	sna->kgem.expire = no_render_expire;
+ 	if (sna->kgem.has_blt)
+ 		sna->kgem.ring = KGEM_BLT;
+ 
+@@ -407,10 +393,7 @@ use_cpu_bo(struct sna *sna, PixmapPtr pixmap, const BoxRec *box, bool blt)
+ 		}
+ 	}
+ 
+-	if (priv->shm) {
+-		assert(!priv->flush);
+-		sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
+-	}
++	add_shm_flush(sna, priv);
+ 
+ 	DBG(("%s for box=(%d, %d), (%d, %d)\n",
+ 	     __FUNCTION__, box->x1, box->y1, box->x2, box->y2));
+@@ -567,6 +550,7 @@ static struct kgem_bo *upload(struct sna *sna,
+ 			assert(priv->gpu_damage == NULL);
+ 			assert(priv->gpu_bo == NULL);
+ 			assert(bo->proxy != NULL);
++			sna_damage_all(&priv->cpu_damage, pixmap);
+ 			kgem_proxy_bo_attach(bo, &priv->gpu_bo);
+ 		}
+ 	}
+@@ -627,10 +611,7 @@ sna_render_pixmap_bo(struct sna *sna,
+ 		    !priv->cpu_bo->snoop && priv->cpu_bo->pitch < 4096) {
+ 			DBG(("%s: CPU all damaged\n", __FUNCTION__));
+ 			channel->bo = priv->cpu_bo;
+-			if (priv->shm) {
+-				assert(!priv->flush);
+-				sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
+-			}
++			add_shm_flush(sna, priv);
+ 			goto done;
+ 		}
+ 	}
+@@ -1275,6 +1256,7 @@ sna_render_picture_extract(struct sna *sna,
+ 			assert(priv->gpu_damage == NULL);
+ 			assert(priv->gpu_bo == NULL);
+ 			assert(bo->proxy != NULL);
++			sna_damage_all(&priv->cpu_damage, pixmap);
+ 			kgem_proxy_bo_attach(bo, &priv->gpu_bo);
+ 		}
+ 	}
+@@ -1338,6 +1320,8 @@ sna_render_picture_convolve(struct sna *sna,
+ 	 */
+ 	DBG(("%s: origin=(%d,%d) kernel=%dx%d, size=%dx%d\n",
+ 	     __FUNCTION__, x_off, y_off, cw, ch, w, h));
++	if (cw*ch > 32) /* too much loss of precision from quantization! */
++		return -1;
+ 
+ 	assert(picture->pDrawable);
+ 	assert(picture->filter == PictFilterConvolution);
+@@ -1388,9 +1372,9 @@ sna_render_picture_convolve(struct sna *sna,
+ 			alpha = CreateSolidPicture(0, &color, &error);
+ 			if (alpha) {
+ 				sna_composite(PictOpAdd, picture, alpha, tmp,
+-					      x, y,
++					      x-(x_off+i), y-(y_off+j),
++					      0, 0,
+ 					      0, 0,
+-					      x_off+i, y_off+j,
+ 					      w, h);
+ 				FreePicture(alpha, 0);
+ 			}
+@@ -2183,11 +2167,11 @@ copy_overlap(struct sna *sna, uint8_t alu,
+ 	ret = (sna->render.copy_boxes(sna, GXcopy,
+ 				      draw, bo, src_dx, src_dy,
+ 				      &tmp->drawable, tmp_bo, -extents->x1, -extents->y1,
+-				      box, n , 0) &&
++				      box, n, 0) &&
+ 	       sna->render.copy_boxes(sna, alu,
+ 				      &tmp->drawable, tmp_bo, -extents->x1, -extents->y1,
+ 				      draw, bo, dst_dx, dst_dy,
+-				      box, n , 0));
++				      box, n, 0));
+ 
+ 	screen->DestroyPixmap(tmp);
+ 	return ret;
+@@ -2308,16 +2292,22 @@ static bool can_copy_cpu(struct sna *sna,
+ 			 struct kgem_bo *src,
+ 			 struct kgem_bo *dst)
+ {
+-	if (src->tiling != dst->tiling)
+-		return false;
++	DBG(("%s: tiling=%d:%d, pitch=%d:%d, can_map=%d:%d[%d]\n",
++	     __FUNCTION__,
++	     src->tiling, dst->tiling,
++	     src->pitch, dst->pitch,
++	     kgem_bo_can_map__cpu(&sna->kgem, src, false),
++	     kgem_bo_can_map__cpu(&sna->kgem, dst, true),
++	     sna->kgem.has_wc_mmap));
+ 
+-	if (src->pitch != dst->pitch)
++	if (src->tiling != dst->tiling)
+ 		return false;
+ 
+ 	if (!kgem_bo_can_map__cpu(&sna->kgem, src, false))
+ 		return false;
+ 
+-	if (!kgem_bo_can_map__cpu(&sna->kgem, dst, true))
++	if (!kgem_bo_can_map__cpu(&sna->kgem, dst, true) &&
++	    !sna->kgem.has_wc_mmap)
+ 		return false;
+ 
+ 	DBG(("%s -- yes, src handle=%d, dst handle=%d\n", __FUNCTION__, src->handle, dst->handle));
+@@ -2330,31 +2320,62 @@ memcpy_copy_boxes(struct sna *sna, uint8_t op,
+ 		  const DrawableRec *dst_draw, struct kgem_bo *dst_bo, int16_t dx, int16_t dy,
+ 		  const BoxRec *box, int n, unsigned flags)
+ {
++	memcpy_box_func detile = NULL;
+ 	void *dst, *src;
+-	bool clipped;
+ 
+ 	if (op != GXcopy)
+ 		return false;
+ 
+-	clipped = (n > 1 ||
+-		   box->x1 + dx > 0 ||
+-		   box->y1 + dy > 0 ||
+-		   box->x2 + dx < dst_draw->width ||
+-		   box->y2 + dy < dst_draw->height);
++	if (src_draw->depth != dst_draw->depth)
++		return false;
+ 
+ 	dst = src = NULL;
+-	if (!clipped && can_copy_cpu(sna, src_bo, dst_bo)) {
+-		dst = kgem_bo_map__cpu(&sna->kgem, dst_bo);
++	if (can_copy_cpu(sna, src_bo, dst_bo)) {
++		if (src_bo->pitch != dst_bo->pitch ||
++		    dx != sx || dy != sy || n > 1 ||
++		    box->x1 + dx > 0 ||
++		    box->y1 + dy > 0 ||
++		    box->x2 + dx < dst_draw->width ||
++		    box->y2 + dy < dst_draw->height) {
++			if (dx != sx) /* not implemented in memcpy yet */
++				goto use_gtt;
++
++			switch (dst_bo->tiling) {
++			default:
++			case I915_TILING_Y:
++				goto use_gtt;
++
++			case I915_TILING_X:
++				detile = sna->kgem.memcpy_between_tiled_x;
++				if (detile == NULL)
++					goto use_gtt;
++				break;
++
++			case I915_TILING_NONE:
++				break;
++			}
++		}
++
++		if (kgem_bo_can_map__cpu(&sna->kgem, dst_bo, true))
++			dst = kgem_bo_map__cpu(&sna->kgem, dst_bo);
++		else
++			dst = kgem_bo_map__wc(&sna->kgem, dst_bo);
+ 		src = kgem_bo_map__cpu(&sna->kgem, src_bo);
+ 	}
+ 
+ 	if (dst == NULL || src == NULL) {
++use_gtt:
+ 		dst = kgem_bo_map__gtt(&sna->kgem, dst_bo);
+ 		src = kgem_bo_map__gtt(&sna->kgem, src_bo);
+ 		if (dst == NULL || src == NULL)
+ 			return false;
++
++		detile = NULL;
+ 	} else {
+-		kgem_bo_sync__cpu_full(&sna->kgem, dst_bo, true);
++		if (dst == dst_bo->map__wc)
++			kgem_bo_sync__gtt(&sna->kgem, dst_bo);
++		else
++			kgem_bo_sync__cpu_full(&sna->kgem, dst_bo, true);
+ 		kgem_bo_sync__cpu_full(&sna->kgem, src_bo, false);
+ 	}
+ 
+@@ -2362,7 +2383,16 @@ memcpy_copy_boxes(struct sna *sna, uint8_t op,
+ 	     __FUNCTION__, sx, sy, dx, dy, n));
+ 
+ 	if (sigtrap_get() == 0) {
+-		do {
++		if (detile) {
++			do {
++				detile(src, dst, dst_draw->bitsPerPixel,
++				       src_bo->pitch, dst_bo->pitch,
++				       box->x1 + sx, box->y1 + sy,
++				       box->x1 + dx, box->y1 + dy,
++				       box->x2 - box->x1, box->y2 - box->y1);
++				box++;
++			} while (--n);
++		} else do {
+ 			memcpy_blt(src, dst, dst_draw->bitsPerPixel,
+ 				   src_bo->pitch, dst_bo->pitch,
+ 				   box->x1 + sx, box->y1 + sy,
+@@ -2380,4 +2410,5 @@ void
+ sna_render_mark_wedged(struct sna *sna)
+ {
+ 	sna->render.copy_boxes = memcpy_copy_boxes;
++	sna->render.prefer_gpu = 0;
+ }
+diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
+index 6e1fa480..4ba345a7 100644
+--- a/src/sna/sna_render.h
++++ b/src/sna/sna_render.h
+@@ -148,6 +148,10 @@ struct sna_composite_op {
+ 		struct {
+ 			uint32_t flags;
+ 		} gen8;
++
++		struct {
++			uint32_t flags;
++		} gen9;
+ 	} u;
+ 
+ 	void *priv;
+@@ -238,8 +242,9 @@ struct sna_render {
+ 			  int16_t w, int16_t h,
+ 			  unsigned flags,
+ 			  struct sna_composite_op *tmp);
+-#define COMPOSITE_PARTIAL 0x1
+-#define COMPOSITE_FALLBACK 0x80000000
++#define COMPOSITE_PARTIAL	0x1
++#define COMPOSITE_UPLOAD	0x40000000
++#define COMPOSITE_FALLBACK	0x80000000
+ 
+ 	bool (*check_composite_spans)(struct sna *sna, uint8_t op,
+ 				      PicturePtr dst, PicturePtr src,
+@@ -286,6 +291,8 @@ struct sna_render {
+ #define COPY_LAST 0x1
+ #define COPY_SYNC 0x2
+ #define COPY_NO_OVERLAP 0x4
++#define COPY_SMALL 0x8
++#define COPY_DRI 0x10
+ 
+ 	bool (*copy)(struct sna *sna, uint8_t alu,
+ 		     PixmapPtr src, struct kgem_bo *src_bo,
+@@ -481,6 +488,7 @@ enum {
+ 
+ 	GEN7_WM_KERNEL_VIDEO_PLANAR,
+ 	GEN7_WM_KERNEL_VIDEO_PACKED,
++	GEN7_WM_KERNEL_VIDEO_RGB,
+ 	GEN7_WM_KERNEL_COUNT
+ };
+ 
+@@ -533,12 +541,13 @@ enum {
+ 
+ 	GEN8_WM_KERNEL_VIDEO_PLANAR,
+ 	GEN8_WM_KERNEL_VIDEO_PACKED,
++	GEN8_WM_KERNEL_VIDEO_RGB,
+ 	GEN8_WM_KERNEL_COUNT
+ };
+ 
+ struct gen8_render_state {
+ 	unsigned gt;
+-
++	const struct gt_info *info;
+ 	struct kgem_bo *general_bo;
+ 
+ 	uint32_t vs_state;
+@@ -565,6 +574,58 @@ struct gen8_render_state {
+ 	bool emit_flush;
+ };
+ 
++enum {
++	GEN9_WM_KERNEL_NOMASK = 0,
++	GEN9_WM_KERNEL_NOMASK_P,
++
++	GEN9_WM_KERNEL_MASK,
++	GEN9_WM_KERNEL_MASK_P,
++
++	GEN9_WM_KERNEL_MASKCA,
++	GEN9_WM_KERNEL_MASKCA_P,
++
++	GEN9_WM_KERNEL_MASKSA,
++	GEN9_WM_KERNEL_MASKSA_P,
++
++	GEN9_WM_KERNEL_OPACITY,
++	GEN9_WM_KERNEL_OPACITY_P,
++
++	GEN9_WM_KERNEL_VIDEO_PLANAR,
++	GEN9_WM_KERNEL_VIDEO_PACKED,
++	GEN9_WM_KERNEL_VIDEO_RGB,
++	GEN9_WM_KERNEL_COUNT
++};
++
++struct gen9_render_state {
++	unsigned gt;
++	const struct gt_info *info;
++	struct kgem_bo *general_bo;
++
++	uint32_t vs_state;
++	uint32_t sf_state;
++	uint32_t sf_mask_state;
++	uint32_t wm_state;
++	uint32_t wm_kernel[GEN9_WM_KERNEL_COUNT][3];
++
++	uint32_t cc_blend;
++
++	uint32_t drawrect_offset;
++	uint32_t drawrect_limit;
++	uint32_t blend;
++	uint32_t samplers;
++	uint32_t kernel;
++
++	uint16_t num_sf_outputs;
++	uint16_t ve_id;
++	uint16_t last_primitive;
++	int16_t floats_per_vertex;
++	uint16_t surface_table;
++
++	bool needs_invariant;
++	bool emit_flush;
++	bool ve_dirty;
++};
++
+ struct sna_static_stream {
+ 	uint32_t size, used;
+ 	uint8_t *data;
+@@ -620,6 +681,7 @@ const char *gen5_render_init(struct sna *sna, const char *backend);
+ const char *gen6_render_init(struct sna *sna, const char *backend);
+ const char *gen7_render_init(struct sna *sna, const char *backend);
+ const char *gen8_render_init(struct sna *sna, const char *backend);
++const char *gen9_render_init(struct sna *sna, const char *backend);
+ 
+ void sna_render_mark_wedged(struct sna *sna);
+ 
+diff --git a/src/sna/sna_render_inline.h b/src/sna/sna_render_inline.h
+index 10fbbfe2..e162e37f 100644
+--- a/src/sna/sna_render_inline.h
++++ b/src/sna/sna_render_inline.h
+@@ -304,6 +304,12 @@ color_convert(uint32_t pixel,
+ 	return pixel;
+ }
+ 
++inline static uint32_t
++solid_color(uint32_t format, uint32_t pixel)
++{
++	return color_convert(pixel, format, PICT_a8r8g8b8);
++}
++
+ inline static bool dst_use_gpu(PixmapPtr pixmap)
+ {
+ 	struct sna_pixmap *priv = sna_pixmap(pixmap);
+diff --git a/src/sna/sna_tiling.c b/src/sna/sna_tiling.c
+index 308efc0a..8e2627f7 100644
+--- a/src/sna/sna_tiling.c
++++ b/src/sna/sna_tiling.c
+@@ -369,8 +369,7 @@ sna_tiling_composite_spans_boxes(struct sna *sna,
+ 				 const BoxRec *box, int nbox, float opacity)
+ {
+ 	while (nbox--)
+-		sna_tiling_composite_spans_box(sna, op->base.priv, box++, opacity);
+-	(void)sna;
++		sna_tiling_composite_spans_box(sna, op, box++, opacity);
+ }
+ 
+ fastcall static void
+@@ -581,6 +580,7 @@ sna_tiling_composite_spans(uint32_t op,
+ 	tile->rects = tile->rects_embedded;
+ 	tile->rect_count = 0;
+ 	tile->rect_size = ARRAY_SIZE(tile->rects_embedded);
++	COMPILE_TIME_ASSERT(sizeof(tile->rects_embedded[0]) >= sizeof(struct sna_tile_span));
+ 
+ 	tmp->box   = sna_tiling_composite_spans_box;
+ 	tmp->boxes = sna_tiling_composite_spans_boxes;
+diff --git a/src/sna/sna_trapezoids_boxes.c b/src/sna/sna_trapezoids_boxes.c
+index 9900e3f0..bbf83759 100644
+--- a/src/sna/sna_trapezoids_boxes.c
++++ b/src/sna/sna_trapezoids_boxes.c
+@@ -198,7 +198,7 @@ composite_aligned_boxes(struct sna *sna,
+ 	if (op == PictOpClear && sna->clear)
+ 		src = sna->clear;
+ 
+-	DBG(("%s: clipped extents (%d, %d), (%d, %d);  now offset by (%d, %d), orgin (%d, %d)\n",
++	DBG(("%s: clipped extents (%d, %d), (%d, %d);  now offset by (%d, %d), origin (%d, %d)\n",
+ 	     __FUNCTION__,
+ 	     clip.extents.x1, clip.extents.y1,
+ 	     clip.extents.x2, clip.extents.y2,
+@@ -592,6 +592,8 @@ lerp32_opacity(PixmapPtr scratch,
+ 	uint32_t *ptr;
+ 	int stride, i;
+ 
++	sigtrap_assert_active();
++
+ 	ptr = (uint32_t*)((uint8_t *)scratch->devPrivate.ptr + scratch->devKind * y);
+ 	ptr += x;
+ 	stride = scratch->devKind / 4;
+diff --git a/src/sna/sna_trapezoids_imprecise.c b/src/sna/sna_trapezoids_imprecise.c
+index 37def2f9..8bc7c8a8 100644
+--- a/src/sna/sna_trapezoids_imprecise.c
++++ b/src/sna/sna_trapezoids_imprecise.c
+@@ -962,6 +962,16 @@ tor_add_trapezoid(struct tor *tor,
+ 		  const xTrapezoid *t,
+ 		  int dx, int dy)
+ {
++	if (!xTrapezoidValid(t)) {
++		__DBG(("%s: skipping invalid trapezoid: top=%d, bottom=%d, left=(%d, %d), (%d, %d), right=(%d, %d), (%d, %d)\n",
++		       __FUNCTION__,
++		       t->top, t->bottom,
++		       t->left.p1.x, t->left.p1.y,
++		       t->left.p2.x, t->left.p2.y,
++		       t->right.p1.x, t->right.p1.y,
++		       t->right.p2.x, t->right.p2.y));
++		return;
++	}
+ 	polygon_add_edge(tor->polygon, t, &t->left, 1, dx, dy);
+ 	polygon_add_edge(tor->polygon, t, &t->right, -1, dx, dy);
+ }
+@@ -1687,31 +1697,27 @@ struct span_thread {
+ #define SPAN_THREAD_MAX_BOXES (8192/sizeof(struct sna_opacity_box))
+ struct span_thread_boxes {
+ 	const struct sna_composite_spans_op *op;
++	const BoxRec *clip_start, *clip_end;
+ 	int num_boxes;
+ 	struct sna_opacity_box boxes[SPAN_THREAD_MAX_BOXES];
+ };
+ 
+-static void span_thread_add_boxes(struct sna *sna, void *data,
+-				  const BoxRec *box, int count, float alpha)
++static void span_thread_add_box(struct sna *sna, void *data,
++				const BoxRec *box, float alpha)
+ {
+ 	struct span_thread_boxes *b = data;
+ 
+-	__DBG(("%s: adding %d boxes with alpha=%f\n",
+-	       __FUNCTION__, count, alpha));
++	__DBG(("%s: adding box with alpha=%f\n", __FUNCTION__, alpha));
+ 
+-	assert(count > 0 && count <= SPAN_THREAD_MAX_BOXES);
+-	if (unlikely(b->num_boxes + count > SPAN_THREAD_MAX_BOXES)) {
+-		DBG(("%s: flushing %d boxes, adding %d\n", __FUNCTION__, b->num_boxes, count));
+-		assert(b->num_boxes <= SPAN_THREAD_MAX_BOXES);
++	if (unlikely(b->num_boxes == SPAN_THREAD_MAX_BOXES)) {
++		DBG(("%s: flushing %d boxes\n", __FUNCTION__, b->num_boxes));
+ 		b->op->thread_boxes(sna, b->op, b->boxes, b->num_boxes);
+ 		b->num_boxes = 0;
+ 	}
+ 
+-	do {
+-		b->boxes[b->num_boxes].box = *box++;
+-		b->boxes[b->num_boxes].alpha = alpha;
+-		b->num_boxes++;
+-	} while (--count);
++	b->boxes[b->num_boxes].box = *box++;
++	b->boxes[b->num_boxes].alpha = alpha;
++	b->num_boxes++;
+ 	assert(b->num_boxes <= SPAN_THREAD_MAX_BOXES);
+ }
+ 
+@@ -1722,8 +1728,22 @@ span_thread_box(struct sna *sna,
+ 		const BoxRec *box,
+ 		int coverage)
+ {
++	struct span_thread_boxes *b = (struct span_thread_boxes *)op;
++
+ 	__DBG(("%s: %d -> %d @ %d\n", __FUNCTION__, box->x1, box->x2, coverage));
+-	span_thread_add_boxes(sna, op, box, 1, AREA_TO_ALPHA(coverage));
++	if (b->num_boxes) {
++		struct sna_opacity_box *bb = &b->boxes[b->num_boxes-1];
++		if (bb->box.x1 == box->x1 &&
++		    bb->box.x2 == box->x2 &&
++		    bb->box.y2 == box->y1 &&
++		    bb->alpha == AREA_TO_ALPHA(coverage)) {
++			bb->box.y2 = box->y2;
++			__DBG(("%s: contracted double row: %d -> %d\n", __func__, bb->box.y1, bb->box.y2));
++			return;
++		}
++	}
++
++	span_thread_add_box(sna, op, box, AREA_TO_ALPHA(coverage));
+ }
+ 
+ static void
+@@ -1733,20 +1753,28 @@ span_thread_clipped_box(struct sna *sna,
+ 			const BoxRec *box,
+ 			int coverage)
+ {
+-	pixman_region16_t region;
++	struct span_thread_boxes *b = (struct span_thread_boxes *)op;
++	const BoxRec *c;
+ 
+ 	__DBG(("%s: %d -> %d @ %f\n", __FUNCTION__, box->x1, box->x2,
+ 	       AREA_TO_ALPHA(coverage)));
+ 
+-	pixman_region_init_rects(&region, box, 1);
+-	RegionIntersect(&region, &region, clip);
+-	if (region_num_rects(&region)) {
+-		span_thread_add_boxes(sna, op,
+-				      region_rects(&region),
+-				      region_num_rects(&region),
+-				      AREA_TO_ALPHA(coverage));
++	b->clip_start =
++		find_clip_box_for_y(b->clip_start, b->clip_end, box->y1);
++
++	c = b->clip_start;
++	while (c != b->clip_end) {
++		BoxRec clipped;
++
++		if (box->y2 <= c->y1)
++			break;
++
++		clipped = *box;
++		if (!box_intersect(&clipped, c++))
++			continue;
++
++		span_thread_add_box(sna, op, &clipped, AREA_TO_ALPHA(coverage));
+ 	}
+-	pixman_region_fini(&region);
+ }
+ 
+ static span_func_t
+@@ -1777,6 +1805,16 @@ thread_choose_span(struct sna_composite_spans_op *tmp,
+ 	return span;
+ }
+ 
++inline static void
++span_thread_boxes_init(struct span_thread_boxes *boxes,
++		       const struct sna_composite_spans_op *op,
++		       const RegionRec *clip)
++{
++	boxes->op = op;
++	region_get_boxes(clip, &boxes->clip_start, &boxes->clip_end);
++	boxes->num_boxes = 0;
++}
++
+ static void
+ span_thread(void *arg)
+ {
+@@ -1789,8 +1827,7 @@ span_thread(void *arg)
+ 	if (!tor_init(&tor, &thread->extents, 2*thread->ntrap))
+ 		return;
+ 
+-	boxes.op = thread->op;
+-	boxes.num_boxes = 0;
++	span_thread_boxes_init(&boxes, thread->op, thread->clip);
+ 
+ 	y1 = thread->extents.y1 - thread->draw_y;
+ 	y2 = thread->extents.y2 - thread->draw_y;
+@@ -2190,6 +2227,52 @@ static void _tor_blt_src(struct inplace *in, const BoxRec *box, uint8_t v)
+ 	} while (--h);
+ }
+ 
++struct clipped_span {
++	span_func_t span;
++	const BoxRec *clip_start, *clip_end;
++};
++
++static void
++tor_blt_clipped(struct sna *sna,
++		struct sna_composite_spans_op *op,
++		pixman_region16_t *clip,
++		const BoxRec *box,
++		int coverage)
++{
++	struct clipped_span *cs = (struct clipped_span *)clip;
++	const BoxRec *c;
++
++	cs->clip_start =
++		find_clip_box_for_y(cs->clip_start, cs->clip_end, box->y1);
++
++	c = cs->clip_start;
++	while (c != cs->clip_end) {
++		BoxRec clipped;
++
++		if (box->y2 <= c->y1)
++			break;
++
++		clipped = *box;
++		if (!box_intersect(&clipped, c++))
++			continue;
++
++		cs->span(sna, op, NULL, &clipped, coverage);
++	}
++}
++
++inline static span_func_t
++clipped_span(struct clipped_span *cs,
++	     span_func_t span,
++	     const RegionRec *clip)
++{
++	if (clip->data) {
++		cs->span = span;
++		region_get_boxes(clip, &cs->clip_start, &cs->clip_end);
++		span = tor_blt_clipped;
++	}
++	return span;
++}
++
+ static void
+ tor_blt_src(struct sna *sna,
+ 	    struct sna_composite_spans_op *op,
+@@ -2203,25 +2286,6 @@ tor_blt_src(struct sna *sna,
+ }
+ 
+ static void
+-tor_blt_src_clipped(struct sna *sna,
+-		    struct sna_composite_spans_op *op,
+-		    pixman_region16_t *clip,
+-		    const BoxRec *box,
+-		    int coverage)
+-{
+-	pixman_region16_t region;
+-	int n;
+-
+-	pixman_region_init_rects(&region, box, 1);
+-	RegionIntersect(&region, &region, clip);
+-	n = region_num_rects(&region);
+-	box = region_rects(&region);
+-	while (n--)
+-		tor_blt_src(sna, op, NULL, box++, coverage);
+-	pixman_region_fini(&region);
+-}
+-
+-static void
+ tor_blt_in(struct sna *sna,
+ 	   struct sna_composite_spans_op *op,
+ 	   pixman_region16_t *clip,
+@@ -2253,25 +2317,6 @@ tor_blt_in(struct sna *sna,
+ }
+ 
+ static void
+-tor_blt_in_clipped(struct sna *sna,
+-		   struct sna_composite_spans_op *op,
+-		   pixman_region16_t *clip,
+-		   const BoxRec *box,
+-		   int coverage)
+-{
+-	pixman_region16_t region;
+-	int n;
+-
+-	pixman_region_init_rects(&region, box, 1);
+-	RegionIntersect(&region, &region, clip);
+-	n = region_num_rects(&region);
+-	box = region_rects(&region);
+-	while (n--)
+-		tor_blt_in(sna, op, NULL, box++, coverage);
+-	pixman_region_fini(&region);
+-}
+-
+-static void
+ tor_blt_add(struct sna *sna,
+ 	    struct sna_composite_spans_op *op,
+ 	    pixman_region16_t *clip,
+@@ -2310,25 +2355,6 @@ tor_blt_add(struct sna *sna,
+ }
+ 
+ static void
+-tor_blt_add_clipped(struct sna *sna,
+-		    struct sna_composite_spans_op *op,
+-		    pixman_region16_t *clip,
+-		    const BoxRec *box,
+-		    int coverage)
+-{
+-	pixman_region16_t region;
+-	int n;
+-
+-	pixman_region_init_rects(&region, box, 1);
+-	RegionIntersect(&region, &region, clip);
+-	n = region_num_rects(&region);
+-	box = region_rects(&region);
+-	while (n--)
+-		tor_blt_add(sna, op, NULL, box++, coverage);
+-	pixman_region_fini(&region);
+-}
+-
+-static void
+ tor_blt_lerp32(struct sna *sna,
+ 	       struct sna_composite_spans_op *op,
+ 	       pixman_region16_t *clip,
+@@ -2343,6 +2369,7 @@ tor_blt_lerp32(struct sna *sna,
+ 	if (coverage == 0)
+ 		return;
+ 
++	sigtrap_assert_active();
+ 	ptr += box->y1 * stride + box->x1;
+ 
+ 	h = box->y2 - box->y1;
+@@ -2383,25 +2410,6 @@ tor_blt_lerp32(struct sna *sna,
+ 	}
+ }
+ 
+-static void
+-tor_blt_lerp32_clipped(struct sna *sna,
+-		       struct sna_composite_spans_op *op,
+-		       pixman_region16_t *clip,
+-		       const BoxRec *box,
+-		       int coverage)
+-{
+-	pixman_region16_t region;
+-	int n;
+-
+-	pixman_region_init_rects(&region, box, 1);
+-	RegionIntersect(&region, &region, clip);
+-	n = region_num_rects(&region);
+-	box = region_rects(&region);
+-	while (n--)
+-		tor_blt_lerp32(sna, op, NULL, box++, coverage);
+-	pixman_region_fini(&region);
+-}
+-
+ struct pixman_inplace {
+ 	pixman_image_t *image, *source, *mask;
+ 	uint32_t color;
+@@ -2431,24 +2439,6 @@ pixmask_span_solid(struct sna *sna,
+ 			       pi->dx + box->x1, pi->dy + box->y1,
+ 			       box->x2 - box->x1, box->y2 - box->y1);
+ }
+-static void
+-pixmask_span_solid__clipped(struct sna *sna,
+-			    struct sna_composite_spans_op *op,
+-			    pixman_region16_t *clip,
+-			    const BoxRec *box,
+-			    int coverage)
+-{
+-	pixman_region16_t region;
+-	int n;
+-
+-	pixman_region_init_rects(&region, box, 1);
+-	RegionIntersect(&region, &region, clip);
+-	n = region_num_rects(&region);
+-	box = region_rects(&region);
+-	while (n--)
+-		pixmask_span_solid(sna, op, NULL, box++, coverage);
+-	pixman_region_fini(&region);
+-}
+ 
+ static void
+ pixmask_span(struct sna *sna,
+@@ -2471,24 +2461,6 @@ pixmask_span(struct sna *sna,
+ 			       pi->dx + box->x1, pi->dy + box->y1,
+ 			       box->x2 - box->x1, box->y2 - box->y1);
+ }
+-static void
+-pixmask_span__clipped(struct sna *sna,
+-		      struct sna_composite_spans_op *op,
+-		      pixman_region16_t *clip,
+-		      const BoxRec *box,
+-		      int coverage)
+-{
+-	pixman_region16_t region;
+-	int n;
+-
+-	pixman_region_init_rects(&region, box, 1);
+-	RegionIntersect(&region, &region, clip);
+-	n = region_num_rects(&region);
+-	box = region_rects(&region);
+-	while (n--)
+-		pixmask_span(sna, op, NULL, box++, coverage);
+-	pixman_region_fini(&region);
+-}
+ 
+ struct inplace_x8r8g8b8_thread {
+ 	xTrapezoid *traps;
+@@ -2507,6 +2479,7 @@ static void inplace_x8r8g8b8_thread(void *arg)
+ 	struct inplace_x8r8g8b8_thread *thread = arg;
+ 	struct tor tor;
+ 	span_func_t span;
++	struct clipped_span clipped;
+ 	RegionPtr clip;
+ 	int y1, y2, n;
+ 
+@@ -2537,12 +2510,11 @@ static void inplace_x8r8g8b8_thread(void *arg)
+ 		inplace.stride = pixmap->devKind;
+ 		inplace.color = thread->color;
+ 
+-		if (clip->data)
+-			span = tor_blt_lerp32_clipped;
+-		else
+-			span = tor_blt_lerp32;
++		span = clipped_span(&clipped, tor_blt_lerp32, clip);
+ 
+-		tor_render(NULL, &tor, (void*)&inplace, clip, span, false);
++		tor_render(NULL, &tor,
++			   (void*)&inplace, (void*)&clipped,
++			   span, false);
+ 	} else if (thread->is_solid) {
+ 		struct pixman_inplace pi;
+ 
+@@ -2555,12 +2527,11 @@ static void inplace_x8r8g8b8_thread(void *arg)
+ 						     1, 1, pi.bits, 0);
+ 		pixman_image_set_repeat(pi.source, PIXMAN_REPEAT_NORMAL);
+ 
+-		if (clip->data)
+-			span = pixmask_span_solid__clipped;
+-		else
+-			span = pixmask_span_solid;
++		span = clipped_span(&clipped, pixmask_span_solid, clip);
+ 
+-		tor_render(NULL, &tor, (void*)&pi, clip, span, false);
++		tor_render(NULL, &tor,
++			   (void*)&pi, (void *)&clipped,
++			   span, false);
+ 
+ 		pixman_image_unref(pi.source);
+ 		pixman_image_unref(pi.image);
+@@ -2579,12 +2550,11 @@ static void inplace_x8r8g8b8_thread(void *arg)
+ 		pi.bits = pixman_image_get_data(pi.mask);
+ 		pi.op = thread->op;
+ 
+-		if (clip->data)
+-			span = pixmask_span__clipped;
+-		else
+-			span = pixmask_span;
++		span = clipped_span(&clipped, pixmask_span, clip);
+ 
+-		tor_render(NULL, &tor, (void*)&pi, clip, span, false);
++		tor_render(NULL, &tor,
++			   (void*)&pi, (void *)&clipped,
++			   span, false);
+ 
+ 		pixman_image_unref(pi.mask);
+ 		pixman_image_unref(pi.source);
+@@ -2698,6 +2668,7 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
+ 	if (num_threads == 1) {
+ 		struct tor tor;
+ 		span_func_t span;
++		struct clipped_span clipped;
+ 
+ 		if (!tor_init(&tor, &region.extents, 2*ntrap))
+ 			return true;
+@@ -2723,17 +2694,15 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
+ 			inplace.stride = pixmap->devKind;
+ 			inplace.color = color;
+ 
+-			if (dst->pCompositeClip->data)
+-				span = tor_blt_lerp32_clipped;
+-			else
+-				span = tor_blt_lerp32;
++			span = clipped_span(&clipped, tor_blt_lerp32, dst->pCompositeClip);
+ 
+ 			DBG(("%s: render inplace op=%d, color=%08x\n",
+ 			     __FUNCTION__, op, color));
+ 
+ 			if (sigtrap_get() == 0) {
+-				tor_render(NULL, &tor, (void*)&inplace,
+-					   dst->pCompositeClip, span, false);
++				tor_render(NULL, &tor,
++					   (void*)&inplace, (void*)&clipped,
++					   span, false);
+ 				sigtrap_put();
+ 			}
+ 		} else if (is_solid) {
+@@ -2748,15 +2717,12 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
+ 							     1, 1, pi.bits, 0);
+ 			pixman_image_set_repeat(pi.source, PIXMAN_REPEAT_NORMAL);
+ 
+-			if (dst->pCompositeClip->data)
+-				span = pixmask_span_solid__clipped;
+-			else
+-				span = pixmask_span_solid;
++			span = clipped_span(&clipped, pixmask_span_solid, dst->pCompositeClip);
+ 
+ 			if (sigtrap_get() == 0) {
+-				tor_render(NULL, &tor, (void*)&pi,
+-					   dst->pCompositeClip, span,
+-					   false);
++				tor_render(NULL, &tor,
++					   (void*)&pi, (void*)&clipped,
++					   span, false);
+ 				sigtrap_put();
+ 			}
+ 
+@@ -2777,15 +2743,12 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
+ 			pi.bits = pixman_image_get_data(pi.mask);
+ 			pi.op = op;
+ 
+-			if (dst->pCompositeClip->data)
+-				span = pixmask_span__clipped;
+-			else
+-				span = pixmask_span;
++			span = clipped_span(&clipped, pixmask_span, dst->pCompositeClip);
+ 
+ 			if (sigtrap_get() == 0) {
+-				tor_render(NULL, &tor, (void*)&pi,
+-					   dst->pCompositeClip, span,
+-					   false);
++				tor_render(NULL, &tor,
++					   (void*)&pi, (void*)&clipped,
++					   span, false);
+ 				sigtrap_put();
+ 			}
+ 
+@@ -2847,9 +2810,9 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
+ 
+ struct inplace_thread {
+ 	xTrapezoid *traps;
+-	RegionPtr clip;
+ 	span_func_t span;
+ 	struct inplace inplace;
++	struct clipped_span clipped;
+ 	BoxRec extents;
+ 	int dx, dy;
+ 	int draw_x, draw_y;
+@@ -2874,8 +2837,9 @@ static void inplace_thread(void *arg)
+ 		tor_add_trapezoid(&tor, &thread->traps[n], thread->dx, thread->dy);
+ 	}
+ 
+-	tor_render(NULL, &tor, (void*)&thread->inplace,
+-		   thread->clip, thread->span, thread->unbounded);
++	tor_render(NULL, &tor,
++		   (void*)&thread->inplace, (void*)&thread->clipped,
++		   thread->span, thread->unbounded);
+ 
+ 	tor_fini(&tor);
+ }
+@@ -2889,6 +2853,7 @@ imprecise_trapezoid_span_inplace(struct sna *sna,
+ 				 bool fallback)
+ {
+ 	struct inplace inplace;
++	struct clipped_span clipped;
+ 	span_func_t span;
+ 	PixmapPtr pixmap;
+ 	struct sna_pixmap *priv;
+@@ -3005,21 +2970,12 @@ imprecise_trapezoid_span_inplace(struct sna *sna,
+ 	     region.extents.x2, region.extents.y2));
+ 
+ 	if (op == PictOpSrc) {
+-		if (dst->pCompositeClip->data)
+-			span = tor_blt_src_clipped;
+-		else
+-			span = tor_blt_src;
++		span = tor_blt_src;
+ 	} else if (op == PictOpIn) {
+-		if (dst->pCompositeClip->data)
+-			span = tor_blt_in_clipped;
+-		else
+-			span = tor_blt_in;
++		span = tor_blt_in;
+ 	} else {
+ 		assert(op == PictOpAdd);
+-		if (dst->pCompositeClip->data)
+-			span = tor_blt_add_clipped;
+-		else
+-			span = tor_blt_add;
++		span = tor_blt_add;
+ 	}
+ 
+ 	DBG(("%s: move-to-cpu\n", __FUNCTION__));
+@@ -3037,6 +2993,8 @@ imprecise_trapezoid_span_inplace(struct sna *sna,
+ 	inplace.stride = pixmap->devKind;
+ 	inplace.opacity = color >> 24;
+ 
++	span = clipped_span(&clipped, span, dst->pCompositeClip);
++
+ 	num_threads = 1;
+ 	if ((flags & COMPOSITE_SPANS_RECTILINEAR) == 0)
+ 		num_threads = sna_use_threads(region.extents.x2 - region.extents.x1,
+@@ -3057,8 +3015,9 @@ imprecise_trapezoid_span_inplace(struct sna *sna,
+ 		}
+ 
+ 		if (sigtrap_get() == 0) {
+-			tor_render(NULL, &tor, (void*)&inplace,
+-				   dst->pCompositeClip, span, unbounded);
++			tor_render(NULL, &tor,
++				   (void*)&inplace, (void *)&clipped,
++				   span, unbounded);
+ 			sigtrap_put();
+ 		}
+ 
+@@ -3075,8 +3034,8 @@ imprecise_trapezoid_span_inplace(struct sna *sna,
+ 		threads[0].traps = traps;
+ 		threads[0].ntrap = ntrap;
+ 		threads[0].inplace = inplace;
++		threads[0].clipped = clipped;
+ 		threads[0].extents = region.extents;
+-		threads[0].clip = dst->pCompositeClip;
+ 		threads[0].span = span;
+ 		threads[0].unbounded = unbounded;
+ 		threads[0].dx = dx;
+@@ -3707,8 +3666,7 @@ tristrip_thread(void *arg)
+ 	if (!tor_init(&tor, &thread->extents, 2*thread->count))
+ 		return;
+ 
+-	boxes.op = thread->op;
+-	boxes.num_boxes = 0;
++	span_thread_boxes_init(&boxes, thread->op, thread->clip);
+ 
+ 	cw = 0; ccw = 1;
+ 	polygon_add_line(tor.polygon,
+@@ -3874,7 +3832,7 @@ imprecise_tristrip_span_converter(struct sna *sna,
+ 				break;
+ 		} while (1);
+ 		polygon_add_line(tor.polygon,
+-				 &points[cw], &points[2+ccw],
++				 &points[cw], &points[ccw],
+ 				 dx, dy);
+ 		assert(tor.polygon->num_edges <= 2*count);
+ 
+diff --git a/src/sna/sna_trapezoids_mono.c b/src/sna/sna_trapezoids_mono.c
+index 808703a9..07a7867d 100644
+--- a/src/sna/sna_trapezoids_mono.c
++++ b/src/sna/sna_trapezoids_mono.c
+@@ -72,13 +72,14 @@ struct mono {
+ 	struct sna *sna;
+ 	struct sna_composite_op op;
+ 	pixman_region16_t clip;
++	const BoxRec *clip_start, *clip_end;
+ 
+ 	fastcall void (*span)(struct mono *, int, int, BoxPtr);
+ 
+ 	struct mono_polygon polygon;
+ };
+ 
+-#define I(x) pixman_fixed_to_int ((x) + pixman_fixed_1_minus_e/2)
++#define I(x) pixman_fixed_to_int((x) + pixman_fixed_1_minus_e/2)
+ 
+ static struct quorem
+ floored_muldivrem(int32_t x, int32_t a, int32_t b)
+@@ -249,22 +250,22 @@ mono_add_line(struct mono *mono,
+ 
+ 		e->dxdy = floored_muldivrem(dx, pixman_fixed_1, dy);
+ 
+-		e->x = floored_muldivrem((ytop - dst_y) * pixman_fixed_1 + pixman_fixed_1_minus_e/2 - p1->y,
++		e->x = floored_muldivrem((ytop - dst_y) * pixman_fixed_1 + pixman_fixed_1/2 - p1->y,
+ 					 dx, dy);
+ 		e->x.quo += p1->x;
+ 		e->x.rem -= dy;
+ 
+ 		e->dy = dy;
+-
+-		__DBG(("%s: initial x=%d [%d.%d/%d] + dxdy=%d.%d/%d\n",
+-		       __FUNCTION__,
+-		       I(e->x.quo), e->x.quo, e->x.rem, e->dy,
+-		       e->dxdy.quo, e->dxdy.rem, e->dy));
+ 	}
+ 	e->x.quo += dst_x*pixman_fixed_1;
++	__DBG(("%s: initial x=%d [%d.%d/%d] + dxdy=%d.%d/%d\n",
++	       __FUNCTION__,
++	       I(e->x.quo), e->x.quo, e->x.rem, e->dy,
++	       e->dxdy.quo, e->dxdy.rem, e->dy));
+ 
+ 	{
+ 		struct mono_edge **ptail = &polygon->y_buckets[ytop - mono->clip.extents.y1];
++		assert(ytop - mono->clip.extents.y1 < mono->clip.extents.y2 - mono->clip.extents.y1);
+ 		if (*ptail)
+ 			(*ptail)->prev = e;
+ 		e->next = *ptail;
+@@ -368,6 +369,10 @@ static struct mono_edge *mono_filter(struct mono_edge *edges)
+ 		    e->x.rem == n->x.rem &&
+ 		    e->dxdy.quo == n->dxdy.quo &&
+ 		    e->dxdy.rem == n->dxdy.rem) {
++			assert(e->dy == n->dy);
++			__DBG(("%s: discarding cancellation pair (%d.%d) + (%d.%d)\n",
++			       __FUNCTION__, e->x.quo, e->x.rem, e->dxdy.quo, e->dxdy.rem));
++
+ 			if (e->prev)
+ 				e->prev->next = n->next;
+ 			else
+@@ -378,8 +383,11 @@ static struct mono_edge *mono_filter(struct mono_edge *edges)
+ 				break;
+ 
+ 			e = n->next;
+-		} else
++		} else {
++			__DBG(("%s: adding edge (%d.%d) + (%d.%d)/%d, height=%d\n",
++			       __FUNCTION__, n->x.quo, n->x.rem, n->dxdy.quo, n->dxdy.rem, n->dy, n->height_left));
+ 			e = n;
++		}
+ 	}
+ 
+ 	return edges;
+@@ -474,6 +482,34 @@ mono_span__fast(struct mono *c, int x1, int x2, BoxPtr box)
+ 	c->op.box(c->sna, &c->op, box);
+ }
+ 
++fastcall static void
++mono_span__clipped(struct mono *c, int x1, int x2, BoxPtr box)
++{
++	const BoxRec *b;
++
++	__DBG(("%s [%d, %d]\n", __FUNCTION__, x1, x2));
++
++	c->clip_start =
++		find_clip_box_for_y(c->clip_start, c->clip_end, box->y1);
++
++	b = c->clip_start;
++	while (b != c->clip_end) {
++		BoxRec clipped;
++
++		if (box->y2 <= b->y1)
++			break;
++
++		clipped.x1 = x1;
++		clipped.x2 = x2;
++		clipped.y1 = box->y1;
++		clipped.y2 = box->y2;
++		if (!box_intersect(&clipped, b++))
++			continue;
++
++		c->op.box(c->sna, &c->op, &clipped);
++	}
++}
++
+ struct mono_span_thread_boxes {
+ 	const struct sna_composite_op *op;
+ #define MONO_SPAN_MAX_BOXES (8192/sizeof(BoxRec))
+@@ -482,40 +518,45 @@ struct mono_span_thread_boxes {
+ };
+ 
+ inline static void
+-thread_mono_span_add_boxes(struct mono *c, const BoxRec *box, int count)
++thread_mono_span_add_box(struct mono *c, const BoxRec *box)
+ {
+ 	struct mono_span_thread_boxes *b = c->op.priv;
+ 
+-	assert(count > 0 && count <= MONO_SPAN_MAX_BOXES);
+-	if (unlikely(b->num_boxes + count > MONO_SPAN_MAX_BOXES)) {
++	if (unlikely(b->num_boxes == MONO_SPAN_MAX_BOXES)) {
+ 		b->op->thread_boxes(c->sna, b->op, b->boxes, b->num_boxes);
+ 		b->num_boxes = 0;
+ 	}
+ 
+-	memcpy(b->boxes + b->num_boxes, box, count*sizeof(BoxRec));
+-	b->num_boxes += count;
++	b->boxes[b->num_boxes++] = *box;
+ 	assert(b->num_boxes <= MONO_SPAN_MAX_BOXES);
+ }
+ 
+ fastcall static void
+ thread_mono_span_clipped(struct mono *c, int x1, int x2, BoxPtr box)
+ {
+-	pixman_region16_t region;
++	const BoxRec *b;
+ 
+ 	__DBG(("%s [%d, %d]\n", __FUNCTION__, x1, x2));
+ 
+-	box->x1 = x1;
+-	box->x2 = x2;
++	c->clip_start =
++		find_clip_box_for_y(c->clip_start, c->clip_end, box->y1);
+ 
+-	assert(c->clip.data);
++	b = c->clip_start;
++	while (b != c->clip_end) {
++		BoxRec clipped;
++
++		if (box->y2 <= b->y1)
++			break;
++
++		clipped.x1 = x1;
++		clipped.x2 = x2;
++		clipped.y1 = box->y1;
++		clipped.y2 = box->y2;
++		if (!box_intersect(&clipped, b++))
++			continue;
+ 
+-	pixman_region_init_rects(&region, box, 1);
+-	RegionIntersect(&region, &region, &c->clip);
+-	if (region_num_rects(&region))
+-		thread_mono_span_add_boxes(c,
+-					   region_rects(&region),
+-					   region_num_rects(&region));
+-	pixman_region_fini(&region);
++		thread_mono_span_add_box(c, &clipped);
++	}
+ }
+ 
+ fastcall static void
+@@ -525,7 +566,7 @@ thread_mono_span(struct mono *c, int x1, int x2, BoxPtr box)
+ 
+ 	box->x1 = x1;
+ 	box->x2 = x2;
+-	thread_mono_span_add_boxes(c, box, 1);
++	thread_mono_span_add_box(c, box);
+ }
+ 
+ inline static void
+@@ -537,6 +578,8 @@ mono_row(struct mono *c, int16_t y, int16_t h)
+ 	int winding = 0;
+ 	BoxRec box;
+ 
++	__DBG(("%s: y=%d, h=%d\n", __FUNCTION__, y, h));
++
+ 	DBG_MONO_EDGES(edge);
+ 	VALIDATE_MONO_EDGES(&c->head);
+ 
+@@ -547,6 +590,8 @@ mono_row(struct mono *c, int16_t y, int16_t h)
+ 		struct mono_edge *next = edge->next;
+ 		int16_t xend = I(edge->x.quo);
+ 
++		__DBG(("%s: adding edge dir=%d [winding=%d], x=%d [%d]\n",
++		       __FUNCTION__, edge->dir, winding + edge->dir, xend, edge->x.quo));
+ 		if (--edge->height_left) {
+ 			if (edge->dy) {
+ 				edge->x.quo += edge->dxdy.quo;
+@@ -555,6 +600,8 @@ mono_row(struct mono *c, int16_t y, int16_t h)
+ 					++edge->x.quo;
+ 					edge->x.rem -= edge->dy;
+ 				}
++				__DBG(("%s: stepped edge (%d.%d) + (%d.%d)/%d, height=%d, prev_x=%d\n",
++				       __FUNCTION__, edge->x.quo, edge->x.rem, edge->dxdy.quo, edge->dxdy.rem, edge->dy, edge->height_left, edge->x.quo));
+ 			}
+ 
+ 			if (edge->x.quo < prev_x) {
+@@ -578,17 +625,22 @@ mono_row(struct mono *c, int16_t y, int16_t h)
+ 		winding += edge->dir;
+ 		if (winding == 0) {
+ 			assert(I(next->x.quo) >= xend);
+-			if (I(next->x.quo) > xend + 1) {
++			if (I(next->x.quo) > xend) {
++				__DBG(("%s: end span: %d\n", __FUNCTION__, xend));
+ 				if (xstart < c->clip.extents.x1)
+ 					xstart = c->clip.extents.x1;
+ 				if (xend > c->clip.extents.x2)
+ 					xend = c->clip.extents.x2;
+-				if (xend > xstart)
++				if (xend > xstart) {
++					__DBG(("%s: emit span [%d, %d]\n", __FUNCTION__, xstart, xend));
+ 					c->span(c, xstart, xend, &box);
++				}
+ 				xstart = INT16_MIN;
+ 			}
+-		} else if (xstart == INT16_MIN)
++		} else if (xstart == INT16_MIN) {
++			__DBG(("%s: starting new span: %d\n", __FUNCTION__, xend));
+ 			xstart = xend;
++		}
+ 
+ 		edge = next;
+ 	}
+@@ -650,9 +702,14 @@ mono_render(struct mono *mono)
+ 	for (i = 0; i < h; i = j) {
+ 		j = i + 1;
+ 
++		__DBG(("%s: row=%d, new edges? %d\n", __FUNCTION__,
++		       i, polygon->y_buckets[i] != NULL));
++
+ 		if (polygon->y_buckets[i])
+ 			mono_merge_edges(mono, polygon->y_buckets[i]);
+ 
++		__DBG(("%s: row=%d, vertical? %d\n", __FUNCTION__,
++		       i, mono->is_vertical));
+ 		if (mono->is_vertical) {
+ 			struct mono_edge *e = mono->head.next;
+ 			int min_height = h - i;
+@@ -667,6 +724,7 @@ mono_render(struct mono *mono)
+ 				j++;
+ 			if (j != i + 1)
+ 				mono_step_edges(mono, j - (i + 1));
++			__DBG(("%s: %d vertical rows\n", __FUNCTION__, j-i));
+ 		}
+ 
+ 		mono_row(mono, i, j-i);
+@@ -717,6 +775,7 @@ mono_span_thread(void *arg)
+ 		if (RegionNil(&mono.clip))
+ 			return;
+ 	}
++	region_get_boxes(&mono.clip, &mono.clip_start, &mono.clip_end);
+ 
+ 	boxes.op = thread->op;
+ 	boxes.num_boxes = 0;
+@@ -891,9 +950,12 @@ mono_trapezoids_span_converter(struct sna *sna,
+ 
+ 	if (mono.clip.data == NULL && mono.op.damage == NULL)
+ 		mono.span = mono_span__fast;
++	else if (mono.clip.data != NULL && mono.op.damage == NULL)
++		mono.span = mono_span__clipped;
+ 	else
+ 		mono.span = mono_span;
+ 
++	region_get_boxes(&mono.clip, &mono.clip_start, &mono.clip_end);
+ 	mono_render(&mono);
+ 	mono.op.done(mono.sna, &mono.op);
+ 	mono_fini(&mono);
+@@ -939,6 +1001,7 @@ mono_trapezoids_span_converter(struct sna *sna,
+ 					       mono.clip.extents.x2 - mono.clip.extents.x1,
+ 					       mono.clip.extents.y2 - mono.clip.extents.y1,
+ 					       COMPOSITE_PARTIAL, memset(&mono.op, 0, sizeof(mono.op)))) {
++			region_get_boxes(&mono.clip, &mono.clip_start, &mono.clip_end);
+ 			mono_render(&mono);
+ 			mono.op.done(mono.sna, &mono.op);
+ 		}
+@@ -974,6 +1037,7 @@ mono_inplace_fill_box(struct sna *sna,
+ 	     box->x2 - box->x1,
+ 	     box->y2 - box->y1,
+ 	     fill->color));
++	sigtrap_assert_active();
+ 	pixman_fill(fill->data, fill->stride, fill->bpp,
+ 		    box->x1, box->y1,
+ 		    box->x2 - box->x1,
+@@ -995,6 +1059,7 @@ mono_inplace_fill_boxes(struct sna *sna,
+ 		     box->x2 - box->x1,
+ 		     box->y2 - box->y1,
+ 		     fill->color));
++		sigtrap_assert_active();
+ 		pixman_fill(fill->data, fill->stride, fill->bpp,
+ 			    box->x1, box->y1,
+ 			    box->x2 - box->x1,
+@@ -1382,10 +1447,13 @@ mono_triangles_span_converter(struct sna *sna,
+ 		mono_render(&mono);
+ 		mono.op.done(mono.sna, &mono.op);
+ 	}
++	mono_fini(&mono);
+ 
+ 	if (!was_clear && !operator_is_bounded(op)) {
+ 		xPointFixed p1, p2;
+ 
++		DBG(("%s: performing unbounded clear\n", __FUNCTION__));
++
+ 		if (!mono_init(&mono, 2+3*count))
+ 			return false;
+ 
+@@ -1431,7 +1499,6 @@ mono_triangles_span_converter(struct sna *sna,
+ 		mono_fini(&mono);
+ 	}
+ 
+-	mono_fini(&mono);
+ 	REGION_UNINIT(NULL, &mono.clip);
+ 	return true;
+ }
+diff --git a/src/sna/sna_trapezoids_precise.c b/src/sna/sna_trapezoids_precise.c
+index 9187ab48..242b4acb 100644
+--- a/src/sna/sna_trapezoids_precise.c
++++ b/src/sna/sna_trapezoids_precise.c
+@@ -1023,6 +1023,16 @@ tor_init(struct tor *converter, const BoxRec *box, int num_edges)
+ static void
+ tor_add_trapezoid(struct tor *tor, const xTrapezoid *t, int dx, int dy)
+ {
++	if (!xTrapezoidValid(t)) {
++		__DBG(("%s: skipping invalid trapezoid: top=%d, bottom=%d, left=(%d, %d), (%d, %d), right=(%d, %d), (%d, %d)\n",
++		       __FUNCTION__,
++		       t->top, t->bottom,
++		       t->left.p1.x, t->left.p1.y,
++		       t->left.p2.x, t->left.p2.y,
++		       t->right.p1.x, t->right.p1.y,
++		       t->right.p2.x, t->right.p2.y));
++		return;
++	}
+ 	polygon_add_edge(tor->polygon, t, &t->left, 1, dx, dy);
+ 	polygon_add_edge(tor->polygon, t, &t->right, -1, dx, dy);
+ }
+@@ -1635,31 +1645,27 @@ struct span_thread {
+ #define SPAN_THREAD_MAX_BOXES (8192/sizeof(struct sna_opacity_box))
+ struct span_thread_boxes {
+ 	const struct sna_composite_spans_op *op;
++	const BoxRec *clip_start, *clip_end;
+ 	int num_boxes;
+ 	struct sna_opacity_box boxes[SPAN_THREAD_MAX_BOXES];
+ };
+ 
+-static void span_thread_add_boxes(struct sna *sna, void *data,
+-				  const BoxRec *box, int count, float alpha)
++static void span_thread_add_box(struct sna *sna, void *data,
++				const BoxRec *box, float alpha)
+ {
+ 	struct span_thread_boxes *b = data;
+ 
+-	__DBG(("%s: adding %d boxes with alpha=%f\n",
+-	       __FUNCTION__, count, alpha));
++	__DBG(("%s: adding box with alpha=%f\n", __FUNCTION__, alpha));
+ 
+-	assert(count > 0 && count <= SPAN_THREAD_MAX_BOXES);
+-	if (unlikely(b->num_boxes + count > SPAN_THREAD_MAX_BOXES)) {
+-		DBG(("%s: flushing %d boxes, adding %d\n", __FUNCTION__, b->num_boxes, count));
+-		assert(b->num_boxes <= SPAN_THREAD_MAX_BOXES);
++	if (unlikely(b->num_boxes == SPAN_THREAD_MAX_BOXES)) {
++		DBG(("%s: flushing %d boxes\n", __FUNCTION__, b->num_boxes));
+ 		b->op->thread_boxes(sna, b->op, b->boxes, b->num_boxes);
+ 		b->num_boxes = 0;
+ 	}
+ 
+-	do {
+-		b->boxes[b->num_boxes].box = *box++;
+-		b->boxes[b->num_boxes].alpha = alpha;
+-		b->num_boxes++;
+-	} while (--count);
++	b->boxes[b->num_boxes].box = *box++;
++	b->boxes[b->num_boxes].alpha = alpha;
++	b->num_boxes++;
+ 	assert(b->num_boxes <= SPAN_THREAD_MAX_BOXES);
+ }
+ 
+@@ -1670,8 +1676,22 @@ span_thread_box(struct sna *sna,
+ 		const BoxRec *box,
+ 		int coverage)
+ {
++	struct span_thread_boxes *b = (struct span_thread_boxes *)op;
++
+ 	__DBG(("%s: %d -> %d @ %d\n", __FUNCTION__, box->x1, box->x2, coverage));
+-	span_thread_add_boxes(sna, op, box, 1, AREA_TO_FLOAT(coverage));
++	if (b->num_boxes) {
++		struct sna_opacity_box *bb = &b->boxes[b->num_boxes-1];
++		if (bb->box.x1 == box->x1 &&
++		    bb->box.x2 == box->x2 &&
++		    bb->box.y2 == box->y1 &&
++		    bb->alpha == AREA_TO_FLOAT(coverage)) {
++			bb->box.y2 = box->y2;
++			__DBG(("%s: contracted double row: %d -> %d\n", __func__, bb->box.y1, bb->box.y2));
++			return;
++		}
++	}
++
++	span_thread_add_box(sna, op, box, AREA_TO_FLOAT(coverage));
+ }
+ 
+ static void
+@@ -1681,20 +1701,28 @@ span_thread_clipped_box(struct sna *sna,
+ 			const BoxRec *box,
+ 			int coverage)
+ {
+-	pixman_region16_t region;
++	struct span_thread_boxes *b = (struct span_thread_boxes *)op;
++	const BoxRec *c;
+ 
+ 	__DBG(("%s: %d -> %d @ %f\n", __FUNCTION__, box->x1, box->x2,
+ 	       AREA_TO_FLOAT(coverage)));
+ 
+-	pixman_region_init_rects(&region, box, 1);
+-	RegionIntersect(&region, &region, clip);
+-	if (region_num_rects(&region)) {
+-		span_thread_add_boxes(sna, op,
+-				      region_rects(&region),
+-				      region_num_rects(&region),
+-				      AREA_TO_FLOAT(coverage));
++	b->clip_start =
++		find_clip_box_for_y(b->clip_start, b->clip_end, box->y1);
++
++	c = b->clip_start;
++	while (c != b->clip_end) {
++		BoxRec clipped;
++
++		if (box->y2 <= c->y1)
++			break;
++
++		clipped = *box;
++		if (!box_intersect(&clipped, c++))
++			continue;
++
++		span_thread_add_box(sna, op, &clipped, AREA_TO_FLOAT(coverage));
+ 	}
+-	pixman_region_fini(&region);
+ }
+ 
+ static span_func_t
+@@ -1712,7 +1740,7 @@ thread_choose_span(struct sna_composite_spans_op *tmp,
+ 
+ 	assert(!is_mono(dst, maskFormat));
+ 	assert(tmp->thread_boxes);
+-	DBG(("%s: clipped? %d\n", __FUNCTION__, clip->data != NULL));
++	DBG(("%s: clipped? %d x %d\n", __FUNCTION__, clip->data != NULL, region_num_rects(clip)));
+ 	if (clip->data)
+ 		span = span_thread_clipped_box;
+ 	else
+@@ -1721,6 +1749,17 @@ thread_choose_span(struct sna_composite_spans_op *tmp,
+ 	return span;
+ }
+ 
++inline static void
++span_thread_boxes_init(struct span_thread_boxes *boxes,
++		       const struct sna_composite_spans_op *op,
++		       const RegionRec *clip)
++{
++	boxes->op = op;
++	boxes->clip_start = region_rects(clip);
++	boxes->clip_end = boxes->clip_start + region_num_rects(clip);
++	boxes->num_boxes = 0;
++}
++
+ static void
+ span_thread(void *arg)
+ {
+@@ -1733,8 +1772,7 @@ span_thread(void *arg)
+ 	if (!tor_init(&tor, &thread->extents, 2*thread->ntrap))
+ 		return;
+ 
+-	boxes.op = thread->op;
+-	boxes.num_boxes = 0;
++	span_thread_boxes_init(&boxes, thread->op, thread->clip);
+ 
+ 	y1 = thread->extents.y1 - thread->draw_y;
+ 	y2 = thread->extents.y2 - thread->draw_y;
+@@ -2183,6 +2221,52 @@ static force_inline uint8_t coverage_opacity(int coverage, uint8_t opacity)
+ 	return opacity == 255 ? coverage : mul_8_8(coverage, opacity);
+ }
+ 
++struct clipped_span {
++	span_func_t span;
++	const BoxRec *clip_start, *clip_end;
++};
++
++static void
++tor_blt_clipped(struct sna *sna,
++		struct sna_composite_spans_op *op,
++		pixman_region16_t *clip,
++		const BoxRec *box,
++		int coverage)
++{
++	struct clipped_span *cs = (struct clipped_span *)clip;
++	const BoxRec *c;
++
++	cs->clip_start =
++		find_clip_box_for_y(cs->clip_start, cs->clip_end, box->y1);
++
++	c = cs->clip_start;
++	while (c != cs->clip_end) {
++		BoxRec clipped;
++
++		if (box->y2 <= c->y1)
++			break;
++
++		clipped = *box;
++		if (!box_intersect(&clipped, c++))
++			continue;
++
++		cs->span(sna, op, NULL, &clipped, coverage);
++	}
++}
++
++inline static span_func_t
++clipped_span(struct clipped_span *cs,
++	     span_func_t span,
++	     const RegionRec *clip)
++{
++	if (clip->data) {
++		cs->span = span;
++		region_get_boxes(clip, &cs->clip_start, &cs->clip_end);
++		span = tor_blt_clipped;
++	}
++	return span;
++}
++
+ static void _tor_blt_src(struct inplace *in, const BoxRec *box, uint8_t v)
+ {
+ 	uint8_t *ptr = in->ptr;
+@@ -2218,25 +2302,6 @@ tor_blt_src(struct sna *sna,
+ }
+ 
+ static void
+-tor_blt_src_clipped(struct sna *sna,
+-		    struct sna_composite_spans_op *op,
+-		    pixman_region16_t *clip,
+-		    const BoxRec *box,
+-		    int coverage)
+-{
+-	pixman_region16_t region;
+-	int n;
+-
+-	pixman_region_init_rects(&region, box, 1);
+-	RegionIntersect(&region, &region, clip);
+-	n = region_num_rects(&region);
+-	box = region_rects(&region);
+-	while (n--)
+-		tor_blt_src(sna, op, NULL, box++, coverage);
+-	pixman_region_fini(&region);
+-}
+-
+-static void
+ tor_blt_in(struct sna *sna,
+ 	   struct sna_composite_spans_op *op,
+ 	   pixman_region16_t *clip,
+@@ -2268,25 +2333,6 @@ tor_blt_in(struct sna *sna,
+ }
+ 
+ static void
+-tor_blt_in_clipped(struct sna *sna,
+-		   struct sna_composite_spans_op *op,
+-		   pixman_region16_t *clip,
+-		   const BoxRec *box,
+-		   int coverage)
+-{
+-	pixman_region16_t region;
+-	int n;
+-
+-	pixman_region_init_rects(&region, box, 1);
+-	RegionIntersect(&region, &region, clip);
+-	n = region_num_rects(&region);
+-	box = region_rects(&region);
+-	while (n--)
+-		tor_blt_in(sna, op, NULL, box++, coverage);
+-	pixman_region_fini(&region);
+-}
+-
+-static void
+ tor_blt_add(struct sna *sna,
+ 	    struct sna_composite_spans_op *op,
+ 	    pixman_region16_t *clip,
+@@ -2325,25 +2371,6 @@ tor_blt_add(struct sna *sna,
+ }
+ 
+ static void
+-tor_blt_add_clipped(struct sna *sna,
+-		    struct sna_composite_spans_op *op,
+-		    pixman_region16_t *clip,
+-		    const BoxRec *box,
+-		    int coverage)
+-{
+-	pixman_region16_t region;
+-	int n;
+-
+-	pixman_region_init_rects(&region, box, 1);
+-	RegionIntersect(&region, &region, clip);
+-	n = region_num_rects(&region);
+-	box = region_rects(&region);
+-	while (n--)
+-		tor_blt_add(sna, op, NULL, box++, coverage);
+-	pixman_region_fini(&region);
+-}
+-
+-static void
+ tor_blt_lerp32(struct sna *sna,
+ 	       struct sna_composite_spans_op *op,
+ 	       pixman_region16_t *clip,
+@@ -2358,6 +2385,7 @@ tor_blt_lerp32(struct sna *sna,
+ 	if (coverage == 0)
+ 		return;
+ 
++	sigtrap_assert_active();
+ 	ptr += box->y1 * stride + box->x1;
+ 
+ 	h = box->y2 - box->y1;
+@@ -2396,25 +2424,6 @@ tor_blt_lerp32(struct sna *sna,
+ 	}
+ }
+ 
+-static void
+-tor_blt_lerp32_clipped(struct sna *sna,
+-		       struct sna_composite_spans_op *op,
+-		       pixman_region16_t *clip,
+-		       const BoxRec *box,
+-		       int coverage)
+-{
+-	pixman_region16_t region;
+-	int n;
+-
+-	pixman_region_init_rects(&region, box, 1);
+-	RegionIntersect(&region, &region, clip);
+-	n = region_num_rects(&region);
+-	box = region_rects(&region);
+-	while (n--)
+-		tor_blt_lerp32(sna, op, NULL, box++, coverage);
+-	pixman_region_fini(&region);
+-}
+-
+ struct pixman_inplace {
+ 	pixman_image_t *image, *source, *mask;
+ 	uint32_t color;
+@@ -2442,24 +2451,6 @@ pixmask_span_solid(struct sna *sna,
+ 			       pi->dx + box->x1, pi->dy + box->y1,
+ 			       box->x2 - box->x1, box->y2 - box->y1);
+ }
+-static void
+-pixmask_span_solid__clipped(struct sna *sna,
+-			    struct sna_composite_spans_op *op,
+-			    pixman_region16_t *clip,
+-			    const BoxRec *box,
+-			    int coverage)
+-{
+-	pixman_region16_t region;
+-	int n;
+-
+-	pixman_region_init_rects(&region, box, 1);
+-	RegionIntersect(&region, &region, clip);
+-	n = region_num_rects(&region);
+-	box = region_rects(&region);
+-	while (n--)
+-		pixmask_span_solid(sna, op, NULL, box++, coverage);
+-	pixman_region_fini(&region);
+-}
+ 
+ static void
+ pixmask_span(struct sna *sna,
+@@ -2480,24 +2471,6 @@ pixmask_span(struct sna *sna,
+ 			       pi->dx + box->x1, pi->dy + box->y1,
+ 			       box->x2 - box->x1, box->y2 - box->y1);
+ }
+-static void
+-pixmask_span__clipped(struct sna *sna,
+-		      struct sna_composite_spans_op *op,
+-		      pixman_region16_t *clip,
+-		      const BoxRec *box,
+-		      int coverage)
+-{
+-	pixman_region16_t region;
+-	int n;
+-
+-	pixman_region_init_rects(&region, box, 1);
+-	RegionIntersect(&region, &region, clip);
+-	n = region_num_rects(&region);
+-	box = region_rects(&region);
+-	while (n--)
+-		pixmask_span(sna, op, NULL, box++, coverage);
+-	pixman_region_fini(&region);
+-}
+ 
+ struct inplace_x8r8g8b8_thread {
+ 	xTrapezoid *traps;
+@@ -2516,6 +2489,7 @@ static void inplace_x8r8g8b8_thread(void *arg)
+ 	struct inplace_x8r8g8b8_thread *thread = arg;
+ 	struct tor tor;
+ 	span_func_t span;
++	struct clipped_span clipped;
+ 	RegionPtr clip;
+ 	int y1, y2, n;
+ 
+@@ -2546,12 +2520,11 @@ static void inplace_x8r8g8b8_thread(void *arg)
+ 		inplace.stride = pixmap->devKind;
+ 		inplace.color = thread->color;
+ 
+-		if (clip->data)
+-			span = tor_blt_lerp32_clipped;
+-		else
+-			span = tor_blt_lerp32;
++		span = clipped_span(&clipped, tor_blt_lerp32, clip);
+ 
+-		tor_render(NULL, &tor, (void*)&inplace, clip, span, false);
++		tor_render(NULL, &tor,
++			   (void*)&inplace, (void *)&clipped,
++			   span, false);
+ 	} else if (thread->is_solid) {
+ 		struct pixman_inplace pi;
+ 
+@@ -2564,10 +2537,7 @@ static void inplace_x8r8g8b8_thread(void *arg)
+ 						     1, 1, pi.bits, 0);
+ 		pixman_image_set_repeat(pi.source, PIXMAN_REPEAT_NORMAL);
+ 
+-		if (clip->data)
+-			span = pixmask_span_solid__clipped;
+-		else
+-			span = pixmask_span_solid;
++		span = clipped_span(&clipped, pixmask_span_solid, clip);
+ 
+ 		tor_render(NULL, &tor, (void*)&pi, clip, span, false);
+ 
+@@ -2588,12 +2558,11 @@ static void inplace_x8r8g8b8_thread(void *arg)
+ 		pi.bits = pixman_image_get_data(pi.mask);
+ 		pi.op = thread->op;
+ 
+-		if (clip->data)
+-			span = pixmask_span__clipped;
+-		else
+-			span = pixmask_span;
++		span = clipped_span(&clipped, pixmask_span, clip);
+ 
+-		tor_render(NULL, &tor, (void*)&pi, clip, span, false);
++		tor_render(NULL, &tor,
++			   (void*)&pi, (void *)&clipped,
++			   span, false);
+ 
+ 		pixman_image_unref(pi.mask);
+ 		pixman_image_unref(pi.source);
+@@ -2712,6 +2681,7 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
+ 	if (num_threads == 1) {
+ 		struct tor tor;
+ 		span_func_t span;
++		struct clipped_span clipped;
+ 
+ 		if (!tor_init(&tor, &region.extents, 2*ntrap))
+ 			return true;
+@@ -2737,17 +2707,14 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
+ 			inplace.stride = pixmap->devKind;
+ 			inplace.color = color;
+ 
+-			if (dst->pCompositeClip->data)
+-				span = tor_blt_lerp32_clipped;
+-			else
+-				span = tor_blt_lerp32;
+-
++			span = clipped_span(&clipped, tor_blt_lerp32, dst->pCompositeClip);
+ 			DBG(("%s: render inplace op=%d, color=%08x\n",
+ 			     __FUNCTION__, op, color));
+ 
+ 			if (sigtrap_get() == 0) {
+-				tor_render(NULL, &tor, (void*)&inplace,
+-					   dst->pCompositeClip, span, false);
++				tor_render(NULL, &tor,
++					   (void*)&inplace, (void*)&clipped,
++					   span, false);
+ 				sigtrap_put();
+ 			}
+ 		} else if (is_solid) {
+@@ -2762,15 +2729,11 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
+ 							     1, 1, pi.bits, 0);
+ 			pixman_image_set_repeat(pi.source, PIXMAN_REPEAT_NORMAL);
+ 
+-			if (dst->pCompositeClip->data)
+-				span = pixmask_span_solid__clipped;
+-			else
+-				span = pixmask_span_solid;
+-
++			span = clipped_span(&clipped, pixmask_span_solid, dst->pCompositeClip);
+ 			if (sigtrap_get() == 0) {
+-				tor_render(NULL, &tor, (void*)&pi,
+-					   dst->pCompositeClip, span,
+-					   false);
++				tor_render(NULL, &tor,
++					   (void*)&pi, (void*)&clipped,
++					    span, false);
+ 				sigtrap_put();
+ 			}
+ 
+@@ -2791,15 +2754,11 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
+ 			pi.bits = pixman_image_get_data(pi.mask);
+ 			pi.op = op;
+ 
+-			if (dst->pCompositeClip->data)
+-				span = pixmask_span__clipped;
+-			else
+-				span = pixmask_span;
+-
++			span = clipped_span(&clipped, pixmask_span, dst->pCompositeClip);
+ 			if (sigtrap_get() == 0) {
+-				tor_render(NULL, &tor, (void*)&pi,
+-					   dst->pCompositeClip, span,
+-					   false);
++				tor_render(NULL, &tor,
++					   (void*)&pi, (void *)&clipped,
++					   span, false);
+ 				sigtrap_put();
+ 			}
+ 
+@@ -2861,9 +2820,9 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
+ 
+ struct inplace_thread {
+ 	xTrapezoid *traps;
+-	RegionPtr clip;
+ 	span_func_t span;
+ 	struct inplace inplace;
++	struct clipped_span clipped;
+ 	BoxRec extents;
+ 	int dx, dy;
+ 	int draw_x, draw_y;
+@@ -2888,8 +2847,9 @@ static void inplace_thread(void *arg)
+ 		tor_add_trapezoid(&tor, &thread->traps[n], thread->dx, thread->dy);
+ 	}
+ 
+-	tor_render(NULL, &tor, (void*)&thread->inplace,
+-		   thread->clip, thread->span, thread->unbounded);
++	tor_render(NULL, &tor, 
++		   (void*)&thread->inplace, (void*)&thread->clipped,
++		   thread->span, thread->unbounded);
+ 
+ 	tor_fini(&tor);
+ }
+@@ -2903,6 +2863,7 @@ precise_trapezoid_span_inplace(struct sna *sna,
+ 			       bool fallback)
+ {
+ 	struct inplace inplace;
++	struct clipped_span clipped;
+ 	span_func_t span;
+ 	PixmapPtr pixmap;
+ 	struct sna_pixmap *priv;
+@@ -3020,21 +2981,12 @@ precise_trapezoid_span_inplace(struct sna *sna,
+ 	     dst->pCompositeClip->data != NULL));
+ 
+ 	if (op == PictOpSrc) {
+-		if (dst->pCompositeClip->data)
+-			span = tor_blt_src_clipped;
+-		else
+-			span = tor_blt_src;
++		span = tor_blt_src;
+ 	} else if (op == PictOpIn) {
+-		if (dst->pCompositeClip->data)
+-			span = tor_blt_in_clipped;
+-		else
+-			span = tor_blt_in;
++		span = tor_blt_in;
+ 	} else {
+ 		assert(op == PictOpAdd);
+-		if (dst->pCompositeClip->data)
+-			span = tor_blt_add_clipped;
+-		else
+-			span = tor_blt_add;
++		span = tor_blt_add;
+ 	}
+ 
+ 	DBG(("%s: move-to-cpu(dst)\n", __FUNCTION__));
+@@ -3052,6 +3004,8 @@ precise_trapezoid_span_inplace(struct sna *sna,
+ 	inplace.stride = pixmap->devKind;
+ 	inplace.opacity = color >> 24;
+ 
++	span = clipped_span(&clipped, span, dst->pCompositeClip);
++
+ 	num_threads = 1;
+ 	if (!NO_GPU_THREADS &&
+ 	    (flags & COMPOSITE_SPANS_RECTILINEAR) == 0)
+@@ -3074,8 +3028,9 @@ precise_trapezoid_span_inplace(struct sna *sna,
+ 		}
+ 
+ 		if (sigtrap_get() == 0) {
+-			tor_render(NULL, &tor, (void*)&inplace,
+-				   dst->pCompositeClip, span, unbounded);
++			tor_render(NULL, &tor,
++				   (void*)&inplace, (void *)&clipped,
++				   span, unbounded);
+ 			sigtrap_put();
+ 		}
+ 
+@@ -3093,7 +3048,7 @@ precise_trapezoid_span_inplace(struct sna *sna,
+ 		threads[0].ntrap = ntrap;
+ 		threads[0].inplace = inplace;
+ 		threads[0].extents = region.extents;
+-		threads[0].clip = dst->pCompositeClip;
++		threads[0].clipped = clipped;
+ 		threads[0].span = span;
+ 		threads[0].unbounded = unbounded;
+ 		threads[0].dx = dx;
+@@ -3316,8 +3271,7 @@ tristrip_thread(void *arg)
+ 	if (!tor_init(&tor, &thread->extents, 2*thread->count))
+ 		return;
+ 
+-	boxes.op = thread->op;
+-	boxes.num_boxes = 0;
++	span_thread_boxes_init(&boxes, thread->op, thread->clip);
+ 
+ 	cw = 0; ccw = 1;
+ 	polygon_add_line(tor.polygon,
+diff --git a/src/sna/sna_video.c b/src/sna/sna_video.c
+index ed0e7b31..e2b11c31 100644
+--- a/src/sna/sna_video.c
++++ b/src/sna/sna_video.c
+@@ -591,6 +591,72 @@ use_gtt: /* copy data, must use GTT so that we keep the overlay uncached */
+ 	return true;
+ }
+ 
++void sna_video_fill_colorkey(struct sna_video *video,
++			     const RegionRec *clip)
++{
++	struct sna *sna = video->sna;
++	PixmapPtr front = sna->front;
++	struct kgem_bo *bo = __sna_pixmap_get_bo(front);
++	uint8_t *dst, *tmp;
++	int w, width;
++
++	if (video->AlwaysOnTop || RegionEqual(&video->clip, (RegionPtr)clip))
++		return;
++
++	assert(bo);
++	if (!wedged(sna) &&
++	    sna_blt_fill_boxes(sna, GXcopy, bo,
++			       front->drawable.bitsPerPixel,
++			       video->color_key,
++			       region_rects(clip),
++			       region_num_rects(clip))) {
++		RegionCopy(&video->clip, (RegionPtr)clip);
++		return;
++	}
++
++	dst = kgem_bo_map__gtt(&sna->kgem, bo);
++	if (dst == NULL)
++		return;
++
++	w = front->drawable.bitsPerPixel/8;
++	width = (clip->extents.x2 - clip->extents.x1) * w;
++	tmp = malloc(width);
++	if (tmp == NULL)
++		return;
++
++	memcpy(tmp, &video->color_key, w);
++	while (2 * w < width) {
++		memcpy(tmp + w, tmp, w);
++		w *= 2;
++	}
++	if (w < width)
++		memcpy(tmp + w, tmp, width - w);
++
++	if (sigtrap_get() == 0) {
++		const BoxRec *box = region_rects(clip);
++		int n = region_num_rects(clip);
++
++		w = front->drawable.bitsPerPixel/8;
++		do {
++			int y = box->y1;
++			uint8_t *row = dst + y*bo->pitch + w*box->x1;
++
++			width = (box->x2 - box->x1) * w;
++			while (y < box->y2) {
++				memcpy(row, tmp, width);
++				row += bo->pitch;
++				y++;
++			}
++			box++;
++		} while (--n);
++		sigtrap_put();
++
++		RegionCopy(&video->clip, (RegionPtr)clip);
++	}
++
++	free(tmp);
++}
++
+ XvAdaptorPtr sna_xv_adaptor_alloc(struct sna *sna)
+ {
+ 	XvAdaptorPtr new_adaptors;
+diff --git a/src/sna/sna_video.h b/src/sna/sna_video.h
+index f21605fc..39cb725f 100644
+--- a/src/sna/sna_video.h
++++ b/src/sna/sna_video.h
+@@ -72,6 +72,8 @@ THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ struct sna_video {
+ 	struct sna *sna;
+ 
++	int idx; /* XXX expose struct plane instead? */
++
+ 	int brightness;
+ 	int contrast;
+ 	int saturation;
+@@ -193,6 +195,9 @@ bool
+ sna_video_copy_data(struct sna_video *video,
+ 		    struct sna_video_frame *frame,
+ 		    const uint8_t *buf);
++void
++sna_video_fill_colorkey(struct sna_video *video,
++			const RegionRec *clip);
+ 
+ void sna_video_buffer_fini(struct sna_video *video);
+ 
+@@ -210,4 +215,26 @@ sna_window_set_port(WindowPtr window, XvPortPtr port)
+ 	((void **)__get_private(window, sna_window_key))[2] = port;
+ }
+ 
++static inline int offset_and_clip(int x, int dx)
++{
++	x += dx;
++	if (x <= 0)
++		return 0;
++	if (x >= MAXSHORT)
++		return MAXSHORT;
++	return x;
++}
++
++static inline void init_video_region(RegionRec *region,
++				     DrawablePtr draw,
++				     int drw_x, int drw_y,
++				     int drw_w, int drw_h)
++{
++	region->extents.x1 = offset_and_clip(draw->x, drw_x);
++	region->extents.y1 = offset_and_clip(draw->y, drw_y);
++	region->extents.x2 = offset_and_clip(draw->x, drw_x + drw_w);
++	region->extents.y2 = offset_and_clip(draw->y, drw_y + drw_h);
++	region->data = NULL;
++}
++
+ #endif /* SNA_VIDEO_H */
+diff --git a/src/sna/sna_video_overlay.c b/src/sna/sna_video_overlay.c
+index ac81f1a0..9bc5ce40 100644
+--- a/src/sna/sna_video_overlay.c
++++ b/src/sna/sna_video_overlay.c
+@@ -130,7 +130,7 @@ static int sna_video_overlay_stop(ddStopVideo_ARGS)
+ 
+ 	DBG(("%s()\n", __FUNCTION__));
+ 
+-	REGION_EMPTY(scrn->pScreen, &video->clip);
++	REGION_EMPTY(to_screen_from_sna(sna), &video->clip);
+ 
+ 	request.flags = 0;
+ 	(void)drmIoctl(sna->kgem.fd,
+@@ -474,15 +474,13 @@ sna_video_overlay_put_image(ddPutImage_ARGS)
+ 	if (src_h >= (drw_h * 8))
+ 		drw_h = src_h / 7;
+ 
+-	clip.extents.x1 = draw->x + drw_x;
+-	clip.extents.y1 = draw->y + drw_y;
+-	clip.extents.x2 = clip.extents.x1 + drw_w;
+-	clip.extents.y2 = clip.extents.y1 + drw_h;
+-	clip.data = NULL;
++	init_video_region(&clip, draw, drw_x, drw_y, drw_w, drw_h);
+ 
+ 	DBG(("%s: always_on_top=%d\n", __FUNCTION__, video->AlwaysOnTop));
+-	if (!video->AlwaysOnTop)
++	if (!video->AlwaysOnTop) {
++		ValidateGC(draw, gc);
+ 		RegionIntersect(&clip, &clip, gc->pCompositeClip);
++	}
+ 	if (box_empty(&clip.extents))
+ 		goto invisible;
+ 
+@@ -551,15 +549,7 @@ sna_video_overlay_put_image(ddPutImage_ARGS)
+ 	ret = Success;
+ 	if (sna_video_overlay_show
+ 	    (sna, video, &frame, crtc, &dstBox, src_w, src_h, drw_w, drw_h)) {
+-		//xf86XVFillKeyHelperDrawable(draw, video->color_key, &clip);
+-		if (!video->AlwaysOnTop && !RegionEqual(&video->clip, &clip) &&
+-		    sna_blt_fill_boxes(sna, GXcopy,
+-				       __sna_pixmap_get_bo(sna->front),
+-				       sna->front->drawable.bitsPerPixel,
+-				       video->color_key,
+-				       region_rects(&clip),
+-				       region_num_rects(&clip)))
+-			RegionCopy(&video->clip, &clip);
++		sna_video_fill_colorkey(video, &clip);
+ 		sna_window_set_port((WindowPtr)draw, port);
+ 	} else {
+ 		DBG(("%s: failed to show video frame\n", __FUNCTION__));
+diff --git a/src/sna/sna_video_sprite.c b/src/sna/sna_video_sprite.c
+index 92230f97..69bfdfd2 100644
+--- a/src/sna/sna_video_sprite.c
++++ b/src/sna/sna_video_sprite.c
+@@ -47,6 +47,8 @@
+ #define DRM_FORMAT_YUYV         fourcc_code('Y', 'U', 'Y', 'V') /* [31:0] Cr0:Y1:Cb0:Y0 8:8:8:8 little endian */
+ #define DRM_FORMAT_UYVY         fourcc_code('U', 'Y', 'V', 'Y') /* [31:0] Y1:Cr0:Y0:Cb0 8:8:8:8 little endian */
+ 
++#define has_hw_scaling(sna) ((sna)->kgem.gen < 071)
++
+ #define LOCAL_IOCTL_MODE_SETPLANE	DRM_IOWR(0xB7, struct local_mode_set_plane)
+ struct local_mode_set_plane {
+ 	uint32_t plane_id;
+@@ -81,19 +83,17 @@ static int sna_video_sprite_stop(ddStopVideo_ARGS)
+ 	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(video->sna->scrn);
+ 	int i;
+ 
+-	for (i = 0; i < config->num_crtc; i++) {
++	for (i = 0; i < video->sna->mode.num_real_crtc; i++) {
+ 		xf86CrtcPtr crtc = config->crtc[i];
+ 		int pipe;
+ 
+-		if (sna_crtc_id(crtc) == 0)
+-			break;
+-
+-		pipe = sna_crtc_to_pipe(crtc);
++		pipe = sna_crtc_pipe(crtc);
++		assert(pipe < ARRAY_SIZE(video->bo));
+ 		if (video->bo[pipe] == NULL)
+ 			continue;
+ 
+ 		memset(&s, 0, sizeof(s));
+-		s.plane_id = sna_crtc_to_sprite(crtc);
++		s.plane_id = sna_crtc_to_sprite(crtc, video->idx);
+ 		if (drmIoctl(video->sna->kgem.fd, LOCAL_IOCTL_MODE_SETPLANE, &s))
+ 			xf86DrvMsg(video->sna->scrn->scrnIndex, X_ERROR,
+ 				   "failed to disable plane\n");
+@@ -153,7 +153,7 @@ static int sna_video_sprite_best_size(ddQueryBestSize_ARGS)
+ 	struct sna_video *video = port->devPriv.ptr;
+ 	struct sna *sna = video->sna;
+ 
+-	if (sna->kgem.gen >= 075) {
++	if (!has_hw_scaling(sna) && !sna->render.video) {
+ 		*p_w = vid_w;
+ 		*p_h = vid_h;
+ 	} else {
+@@ -221,12 +221,12 @@ sna_video_sprite_show(struct sna *sna,
+ 		      BoxPtr dstBox)
+ {
+ 	struct local_mode_set_plane s;
+-	int pipe = sna_crtc_to_pipe(crtc);
++	int pipe = sna_crtc_pipe(crtc);
+ 
+ 	/* XXX handle video spanning multiple CRTC */
+ 
+ 	VG_CLEAR(s);
+-	s.plane_id = sna_crtc_to_sprite(crtc);
++	s.plane_id = sna_crtc_to_sprite(crtc, video->idx);
+ 
+ #define DRM_I915_SET_SPRITE_COLORKEY 0x2b
+ #define LOCAL_IOCTL_I915_SET_SPRITE_COLORKEY DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_SET_SPRITE_COLORKEY, struct local_intel_sprite_colorkey)
+@@ -263,9 +263,6 @@ sna_video_sprite_show(struct sna *sna,
+ 		video->color_key_changed &= ~(1 << pipe);
+ 	}
+ 
+-	if (video->bo[pipe] == frame->bo)
+-		return true;
+-
+ 	update_dst_box_to_crtc_coords(sna, crtc, dstBox);
+ 	if (frame->rotation & (RR_Rotate_90 | RR_Rotate_270)) {
+ 		int tmp = frame->width;
+@@ -283,15 +280,30 @@ sna_video_sprite_show(struct sna *sna,
+ 			uint32_t handles[4];
+ 			uint32_t pitches[4]; /* pitch for each plane */
+ 			uint32_t offsets[4]; /* offset of each plane */
++			uint64_t modifiers[4];
+ 		} f;
+ 		bool purged = true;
+ 
+ 		memset(&f, 0, sizeof(f));
+ 		f.width = frame->width;
+ 		f.height = frame->height;
++		f.flags = 1 << 1; /* +modifiers */
+ 		f.handles[0] = frame->bo->handle;
+ 		f.pitches[0] = frame->pitch[0];
+ 
++		switch (frame->bo->tiling) {
++		case I915_TILING_NONE:
++			break;
++		case I915_TILING_X:
++			/* I915_FORMAT_MOD_X_TILED */
++			f.modifiers[0] = (uint64_t)1 << 56 | 1;
++			break;
++		case I915_TILING_Y:
++			/* I915_FORMAT_MOD_Y_TILED */
++			f.modifiers[0] = (uint64_t)1 << 56 | 2;
++			break;
++		}
++
+ 		switch (frame->id) {
+ 		case FOURCC_RGB565:
+ 			f.pixel_format = DRM_FORMAT_RGB565;
+@@ -360,7 +372,7 @@ sna_video_sprite_show(struct sna *sna,
+ 		return false;
+ 	}
+ 
+-	frame->bo->domain = DOMAIN_NONE;
++	__kgem_bo_clear_dirty(frame->bo);
+ 
+ 	if (video->bo[pipe])
+ 		kgem_bo_destroy(&sna->kgem, video->bo[pipe]);
+@@ -374,17 +386,17 @@ static int sna_video_sprite_put_image(ddPutImage_ARGS)
+ 	struct sna *sna = video->sna;
+ 	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
+ 	RegionRec clip;
++	BoxRec draw_extents;
+ 	int ret, i;
+ 
+-	clip.extents.x1 = draw->x + drw_x;
+-	clip.extents.y1 = draw->y + drw_y;
+-	clip.extents.x2 = clip.extents.x1 + drw_w;
+-	clip.extents.y2 = clip.extents.y1 + drw_h;
+-	clip.data = NULL;
++	init_video_region(&clip, draw, drw_x, drw_y, drw_w, drw_h);
++	draw_extents = clip.extents;
+ 
+ 	DBG(("%s: always_on_top=%d\n", __FUNCTION__, video->AlwaysOnTop));
+-	if (!video->AlwaysOnTop)
++	if (!video->AlwaysOnTop) {
++		ValidateGC(draw, gc);
+ 		RegionIntersect(&clip, &clip, gc->pCompositeClip);
++	}
+ 
+ 	DBG(("%s: src=(%d, %d),(%d, %d), dst=(%d, %d),(%d, %d), id=%d, sizep=%dx%d, sync?=%d\n",
+ 	     __FUNCTION__,
+@@ -402,19 +414,17 @@ static int sna_video_sprite_put_image(ddPutImage_ARGS)
+ 		goto err;
+ 	}
+ 
+-	for (i = 0; i < config->num_crtc; i++) {
++	for (i = 0; i < video->sna->mode.num_real_crtc; i++) {
+ 		xf86CrtcPtr crtc = config->crtc[i];
+ 		struct sna_video_frame frame;
++		BoxRec dst = draw_extents;
+ 		int pipe;
+ 		INT32 x1, x2, y1, y2;
+-		BoxRec dst;
+ 		RegionRec reg;
+ 		Rotation rotation;
++		bool cache_bo;
+ 
+-		if (sna_crtc_id(crtc) == 0)
+-			break;
+-
+-		pipe = sna_crtc_to_pipe(crtc);
++		pipe = sna_crtc_pipe(crtc);
+ 
+ 		sna_video_frame_init(video, format->id, width, height, &frame);
+ 
+@@ -423,10 +433,11 @@ static int sna_video_sprite_put_image(ddPutImage_ARGS)
+ 		RegionIntersect(&reg, &reg, &clip);
+ 		if (RegionNil(&reg)) {
+ off:
++			assert(pipe < ARRAY_SIZE(video->bo));
+ 			if (video->bo[pipe]) {
+ 				struct local_mode_set_plane s;
+ 				memset(&s, 0, sizeof(s));
+-				s.plane_id = sna_crtc_to_sprite(crtc);
++				s.plane_id = sna_crtc_to_sprite(crtc, video->idx);
+ 				if (drmIoctl(video->sna->kgem.fd, LOCAL_IOCTL_MODE_SETPLANE, &s))
+ 					xf86DrvMsg(video->sna->scrn->scrnIndex, X_ERROR,
+ 						   "failed to disable plane\n");
+@@ -440,8 +451,6 @@ off:
+ 		y1 = src_y;
+ 		y2 = src_y + src_h;
+ 
+-		dst = clip.extents;
+-
+ 		ret = xf86XVClipVideoHelper(&dst, &x1, &x2, &y1, &y2,
+ 					    &reg, frame.width, frame.height);
+ 		RegionUninit(&reg);
+@@ -465,8 +474,8 @@ off:
+ 
+ 		/* if sprite can't handle rotation natively, store it for the copy func */
+ 		rotation = RR_Rotate_0;
+-		if (!sna_crtc_set_sprite_rotation(crtc, crtc->rotation)) {
+-			sna_crtc_set_sprite_rotation(crtc, RR_Rotate_0);
++		if (!sna_crtc_set_sprite_rotation(crtc, video->idx, crtc->rotation)) {
++			sna_crtc_set_sprite_rotation(crtc, video->idx, RR_Rotate_0);
+ 			rotation = crtc->rotation;
+ 		}
+ 		sna_video_frame_set_rotation(video, &frame, rotation);
+@@ -496,6 +505,8 @@ off:
+ 			frame.image.y1 = 0;
+ 			frame.image.x2 = frame.width;
+ 			frame.image.y2 = frame.height;
++
++			cache_bo = false;
+ 		} else {
+ 			frame.bo = sna_video_buffer(video, &frame);
+ 			if (frame.bo == NULL) {
+@@ -509,6 +520,60 @@ off:
+ 				ret = BadAlloc;
+ 				goto err;
+ 			}
++
++			cache_bo = true;
++		}
++
++		if (!has_hw_scaling(sna) && sna->render.video &&
++		    !((frame.src.x2 - frame.src.x1) == (dst.x2 - dst.x1) &&
++		      (frame.src.y2 - frame.src.y1) == (dst.y2 - dst.y1))) {
++			ScreenPtr screen = to_screen_from_sna(sna);
++			PixmapPtr scaled;
++			RegionRec r;
++
++			r.extents.x1 = r.extents.y1 = 0;
++			r.extents.x2 = dst.x2 - dst.x1;
++			r.extents.y2 = dst.y2 - dst.y1;
++			r.data = NULL;
++
++			DBG(("%s: scaling from (%d, %d) to (%d, %d)\n",
++			     __FUNCTION__,
++			     frame.src.x2 - frame.src.x1,
++			     frame.src.y2 - frame.src.y1,
++			     r.extents.x2, r.extents.y2));
++
++			scaled = screen->CreatePixmap(screen,
++						      r.extents.x2,
++						      r.extents.y2,
++						      24,
++						      CREATE_PIXMAP_USAGE_SCRATCH);
++			if (scaled == NULL) {
++				ret = BadAlloc;
++				goto err;
++			}
++
++			if (!sna->render.video(sna, video, &frame, &r, scaled)) {
++				screen->DestroyPixmap(scaled);
++				ret = BadAlloc;
++				goto err;
++			}
++
++			if (cache_bo)
++				sna_video_buffer_fini(video);
++			else
++				kgem_bo_destroy(&sna->kgem, frame.bo);
++
++			frame.bo = kgem_bo_reference(__sna_pixmap_get_bo(scaled));
++			kgem_bo_submit(&sna->kgem, frame.bo);
++
++			frame.id = FOURCC_RGB888;
++			frame.src = frame.image = r.extents;
++			frame.width = frame.image.x2;
++			frame.height = frame.image.y2;
++			frame.pitch[0] = frame.bo->pitch;
++
++			screen->DestroyPixmap(scaled);
++			cache_bo = false;
+ 		}
+ 
+ 		ret = Success;
+@@ -517,24 +582,16 @@ off:
+ 			ret = BadAlloc;
+ 		}
+ 
+-		frame.bo->domain = DOMAIN_NONE;
+-		if (xvmc_passthrough(format->id))
+-			kgem_bo_destroy(&sna->kgem, frame.bo);
+-		else
++		if (cache_bo)
+ 			sna_video_buffer_fini(video);
++		else
++			kgem_bo_destroy(&sna->kgem, frame.bo);
+ 
+ 		if (ret != Success)
+ 			goto err;
+ 	}
+ 
+-	if (!video->AlwaysOnTop && !RegionEqual(&video->clip, &clip) &&
+-	    sna_blt_fill_boxes(sna, GXcopy,
+-			       __sna_pixmap_get_bo(sna->front),
+-			       sna->front->drawable.bitsPerPixel,
+-			       video->color_key,
+-			       region_rects(&clip),
+-			       region_num_rects(&clip)))
+-		RegionCopy(&video->clip, &clip);
++	sna_video_fill_colorkey(video, &clip);
+ 	sna_window_set_port((WindowPtr)draw, port);
+ 
+ 	return Success;
+@@ -606,25 +663,28 @@ static int sna_video_sprite_color_key(struct sna *sna)
+ 	return color_key & ((1 << scrn->depth) - 1);
+ }
+ 
+-static bool sna_video_has_sprites(struct sna *sna)
++static int sna_video_has_sprites(struct sna *sna)
+ {
+ 	xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
++	unsigned min;
+ 	int i;
+ 
+ 	DBG(("%s: num_crtc=%d\n", __FUNCTION__, sna->mode.num_real_crtc));
+ 
+ 	if (sna->mode.num_real_crtc == 0)
+-		return false;
++		return 0;
+ 
++	min = -1;
+ 	for (i = 0; i < sna->mode.num_real_crtc; i++) {
+-		if (!sna_crtc_to_sprite(config->crtc[i])) {
+-			DBG(("%s: no sprite found on pipe %d\n", __FUNCTION__, sna_crtc_to_pipe(config->crtc[i])));
+-			return false;
+-		}
++		unsigned count =  sna_crtc_count_sprites(config->crtc[i]);
++		DBG(("%s: %d sprites found on pipe %d\n", __FUNCTION__,
++		     count, sna_crtc_pipe(config->crtc[i])));
++		if (count < min)
++			min = count;
+ 	}
+ 
+-	DBG(("%s: yes\n", __FUNCTION__));
+-	return true;
++	DBG(("%s: min=%d\n", __FUNCTION__, min));
++	return min;
+ }
+ 
+ void sna_video_sprite_setup(struct sna *sna, ScreenPtr screen)
+@@ -632,16 +692,18 @@ void sna_video_sprite_setup(struct sna *sna, ScreenPtr screen)
+ 	XvAdaptorPtr adaptor;
+ 	struct sna_video *video;
+ 	XvPortPtr port;
++	int count, i;
+ 
+-	if (!sna_video_has_sprites(sna))
++	count = sna_video_has_sprites(sna);
++	if (!count)
+ 		return;
+ 
+ 	adaptor = sna_xv_adaptor_alloc(sna);
+ 	if (!adaptor)
+ 		return;
+ 
+-	video = calloc(1, sizeof(*video));
+-	port = calloc(1, sizeof(*port));
++	video = calloc(count, sizeof(*video));
++	port = calloc(count, sizeof(*port));
+ 	if (video == NULL || port == NULL) {
+ 		free(video);
+ 		free(port);
+@@ -686,36 +748,43 @@ void sna_video_sprite_setup(struct sna *sna, ScreenPtr screen)
+ 	adaptor->ddPutImage = sna_video_sprite_put_image;
+ 	adaptor->ddQueryImageAttributes = sna_video_sprite_query;
+ 
+-	adaptor->nPorts = 1;
++	adaptor->nPorts = count;
+ 	adaptor->pPorts = port;
+ 
+-	adaptor->base_id = port->id = FakeClientID(0);
+-	AddResource(port->id, XvGetRTPort(), port);
+-	port->pAdaptor = adaptor;
+-	port->pNotify =  NULL;
+-	port->pDraw =  NULL;
+-	port->client =  NULL;
+-	port->grab.client =  NULL;
+-	port->time = currentTime;
+-	port->devPriv.ptr = video;
+-
+-	video->sna = sna;
+-	video->alignment = 64;
+-	video->color_key = sna_video_sprite_color_key(sna);
+-	video->color_key_changed = ~0;
+-	video->has_color_key = true;
+-	video->brightness = -19;	/* (255/219) * -16 */
+-	video->contrast = 75;	/* 255/219 * 64 */
+-	video->saturation = 146;	/* 128/112 * 128 */
+-	video->desired_crtc = NULL;
+-	video->gamma5 = 0xc0c0c0;
+-	video->gamma4 = 0x808080;
+-	video->gamma3 = 0x404040;
+-	video->gamma2 = 0x202020;
+-	video->gamma1 = 0x101010;
+-	video->gamma0 = 0x080808;
+-	RegionNull(&video->clip);
+-	video->SyncToVblank = 1;
++	for (i = 0; i < count; i++) {
++		port->id = FakeClientID(0);
++		AddResource(port->id, XvGetRTPort(), port);
++		port->pAdaptor = adaptor;
++		port->pNotify =  NULL;
++		port->pDraw =  NULL;
++		port->client =  NULL;
++		port->grab.client =  NULL;
++		port->time = currentTime;
++		port->devPriv.ptr = video;
++
++		video->sna = sna;
++		video->idx = i;
++		video->alignment = 64;
++		video->color_key = sna_video_sprite_color_key(sna);
++		video->color_key_changed = ~0;
++		video->has_color_key = true;
++		video->brightness = -19;	/* (255/219) * -16 */
++		video->contrast = 75;	/* 255/219 * 64 */
++		video->saturation = 146;	/* 128/112 * 128 */
++		video->desired_crtc = NULL;
++		video->gamma5 = 0xc0c0c0;
++		video->gamma4 = 0x808080;
++		video->gamma3 = 0x404040;
++		video->gamma2 = 0x202020;
++		video->gamma1 = 0x101010;
++		video->gamma0 = 0x080808;
++		RegionNull(&video->clip);
++		video->SyncToVblank = 1;
++
++		port++;
++		video++;
++	}
++	adaptor->base_id = adaptor->pPorts[0].id;
+ 
+ 	xvColorKey = MAKE_ATOM("XV_COLORKEY");
+ 	xvAlwaysOnTop = MAKE_ATOM("XV_ALWAYS_ON_TOP");
+diff --git a/src/sna/sna_video_textured.c b/src/sna/sna_video_textured.c
+index 95011939..3cce5cf1 100644
+--- a/src/sna/sna_video_textured.c
++++ b/src/sna/sna_video_textured.c
+@@ -48,7 +48,12 @@ static const XvAttributeRec Attributes[] = {
+ 	//{XvSettable | XvGettable, 0, 255, (char *)"XV_CONTRAST"},
+ };
+ 
+-static const XvImageRec Images[] = {
++static const XvImageRec gen2_Images[] = {
++	XVIMAGE_YUY2,
++	XVIMAGE_UYVY,
++};
++
++static const XvImageRec gen3_Images[] = {
+ 	XVIMAGE_YUY2,
+ 	XVIMAGE_YV12,
+ 	XVIMAGE_I420,
+@@ -149,15 +154,16 @@ sna_video_textured_put_image(ddPutImage_ARGS)
+ 	BoxRec dstBox;
+ 	RegionRec clip;
+ 	xf86CrtcPtr crtc;
++	int16_t dx, dy;
+ 	bool flush = false;
+ 	bool ret;
+ 
+-	clip.extents.x1 = draw->x + drw_x;
+-	clip.extents.y1 = draw->y + drw_y;
+-	clip.extents.x2 = clip.extents.x1 + drw_w;
+-	clip.extents.y2 = clip.extents.y1 + drw_h;
+-	clip.data = NULL;
++	if (wedged(sna))
++		return BadAlloc;
+ 
++	init_video_region(&clip, draw, drw_x, drw_y, drw_w, drw_h);
++
++	ValidateGC(draw, gc);
+ 	RegionIntersect(&clip, &clip, gc->pCompositeClip);
+ 	if (!RegionNotEmpty(&clip))
+ 		return Success;
+@@ -181,6 +187,9 @@ sna_video_textured_put_image(ddPutImage_ARGS)
+ 				   &clip))
+ 		return Success;
+ 
++	if (get_drawable_deltas(draw, pixmap, &dx, &dy))
++		RegionTranslate(&clip, dx, dy);
++
+ 	flags = MOVE_WRITE | __MOVE_FORCE;
+ 	if (clip.data)
+ 		flags |= MOVE_READ;
+@@ -234,7 +243,7 @@ sna_video_textured_put_image(ddPutImage_ARGS)
+ 		DBG(("%s: failed to render video\n", __FUNCTION__));
+ 		ret = BadAlloc;
+ 	} else
+-		DamageDamageRegion(draw, &clip);
++		DamageDamageRegion(&pixmap->drawable, &clip);
+ 
+ 	kgem_bo_destroy(&sna->kgem, frame.bo);
+ 
+@@ -316,7 +325,7 @@ void sna_video_textured_setup(struct sna *sna, ScreenPtr screen)
+ 
+ 	if (!sna->render.video) {
+ 		xf86DrvMsg(sna->scrn->scrnIndex, X_INFO,
+-			   "Textured video not supported on this hardware\n");
++			   "Textured video not supported on this hardware or backend\n");
+ 		return;
+ 	}
+ 
+@@ -362,8 +371,13 @@ void sna_video_textured_setup(struct sna *sna, ScreenPtr screen)
+ 						 ARRAY_SIZE(Formats));
+ 	adaptor->nAttributes = ARRAY_SIZE(Attributes);
+ 	adaptor->pAttributes = (XvAttributeRec *)Attributes;
+-	adaptor->nImages = ARRAY_SIZE(Images);
+-	adaptor->pImages = (XvImageRec *)Images;
++	if (sna->kgem.gen < 030) {
++		adaptor->nImages = ARRAY_SIZE(gen2_Images);
++		adaptor->pImages = (XvImageRec *)gen2_Images;
++	} else {
++		adaptor->nImages = ARRAY_SIZE(gen3_Images);
++		adaptor->pImages = (XvImageRec *)gen3_Images;
++	}
+ #if XORG_XV_VERSION < 2
+ 	adaptor->ddAllocatePort = sna_xv_alloc_port;
+ 	adaptor->ddFreePort = sna_xv_free_port;
+diff --git a/src/sna/xassert.h b/src/sna/xassert.h
+index 1bcfd080..e648e4bc 100644
+--- a/src/sna/xassert.h
++++ b/src/sna/xassert.h
+@@ -43,6 +43,28 @@
+ 	xorg_backtrace(); \
+ 	FatalError("%s:%d assertion '%s' failed\n", __func__, __LINE__, #E); \
+ } while (0)
++
++#define warn_unless(E) \
++({ \
++	bool fail = !(E); \
++	if (unlikely(fail)) { \
++		static int __warn_once__; \
++		if (!__warn_once__) { \
++			xorg_backtrace(); \
++			ErrorF("%s:%d assertion '%s' failed\n", __func__, __LINE__, #E); \
++			__warn_once__ = 1; \
++		} \
++	} \
++	unlikely(fail); \
++})
++
++#define dbg(EXPR) EXPR
++
++#else
++
++#define warn_unless(E) ({ bool fail = !(E); unlikely(fail); })
++#define dbg(EXPR)
++
+ #endif
+ 
+ #endif /* __XASSERT_H__ */
+diff --git a/src/uxa/i830_reg.h b/src/uxa/i830_reg.h
+index d8306bcd..ba39d82c 100644
+--- a/src/uxa/i830_reg.h
++++ b/src/uxa/i830_reg.h
+@@ -65,6 +65,12 @@
+ #define MI_LOAD_SCAN_LINES_DISPLAY_PIPEA	(0)
+ #define MI_LOAD_SCAN_LINES_DISPLAY_PIPEB	(0x1<<20)
+ 
++#define MI_LOAD_REGISTER_IMM		(0x22<<23 | (3-2))
++
++#define BCS_SWCTRL                      0x22200
++# define BCS_SWCTRL_SRC_Y               (1 << 0)
++# define BCS_SWCTRL_DST_Y               (1 << 1)
++
+ /* BLT commands */
+ #define COLOR_BLT_CMD		((2<<29)|(0x40<<22)|(0x3))
+ #define COLOR_BLT_WRITE_ALPHA	(1<<21)
+diff --git a/src/uxa/i965_video.c b/src/uxa/i965_video.c
+index 68e6fd38..438ab909 100644
+--- a/src/uxa/i965_video.c
++++ b/src/uxa/i965_video.c
+@@ -37,7 +37,6 @@
+ #include "fourcc.h"
+ 
+ #include "intel.h"
+-#include "intel_xvmc.h"
+ #include "intel_uxa.h"
+ #include "i830_reg.h"
+ #include "i965_reg.h"
+diff --git a/src/uxa/intel.h b/src/uxa/intel.h
+index 1b7e5339..a5e77af4 100644
+--- a/src/uxa/intel.h
++++ b/src/uxa/intel.h
+@@ -121,7 +121,6 @@ typedef struct intel_screen_private {
+ 
+ 	void *modes;
+ 	drm_intel_bo *front_buffer, *back_buffer;
+-	unsigned int back_name;
+ 	long front_pitch, front_tiling;
+ 
+ 	dri_bufmgr *bufmgr;
+@@ -169,6 +168,7 @@ typedef struct intel_screen_private {
+ 	const struct intel_device_info *info;
+ 
+ 	unsigned int BR[20];
++	unsigned int BR_tiling[2];
+ 
+ 	CloseScreenProcPtr CloseScreen;
+ 
+@@ -196,7 +196,9 @@ typedef struct intel_screen_private {
+ 
+ 	int colorKey;
+ 	XF86VideoAdaptorPtr adaptor;
++#if !HAVE_NOTIFY_FD
+ 	ScreenBlockHandlerProcPtr BlockHandler;
++#endif
+ 	Bool overlayOn;
+ 
+ 	struct {
+@@ -285,8 +287,6 @@ typedef struct intel_screen_private {
+ 	Bool has_kernel_flush;
+ 	Bool needs_flush;
+ 
+-	struct _DRI2FrameEvent *pending_flip[MAX_PIPES];
+-
+ 	/* Broken-out options. */
+ 	OptionInfoPtr Options;
+ 
+@@ -368,6 +368,7 @@ typedef void (*intel_drm_abort_proc)(ScrnInfoPtr scrn,
+ 
+ extern uint32_t intel_drm_queue_alloc(ScrnInfoPtr scrn, xf86CrtcPtr crtc, void *data, intel_drm_handler_proc handler, intel_drm_abort_proc abort);
+ extern void intel_drm_abort(ScrnInfoPtr scrn, Bool (*match)(void *data, void *match_data), void *match_data);
++extern void intel_drm_abort_seq(ScrnInfoPtr scrn, uint32_t seq);
+ 
+ extern int intel_get_pipe_from_crtc_id(drm_intel_bufmgr *bufmgr, xf86CrtcPtr crtc);
+ extern int intel_crtc_id(xf86CrtcPtr crtc);
+@@ -408,7 +409,6 @@ typedef struct _DRI2FrameEvent {
+ 	ClientPtr client;
+ 	enum DRI2FrameEventType type;
+ 	int frame;
+-	int pipe;
+ 
+ 	struct list drawable_resource, client_resource;
+ 
+@@ -418,7 +418,12 @@ typedef struct _DRI2FrameEvent {
+ 	DRI2BufferPtr front;
+ 	DRI2BufferPtr back;
+ 
+-	struct _DRI2FrameEvent *chain;
++	/* current scanout for triple buffer */
++	int old_width;
++	int old_height;
++	int old_pitch;
++	int old_tiling;
++	dri_bo *old_buffer;
+ } DRI2FrameEventRec, *DRI2FrameEventPtr;
+ 
+ extern Bool intel_do_pageflip(intel_screen_private *intel,
+@@ -456,10 +461,6 @@ extern xf86CrtcPtr intel_covering_crtc(ScrnInfoPtr scrn, BoxPtr box,
+ 
+ Bool I830DRI2ScreenInit(ScreenPtr pScreen);
+ void I830DRI2CloseScreen(ScreenPtr pScreen);
+-void I830DRI2FrameEventHandler(unsigned int frame, unsigned int tv_sec,
+-			       unsigned int tv_usec, DRI2FrameEventPtr flip_info);
+-void I830DRI2FlipEventHandler(unsigned int frame, unsigned int tv_sec,
+-			      unsigned int tv_usec, DRI2FrameEventPtr flip_info);
+ 
+ /* intel_dri3.c */
+ Bool intel_dri3_screen_init(ScreenPtr screen);
+diff --git a/src/uxa/intel_batchbuffer.c b/src/uxa/intel_batchbuffer.c
+index a29e4434..114c6026 100644
+--- a/src/uxa/intel_batchbuffer.c
++++ b/src/uxa/intel_batchbuffer.c
+@@ -245,6 +245,17 @@ void intel_batch_submit(ScrnInfoPtr scrn)
+ 	if (intel->batch_used == 0)
+ 		return;
+ 
++	if (intel->current_batch == I915_EXEC_BLT &&
++	    INTEL_INFO(intel)->gen >= 060) {
++		OUT_BATCH(MI_FLUSH_DW);
++		OUT_BATCH(0);
++		OUT_BATCH(0);
++		OUT_BATCH(0);
++		OUT_BATCH(MI_LOAD_REGISTER_IMM);
++		OUT_BATCH(BCS_SWCTRL);
++		OUT_BATCH((BCS_SWCTRL_DST_Y | BCS_SWCTRL_SRC_Y) << 16);
++	}
++
+ 	/* Mark the end of the batchbuffer. */
+ 	OUT_BATCH(MI_BATCH_BUFFER_END);
+ 	/* Emit a padding dword if we aren't going to be quad-word aligned. */
+diff --git a/src/uxa/intel_batchbuffer.h b/src/uxa/intel_batchbuffer.h
+index e5fb8d08..e71ffd19 100644
+--- a/src/uxa/intel_batchbuffer.h
++++ b/src/uxa/intel_batchbuffer.h
+@@ -30,7 +30,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #ifndef _INTEL_BATCHBUFFER_H
+ #define _INTEL_BATCHBUFFER_H
+ 
+-#define BATCH_RESERVED		16
++#define BATCH_RESERVED		64
+ 
+ 
+ void intel_batch_init(ScrnInfoPtr scrn);
+@@ -202,6 +202,23 @@ do {									\
+ 
+ #define BEGIN_BATCH(n)	__BEGIN_BATCH(n,RENDER_BATCH)
+ #define BEGIN_BATCH_BLT(n)	__BEGIN_BATCH(n,BLT_BATCH)
++#define BEGIN_BATCH_BLT_TILED(n) \
++do { \
++	if (INTEL_INFO(intel)->gen < 060) { \
++		__BEGIN_BATCH(n, BLT_BATCH); \
++	} else { \
++		__BEGIN_BATCH(n+7, BLT_BATCH); \
++		OUT_BATCH(MI_FLUSH_DW); \
++		OUT_BATCH(0); \
++		OUT_BATCH(0); \
++		OUT_BATCH(0); \
++		OUT_BATCH(MI_LOAD_REGISTER_IMM); \
++		OUT_BATCH(BCS_SWCTRL); \
++		OUT_BATCH((BCS_SWCTRL_DST_Y | BCS_SWCTRL_SRC_Y) << 16 | \
++			  ((intel->BR_tiling[0] == I915_TILING_Y) ? BCS_SWCTRL_DST_Y : 0) | \
++			  ((intel->BR_tiling[1] == I915_TILING_Y) ? BCS_SWCTRL_SRC_Y : 0)); \
++	} \
++} while (0)
+ 
+ #define ADVANCE_BATCH() do {						\
+ 	if (intel->batch_emitting == 0)					\
+diff --git a/src/uxa/intel_display.c b/src/uxa/intel_display.c
+index 7b4d4e0c..809cda1d 100644
+--- a/src/uxa/intel_display.c
++++ b/src/uxa/intel_display.c
+@@ -89,11 +89,11 @@ struct intel_mode {
+ 	struct list outputs;
+ 	struct list crtcs;
+ 
+-	void *pageflip_data;
+-	intel_pageflip_handler_proc pageflip_handler;
+-	intel_pageflip_abort_proc pageflip_abort;
+-
+-	Bool delete_dp_12_displays;
++	struct {
++		intel_pageflip_handler_proc handler;
++		intel_pageflip_abort_proc abort;
++		void *data;
++	} pageflip;
+ };
+ 
+ struct intel_pageflip {
+@@ -114,7 +114,6 @@ struct intel_crtc {
+ 	struct list link;
+ 	PixmapPtr scanout_pixmap;
+ 	uint32_t scanout_fb_id;
+-	int32_t vblank_offset;
+ 	uint32_t msc_prev;
+ 	uint64_t msc_high;
+ };
+@@ -193,7 +192,7 @@ intel_output_backlight_init(xf86OutputPtr output)
+ 
+ 	str = xf86GetOptValString(intel->Options, OPTION_BACKLIGHT);
+ 	if (str != NULL) {
+-		if (backlight_exists(str) != BL_NONE) {
++		if (backlight_exists(str)) {
+ 			intel_output->backlight_active_level =
+ 				backlight_open(&intel_output->backlight,
+ 					       strdup(str));
+@@ -689,9 +688,11 @@ intel_set_scanout_pixmap(xf86CrtcPtr crtc, PixmapPtr ppix)
+ 	}
+ 
+ 	bo = intel_get_pixmap_bo(ppix);
+-	if (intel->front_buffer) {
+-		ErrorF("have front buffer\n");
+-	}
++	if (!bo)
++		return FALSE;
++
++	if (intel->front_buffer)
++		return FALSE;
+ 
+ 	drm_intel_bo_disable_reuse(bo);
+ 
+@@ -867,6 +868,48 @@ intel_output_attach_edid(xf86OutputPtr output)
+ 	xf86OutputSetEDID(output, mon);
+ }
+ 
++static void
++intel_output_attach_tile(xf86OutputPtr output)
++{
++#if XF86_OUTPUT_VERSION >= 3
++	struct intel_output *intel_output = output->driver_private;
++	drmModeConnectorPtr koutput = intel_output->mode_output;
++	struct intel_mode *mode = intel_output->mode;
++	drmModePropertyBlobPtr blob = NULL;
++	struct xf86CrtcTileInfo tile_info, *set = NULL;
++	int i;
++
++	for (i = 0; koutput && i < koutput->count_props; i++) {
++		drmModePropertyPtr props;
++
++		props = drmModeGetProperty(mode->fd, koutput->props[i]);
++		if (!props)
++			continue;
++
++		if (!(props->flags & DRM_MODE_PROP_BLOB)) {
++			drmModeFreeProperty(props);
++			continue;
++		}
++
++		if (!strcmp(props->name, "TILE")) {
++			blob = drmModeGetPropertyBlob(mode->fd,
++						      koutput->prop_values[i]);
++		}
++		drmModeFreeProperty(props);
++	}
++
++	if (blob) {
++		if (xf86OutputParseKMSTile(blob->data,
++					   blob->length,
++					   &tile_info))
++			set = &tile_info;
++		drmModeFreePropertyBlob(blob);
++	}
++
++	xf86OutputSetTile(output, set);
++#endif
++}
++
+ static DisplayModePtr
+ intel_output_panel_edid(xf86OutputPtr output, DisplayModePtr modes)
+ {
+@@ -922,6 +965,7 @@ intel_output_get_modes(xf86OutputPtr output)
+ 	int i;
+ 
+ 	intel_output_attach_edid(output);
++	intel_output_attach_tile(output);
+ 
+ 	if (!koutput)
+ 		return Modes;
+@@ -1492,6 +1536,7 @@ intel_output_init(ScrnInfoPtr scrn, struct intel_mode *mode, drmModeResPtr mode_
+ 			intel_output = output->driver_private;
+ 			intel_output->output_id = mode_res->connectors[num];
+ 			intel_output->mode_output = koutput;
++			RROutputChanged(output->randr_output, TRUE);
+ 			return;
+ 		}
+ 	}
+@@ -1650,9 +1695,6 @@ intel_pageflip_abort(ScrnInfoPtr scrn, xf86CrtcPtr crtc, void *data);
+ static void
+ intel_pageflip_complete(struct intel_mode *mode);
+ 
+-static void
+-intel_drm_abort_seq (ScrnInfoPtr scrn, uint32_t seq);
+-
+ Bool
+ intel_do_pageflip(intel_screen_private *intel,
+ 		  dri_bo *new_front,
+@@ -1671,23 +1713,30 @@ intel_do_pageflip(intel_screen_private *intel,
+ 	uint32_t new_fb_id;
+ 	uint32_t flags;
+ 	uint32_t seq;
++	int err = 0;
+ 	int i;
+ 
+ 	/*
++	 * We only have a single length queue in the kernel, so any
++	 * attempts to schedule a second flip before processing the first
++	 * is a bug. Punt it back to the caller.
++	 */
++	if (mode->flip_count)
++		return FALSE;
++
++	/*
+ 	 * Create a new handle for the back buffer
+ 	 */
+ 	if (drmModeAddFB(mode->fd, scrn->virtualX, scrn->virtualY,
+ 			 scrn->depth, scrn->bitsPerPixel, pitch,
+-			 new_front->handle, &new_fb_id))
++			 new_front->handle, &new_fb_id)) {
++		err = errno;
+ 		goto error_out;
++	}
+ 
+ 	drm_intel_bo_disable_reuse(new_front);
+         intel_flush(intel);
+ 
+-	mode->pageflip_data = pageflip_data;
+-	mode->pageflip_handler = pageflip_handler;
+-	mode->pageflip_abort = pageflip_abort;
+-
+ 	/*
+ 	 * Queue flips on all enabled CRTCs
+ 	 * Note that if/when we get per-CRTC buffers, we'll have to update this.
+@@ -1699,6 +1748,7 @@ intel_do_pageflip(intel_screen_private *intel,
+ 	 */
+ 	mode->fe_msc = 0;
+ 	mode->fe_usec = 0;
++	memset(&mode->pageflip, 0, sizeof(mode->pageflip));
+ 
+ 	flags = DRM_MODE_PAGE_FLIP_EVENT;
+ 	if (async)
+@@ -1711,8 +1761,7 @@ intel_do_pageflip(intel_screen_private *intel,
+ 
+ 		flip = calloc(1, sizeof(struct intel_pageflip));
+ 		if (flip == NULL) {
+-			xf86DrvMsg(scrn->scrnIndex, X_WARNING,
+-				   "flip queue: carrier alloc failed.\n");
++			err = errno;
+ 			goto error_undo;
+ 		}
+ 
+@@ -1724,33 +1773,30 @@ intel_do_pageflip(intel_screen_private *intel,
+ 
+ 		seq = intel_drm_queue_alloc(scrn, config->crtc[i], flip, intel_pageflip_handler, intel_pageflip_abort);
+ 		if (!seq) {
++			err = errno;
+ 			free(flip);
+ 			goto error_undo;
+ 		}
+ 
+-again:
++		mode->flip_count++;
++
+ 		if (drmModePageFlip(mode->fd,
+ 				    crtc_id(crtc),
+ 				    new_fb_id,
+ 				    flags, (void *)(uintptr_t)seq)) {
+-			if (intel_mode_read_drm_events(intel)) {
+-				xf86DrvMsg(scrn->scrnIndex, X_WARNING,
+-					   "flip queue retry\n");
+-				goto again;
+-			}
+-			xf86DrvMsg(scrn->scrnIndex, X_WARNING,
+-				   "flip queue failed: %s\n", strerror(errno));
+-			if (seq)
+-				intel_drm_abort_seq(scrn, seq);
+-			free(flip);
++			err = errno;
++			intel_drm_abort_seq(scrn, seq);
+ 			goto error_undo;
+ 		}
+-		mode->flip_count++;
+ 	}
+ 
+ 	mode->old_fb_id = mode->fb_id;
+ 	mode->fb_id = new_fb_id;
+ 
++	mode->pageflip.data = pageflip_data;
++	mode->pageflip.handler = pageflip_handler;
++	mode->pageflip.abort = pageflip_abort;
++
+ 	if (!mode->flip_count)
+ 		intel_pageflip_complete(mode);
+ 
+@@ -1765,7 +1811,7 @@ error_undo:
+ 
+ error_out:
+ 	xf86DrvMsg(scrn->scrnIndex, X_WARNING, "Page flip failed: %s\n",
+-		   strerror(errno));
++		   strerror(err));
+ 
+ 	mode->flip_count = 0;
+ 	return FALSE;
+@@ -1839,7 +1885,7 @@ intel_drm_abort(ScrnInfoPtr scrn, Bool (*match)(void *data, void *match_data), v
+ /*
+  * Abort by drm queue sequence number
+  */
+-static void
++void
+ intel_drm_abort_seq(ScrnInfoPtr scrn, uint32_t seq)
+ {
+ 	struct intel_drm_queue *q;
+@@ -1911,7 +1957,6 @@ intel_sequence_to_crtc_msc(xf86CrtcPtr crtc, uint32_t sequence)
+ {
+ 	struct intel_crtc *intel_crtc = crtc->driver_private;
+ 
+-        sequence += intel_crtc->vblank_offset;
+         if ((int32_t) (sequence - intel_crtc->msc_prev) < -0x40000000)
+                 intel_crtc->msc_high += 0x100000000L;
+         intel_crtc->msc_prev = sequence;
+@@ -1935,37 +1980,10 @@ intel_get_crtc_msc_ust(ScrnInfoPtr scrn, xf86CrtcPtr crtc, uint64_t *msc, uint64
+         return 0;
+ }
+ 
+-/*
+- * Convert a 64-bit adjusted MSC value into a 32-bit kernel sequence number,
+- * removing the high 32 bits and subtracting out the vblank_offset term.
+- *
+- * This also updates the vblank_offset when it notices that the value should
+- * change.
+- */
+-
+-#define MAX_VBLANK_OFFSET       1000
+-
+ uint32_t
+ intel_crtc_msc_to_sequence(ScrnInfoPtr scrn, xf86CrtcPtr crtc, uint64_t expect)
+ {
+-	struct intel_crtc *intel_crtc = crtc->driver_private;
+-        uint64_t msc, ust;
+-
+-	if (intel_get_crtc_msc_ust(scrn, crtc, &msc, &ust) == 0) {
+-		int64_t diff = expect - msc;
+-
+-		/* We're way off here, assume that the kernel has lost its mind
+-		 * and smack the vblank back to something sensible
+-		 */
+-		if (diff < -MAX_VBLANK_OFFSET || diff > MAX_VBLANK_OFFSET) {
+-			intel_crtc->vblank_offset += (int32_t) diff;
+-			if (intel_crtc->vblank_offset > -MAX_VBLANK_OFFSET &&
+-			    intel_crtc->vblank_offset < MAX_VBLANK_OFFSET)
+-				intel_crtc->vblank_offset = 0;
+-		}
+-	}
+-
+-        return (uint32_t) (expect - intel_crtc->vblank_offset);
++        return (uint32_t)expect;
+ }
+ 
+ /*
+@@ -1998,14 +2016,13 @@ intel_drm_handler(int fd, uint32_t frame, uint32_t sec, uint32_t usec, void *use
+ static void
+ intel_pageflip_complete(struct intel_mode *mode)
+ {
+-	/* Release framebuffer */
+-	drmModeRmFB(mode->fd, mode->old_fb_id);
+-
+-	if (!mode->pageflip_handler)
++	if (!mode->pageflip.handler)
+ 		return;
+ 
+-	mode->pageflip_handler(mode->fe_msc, mode->fe_usec,
+-			       mode->pageflip_data);
++	/* Release framebuffer */
++	drmModeRmFB(mode->fd, mode->old_fb_id);
++	mode->pageflip.handler(mode->fe_msc, mode->fe_usec,
++			       mode->pageflip.data);
+ }
+ 
+ /*
+@@ -2045,6 +2062,7 @@ intel_pageflip_handler(ScrnInfoPtr scrn, xf86CrtcPtr crtc,
+ 
+ 	if (!mode)
+ 		return;
++
+ 	intel_pageflip_complete(mode);
+ }
+ 
+@@ -2060,18 +2078,18 @@ intel_pageflip_abort(ScrnInfoPtr scrn, xf86CrtcPtr crtc, void *data)
+ 	if (!mode)
+ 		return;
+ 
+-	/* Release framebuffer */
+-	drmModeRmFB(mode->fd, mode->old_fb_id);
+-
+-	if (!mode->pageflip_abort)
++	if (!mode->pageflip.abort)
+ 		return;
+ 
+-	mode->pageflip_abort(mode->pageflip_data);
++	/* Release framebuffer */
++	drmModeRmFB(mode->fd, mode->old_fb_id);
++	mode->pageflip.abort(mode->pageflip.data);
+ }
+ 
+ /*
+  * Check for pending DRM events and process them.
+  */
++#if !HAVE_NOTIFY_FD
+ static void
+ drm_wakeup_handler(pointer data, int err, pointer p)
+ {
+@@ -2086,6 +2104,14 @@ drm_wakeup_handler(pointer data, int err, pointer p)
+ 	if (FD_ISSET(mode->fd, read_mask))
+ 		drmHandleEvent(mode->fd, &mode->event_context);
+ }
++#else
++static void
++drm_notify_fd(int fd, int ready, void *data)
++{
++	struct intel_mode *mode = data;
++	drmHandleEvent(mode->fd, &mode->event_context);
++}
++#endif
+ 
+ /*
+  * If there are any available, read drm_events
+@@ -2231,10 +2257,6 @@ Bool intel_mode_pre_init(ScrnInfoPtr scrn, int fd, int cpp)
+ 		intel->use_pageflipping = TRUE;
+ 	}
+ 
+-	if (xf86ReturnOptValBool(intel->Options, OPTION_DELETE_DP12, FALSE)) {
+-		mode->delete_dp_12_displays = TRUE;
+-	}
+-
+ 	intel->modes = mode;
+ 	drmModeFreeResources(mode_res);
+ 	return TRUE;
+@@ -2250,9 +2272,11 @@ intel_mode_init(struct intel_screen_private *intel)
+ 	 * registration within ScreenInit and not PreInit.
+ 	 */
+ 	mode->flip_count = 0;
+-	AddGeneralSocket(mode->fd);
++	SetNotifyFd(mode->fd, drm_notify_fd, X_NOTIFY_READ, mode);
++#if !HAVE_NOTIFY_FD
+ 	RegisterBlockAndWakeupHandlers((BlockHandlerProcPtr)NoopDDA,
+ 				       drm_wakeup_handler, mode);
++#endif
+ }
+ 
+ void
+@@ -2276,9 +2300,11 @@ intel_mode_close(intel_screen_private *intel)
+ 
+         intel_drm_abort_scrn(intel->scrn);
+ 
++#if !HAVE_NOTIFY_FD
+ 	RemoveBlockAndWakeupHandlers((BlockHandlerProcPtr)NoopDDA,
+ 				     drm_wakeup_handler, mode);
+-	RemoveGeneralSocket(mode->fd);
++#endif
++	RemoveNotifyFd(mode->fd);
+ }
+ 
+ void
+@@ -2498,12 +2524,11 @@ intel_mode_hotplug(struct intel_screen_private *intel)
+ 	int i, j;
+ 	Bool found;
+ 	Bool changed = FALSE;
+-	struct intel_mode *mode = intel->modes;
++
+ 	mode_res = drmModeGetResources(intel->drmSubFD);
+ 	if (!mode_res)
+ 		goto out;
+ 
+-restart_destroy:
+ 	for (i = 0; i < config->num_output; i++) {
+ 		xf86OutputPtr output = config->output[i];
+ 		struct intel_output *intel_output;
+@@ -2522,13 +2547,9 @@ restart_destroy:
+ 		drmModeFreeConnector(intel_output->mode_output);
+ 		intel_output->mode_output = NULL;
+ 		intel_output->output_id = -1;
++		RROutputChanged(output->randr_output, TRUE);
+ 
+ 		changed = TRUE;
+-		if (mode->delete_dp_12_displays) {
+-			RROutputDestroy(output->randr_output);
+-			xf86OutputDestroy(output);
+-			goto restart_destroy;
+-		}
+ 	}
+ 
+ 	/* find new output ids we don't have outputs for */
+@@ -2552,10 +2573,8 @@ restart_destroy:
+ 		intel_output_init(scrn, intel->modes, mode_res, i, 1);
+ 	}
+ 
+-	if (changed) {
+-		RRSetChanged(xf86ScrnToScreen(scrn));
++	if (changed)
+ 		RRTellChanged(xf86ScrnToScreen(scrn));
+-	}
+ 
+ 	drmModeFreeResources(mode_res);
+ out:
+diff --git a/src/uxa/intel_dri.c b/src/uxa/intel_dri.c
+index f61c6210..524826d2 100644
+--- a/src/uxa/intel_dri.c
++++ b/src/uxa/intel_dri.c
+@@ -81,6 +81,47 @@ static DevPrivateKeyRec i830_client_key;
+ static int i830_client_key;
+ #endif
+ 
++static void I830DRI2FlipEventHandler(unsigned int frame,
++				     unsigned int tv_sec,
++				     unsigned int tv_usec,
++				     DRI2FrameEventPtr flip_info);
++
++static void I830DRI2FrameEventHandler(unsigned int frame,
++				      unsigned int tv_sec,
++				      unsigned int tv_usec,
++				      DRI2FrameEventPtr swap_info);
++
++static void
++i830_dri2_del_frame_event(DRI2FrameEventPtr info);
++
++static uint32_t pipe_select(int pipe)
++{
++	if (pipe > 1)
++		return pipe << DRM_VBLANK_HIGH_CRTC_SHIFT;
++	else if (pipe > 0)
++		return DRM_VBLANK_SECONDARY;
++	else
++		return 0;
++}
++
++static void
++intel_dri2_vblank_handler(ScrnInfoPtr scrn,
++                          xf86CrtcPtr crtc,
++                          uint64_t msc,
++                          uint64_t usec,
++                          void *data)
++{
++        I830DRI2FrameEventHandler((uint32_t) msc, usec / 1000000, usec % 1000000, data);
++}
++
++static void
++intel_dri2_vblank_abort(ScrnInfoPtr scrn,
++                        xf86CrtcPtr crtc,
++                        void *data)
++{
++        i830_dri2_del_frame_event(data);
++}
++
+ static uint32_t pixmap_flink(PixmapPtr pixmap)
+ {
+ 	struct intel_uxa_pixmap *priv = intel_uxa_get_pixmap_private(pixmap);
+@@ -135,9 +176,6 @@ I830DRI2CreateBuffers(DrawablePtr drawable, unsigned int *attachments,
+ 		pixmap = NULL;
+ 		if (attachments[i] == DRI2BufferFrontLeft) {
+ 			pixmap = get_front_buffer(drawable);
+-
+-			if (pixmap == NULL)
+-				drawable = &(get_drawable_pixmap(drawable)->drawable);
+ 		} else if (attachments[i] == DRI2BufferStencil && pDepthPixmap) {
+ 			pixmap = pDepthPixmap;
+ 			pixmap->refcnt++;
+@@ -246,11 +284,8 @@ I830DRI2CreateBuffer(DrawablePtr drawable, unsigned int attachment,
+ 	}
+ 
+ 	pixmap = NULL;
+-	if (attachment == DRI2BufferFrontLeft) {
++	if (attachment == DRI2BufferFrontLeft)
+ 		pixmap = get_front_buffer(drawable);
+-		if (pixmap == NULL)
+-			drawable = &(get_drawable_pixmap(drawable)->drawable);
+-	}
+ 
+ 	if (pixmap == NULL) {
+ 		unsigned int hint = INTEL_CREATE_PIXMAP_DRI2;
+@@ -673,6 +708,20 @@ i830_dri2_del_frame_event(DRI2FrameEventPtr info)
+ 	if (info->back)
+ 		I830DRI2DestroyBuffer(NULL, info->back);
+ 
++	if (info->old_buffer) {
++		/* Check that the old buffer still matches the front buffer
++		 * in case a mode change occurred before we woke up.
++		 */
++		if (info->intel->back_buffer == NULL &&
++		    info->old_width  == info->intel->scrn->virtualX &&
++		    info->old_height == info->intel->scrn->virtualY &&
++		    info->old_pitch  == info->intel->front_pitch &&
++		    info->old_tiling == info->intel->front_tiling)
++			info->intel->back_buffer = info->old_buffer;
++		else
++			dri_bo_unreference(info->old_buffer);
++	}
++
+ 	free(info);
+ }
+ 
+@@ -708,16 +757,14 @@ static void
+ I830DRI2ExchangeBuffers(struct intel_screen_private *intel, DRI2BufferPtr front, DRI2BufferPtr back)
+ {
+ 	I830DRI2BufferPrivatePtr front_priv, back_priv;
+-	int tmp;
+ 	struct intel_uxa_pixmap *new_front;
+ 
+ 	front_priv = front->driverPrivate;
+ 	back_priv = back->driverPrivate;
+ 
+ 	/* Swap BO names so DRI works */
+-	tmp = front->name;
+ 	front->name = back->name;
+-	back->name = tmp;
++	back->name = pixmap_flink(front_priv->pixmap);
+ 
+ 	/* Swap pixmap bos */
+ 	new_front = intel_exchange_pixmap_buffers(intel,
+@@ -753,87 +800,30 @@ I830DRI2FlipAbort(void *pageflip_data)
+         i830_dri2_del_frame_event(info);
+ }
+ 
+-/*
+- * Our internal swap routine takes care of actually exchanging, blitting, or
+- * flipping buffers as necessary.
+- */
+ static Bool
+-I830DRI2ScheduleFlip(struct intel_screen_private *intel,
+-		     DrawablePtr draw,
+-		     DRI2FrameEventPtr info)
++allocate_back_buffer(struct intel_screen_private *intel)
+ {
+-	I830DRI2BufferPrivatePtr priv = info->back->driverPrivate;
+-	drm_intel_bo *new_back, *old_back;
+-	int tmp_name;
+-
+-	if (!intel->use_triple_buffer) {
+-		info->type = DRI2_SWAP;
+-		if (!intel_do_pageflip(intel,
+-				       get_pixmap_bo(priv),
+-				       info->pipe, FALSE, info,
+-                                       I830DRI2FlipComplete,
+-                                       I830DRI2FlipAbort))
+-			return FALSE;
+-
+-		I830DRI2ExchangeBuffers(intel, info->front, info->back);
+-		return TRUE;
+-	}
++	drm_intel_bo *bo;
++	int pitch;
++	uint32_t tiling;
+ 
+-	if (intel->pending_flip[info->pipe]) {
+-		assert(intel->pending_flip[info->pipe]->chain == NULL);
+-		intel->pending_flip[info->pipe]->chain = info;
++	if (intel->back_buffer)
+ 		return TRUE;
+-	}
+ 
+-	if (intel->back_buffer == NULL) {
+-		new_back = drm_intel_bo_alloc(intel->bufmgr, "front buffer",
+-					      intel->front_buffer->size, 0);
+-		if (new_back == NULL)
+-			return FALSE;
+-
+-		if (intel->front_tiling != I915_TILING_NONE) {
+-			uint32_t tiling = intel->front_tiling;
+-			drm_intel_bo_set_tiling(new_back, &tiling, intel->front_pitch);
+-			if (tiling != intel->front_tiling) {
+-				drm_intel_bo_unreference(new_back);
+-				return FALSE;
+-			}
+-		}
+-
+-		drm_intel_bo_disable_reuse(new_back);
+-		dri_bo_flink(new_back, &intel->back_name);
+-	} else {
+-		new_back = intel->back_buffer;
+-		intel->back_buffer = NULL;
+-	}
++	bo = intel_allocate_framebuffer(intel->scrn,
++					intel->scrn->virtualX,
++					intel->scrn->virtualY,
++					intel->cpp,
++					&pitch, &tiling);
++	if (bo == NULL)
++		return FALSE;
+ 
+-	old_back = get_pixmap_bo(priv);
+-	if (!intel_do_pageflip(intel, old_back, info->pipe, FALSE, info, I830DRI2FlipComplete, I830DRI2FlipAbort)) {
+-		intel->back_buffer = new_back;
++	if (pitch != intel->front_pitch || tiling != intel->front_tiling) {
++		drm_intel_bo_unreference(bo);
+ 		return FALSE;
+ 	}
+-	info->type = DRI2_SWAP_CHAIN;
+-	intel->pending_flip[info->pipe] = info;
+-
+-	priv = info->front->driverPrivate;
+-
+-	/* Exchange the current front-buffer with the fresh bo */
+-
+-	intel->back_buffer = intel->front_buffer;
+-	drm_intel_bo_reference(intel->back_buffer);
+-	intel_set_pixmap_bo(priv->pixmap, new_back);
+-	drm_intel_bo_unreference(new_back);
+-
+-	tmp_name = info->front->name;
+-	info->front->name = intel->back_name;
+-	intel->back_name = tmp_name;
+ 
+-	/* Then flip DRI2 pointers and update the screen pixmap */
+-	I830DRI2ExchangeBuffers(intel, info->front, info->back);
+-	DRI2SwapComplete(info->client, draw, 0, 0, 0,
+-			 DRI2_EXCHANGE_COMPLETE,
+-			 info->event_complete,
+-			 info->event_data);
++	intel->back_buffer = bo;
+ 	return TRUE;
+ }
+ 
+@@ -889,8 +879,88 @@ can_exchange(DrawablePtr drawable, DRI2BufferPtr front, DRI2BufferPtr back)
+ 	return TRUE;
+ }
+ 
+-void I830DRI2FrameEventHandler(unsigned int frame, unsigned int tv_sec,
+-			       unsigned int tv_usec, DRI2FrameEventPtr swap_info)
++static Bool
++queue_flip(struct intel_screen_private *intel,
++	   DrawablePtr draw,
++	   DRI2FrameEventPtr info)
++{
++	xf86CrtcPtr crtc = I830DRI2DrawableCrtc(draw);
++	I830DRI2BufferPrivatePtr priv = info->back->driverPrivate;
++	drm_intel_bo *old_back = get_pixmap_bo(priv);
++
++	if (crtc == NULL)
++		return FALSE;
++
++	if (!can_exchange(draw, info->front, info->back))
++		return FALSE;
++
++	if (!intel_do_pageflip(intel, old_back,
++			       intel_crtc_to_pipe(crtc),
++			       FALSE, info,
++			       I830DRI2FlipComplete, I830DRI2FlipAbort))
++		return FALSE;
++
++#if DRI2INFOREC_VERSION >= 6
++	if (intel->use_triple_buffer && allocate_back_buffer(intel)) {
++		info->old_width  = intel->scrn->virtualX;
++		info->old_height = intel->scrn->virtualY;
++		info->old_pitch  = intel->front_pitch;
++		info->old_tiling = intel->front_tiling;
++		info->old_buffer = intel->front_buffer;
++		dri_bo_reference(info->old_buffer);
++
++		priv = info->front->driverPrivate;
++		intel_set_pixmap_bo(priv->pixmap, intel->back_buffer);
++
++		dri_bo_unreference(intel->back_buffer);
++		intel->back_buffer = NULL;
++
++		DRI2SwapLimit(draw, 2);
++	} else
++		DRI2SwapLimit(draw, 1);
++#endif
++
++	/* Then flip DRI2 pointers and update the screen pixmap */
++	I830DRI2ExchangeBuffers(intel, info->front, info->back);
++	return TRUE;
++}
++
++static Bool
++queue_swap(struct intel_screen_private *intel,
++	   DrawablePtr draw,
++	   DRI2FrameEventPtr info)
++{
++	xf86CrtcPtr crtc = I830DRI2DrawableCrtc(draw);
++	drmVBlank vbl;
++
++	if (crtc == NULL)
++		return FALSE;
++
++	vbl.request.type =
++		DRM_VBLANK_RELATIVE |
++		DRM_VBLANK_EVENT |
++		pipe_select(intel_crtc_to_pipe(crtc));
++	vbl.request.sequence = 1;
++	vbl.request.signal =
++		intel_drm_queue_alloc(intel->scrn, crtc, info,
++				      intel_dri2_vblank_handler,
++				      intel_dri2_vblank_abort);
++	if (vbl.request.signal == 0)
++		return FALSE;
++
++	info->type = DRI2_SWAP;
++	if (drmWaitVBlank(intel->drmSubFD, &vbl)) {
++		intel_drm_abort_seq(intel->scrn, vbl.request.signal);
++		return FALSE;
++	}
++
++	return TRUE;
++}
++
++static void I830DRI2FrameEventHandler(unsigned int frame,
++				      unsigned int tv_sec,
++				      unsigned int tv_usec,
++				      DRI2FrameEventPtr swap_info)
+ {
+ 	intel_screen_private *intel = swap_info->intel;
+ 	DrawablePtr drawable;
+@@ -906,24 +976,22 @@ void I830DRI2FrameEventHandler(unsigned int frame, unsigned int tv_sec,
+ 		return;
+ 	}
+ 
+-
+ 	switch (swap_info->type) {
+ 	case DRI2_FLIP:
+ 		/* If we can still flip... */
+-		if (can_exchange(drawable, swap_info->front, swap_info->back) &&
+-		    I830DRI2ScheduleFlip(intel, drawable, swap_info))
+-			return;
+-
+-		/* else fall through to exchange/blit */
+-	case DRI2_SWAP: {
+-		I830DRI2FallbackBlitSwap(drawable,
+-					 swap_info->front, swap_info->back);
+-		DRI2SwapComplete(swap_info->client, drawable, frame, tv_sec, tv_usec,
+-				 DRI2_BLIT_COMPLETE,
+-				 swap_info->client ? swap_info->event_complete : NULL,
+-				 swap_info->event_data);
+-		break;
+-	}
++		if (!queue_flip(intel, drawable, swap_info) &&
++		    !queue_swap(intel, drawable, swap_info)) {
++		case DRI2_SWAP:
++			I830DRI2FallbackBlitSwap(drawable,
++						 swap_info->front, swap_info->back);
++			DRI2SwapComplete(swap_info->client, drawable, frame, tv_sec, tv_usec,
++					 DRI2_BLIT_COMPLETE,
++					 swap_info->client ? swap_info->event_complete : NULL,
++					 swap_info->event_data);
++			break;
++		}
++		return;
++
+ 	case DRI2_WAITMSC:
+ 		if (swap_info->client)
+ 			DRI2WaitMSCComplete(swap_info->client, drawable,
+@@ -939,12 +1007,13 @@ void I830DRI2FrameEventHandler(unsigned int frame, unsigned int tv_sec,
+ 	i830_dri2_del_frame_event(swap_info);
+ }
+ 
+-void I830DRI2FlipEventHandler(unsigned int frame, unsigned int tv_sec,
+-			      unsigned int tv_usec, DRI2FrameEventPtr flip_info)
++static void I830DRI2FlipEventHandler(unsigned int frame,
++				     unsigned int tv_sec,
++				     unsigned int tv_usec,
++				     DRI2FrameEventPtr flip_info)
+ {
+ 	struct intel_screen_private *intel = flip_info->intel;
+ 	DrawablePtr drawable;
+-	DRI2FrameEventPtr chain;
+ 
+ 	drawable = NULL;
+ 	if (flip_info->drawable_id)
+@@ -954,6 +1023,7 @@ void I830DRI2FlipEventHandler(unsigned int frame, unsigned int tv_sec,
+ 
+ 	/* We assume our flips arrive in order, so we don't check the frame */
+ 	switch (flip_info->type) {
++	case DRI2_FLIP:
+ 	case DRI2_SWAP:
+ 		if (!drawable)
+ 			break;
+@@ -984,35 +1054,6 @@ void I830DRI2FlipEventHandler(unsigned int frame, unsigned int tv_sec,
+ 				 flip_info->event_data);
+ 		break;
+ 
+-	case DRI2_SWAP_CHAIN:
+-		assert(intel->pending_flip[flip_info->pipe] == flip_info);
+-		intel->pending_flip[flip_info->pipe] = NULL;
+-
+-		chain = flip_info->chain;
+-		if (chain) {
+-			DrawablePtr chain_drawable = NULL;
+-			if (chain->drawable_id)
+-				 dixLookupDrawable(&chain_drawable,
+-						   chain->drawable_id,
+-						   serverClient,
+-						   M_ANY, DixWriteAccess);
+-			if (chain_drawable == NULL) {
+-				i830_dri2_del_frame_event(chain);
+-			} else if (!can_exchange(chain_drawable, chain->front, chain->back) ||
+-				   !I830DRI2ScheduleFlip(intel, chain_drawable, chain)) {
+-				I830DRI2FallbackBlitSwap(chain_drawable,
+-							 chain->front,
+-							 chain->back);
+-
+-				DRI2SwapComplete(chain->client, chain_drawable, frame, tv_sec, tv_usec,
+-						 DRI2_BLIT_COMPLETE,
+-						 chain->client ? chain->event_complete : NULL,
+-						 chain->event_data);
+-				i830_dri2_del_frame_event(chain);
+-			}
+-		}
+-		break;
+-
+ 	default:
+ 		xf86DrvMsg(intel->scrn->scrnIndex, X_WARNING,
+ 			   "%s: unknown vblank event received\n", __func__);
+@@ -1023,38 +1064,6 @@ void I830DRI2FlipEventHandler(unsigned int frame, unsigned int tv_sec,
+ 	i830_dri2_del_frame_event(flip_info);
+ }
+ 
+-static uint32_t pipe_select(int pipe)
+-{
+-	if (pipe > 1)
+-		return pipe << DRM_VBLANK_HIGH_CRTC_SHIFT;
+-	else if (pipe > 0)
+-		return DRM_VBLANK_SECONDARY;
+-	else
+-		return 0;
+-}
+-
+-static void
+-intel_dri2_vblank_handler(ScrnInfoPtr scrn,
+-                          xf86CrtcPtr crtc,
+-                          uint64_t msc,
+-                          uint64_t usec,
+-                          void *data)
+-{
+-        DRI2FrameEventPtr swap_info = data;
+-
+-        I830DRI2FrameEventHandler((uint32_t) msc, usec / 1000000, usec % 1000000, swap_info);
+-}
+-
+-static void
+-intel_dri2_vblank_abort(ScrnInfoPtr scrn,
+-                        xf86CrtcPtr crtc,
+-                        void *data)
+-{
+-        DRI2FrameEventPtr swap_info = data;
+-
+-        i830_dri2_del_frame_event(swap_info);
+-}
+-
+ /*
+  * ScheduleSwap is responsible for requesting a DRM vblank event for the
+  * appropriate frame.
+@@ -1089,7 +1098,6 @@ I830DRI2ScheduleSwap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
+         int pipe = crtc ? intel_crtc_to_pipe(crtc) : -1;
+         int flip = 0;
+ 	DRI2FrameEventPtr swap_info = NULL;
+-	enum DRI2FrameEventType swap_type = DRI2_SWAP;
+ 	uint64_t current_msc, current_ust;
+         uint64_t request_msc;
+         uint32_t seq;
+@@ -1109,7 +1117,7 @@ I830DRI2ScheduleSwap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
+ 	swap_info->event_data = data;
+ 	swap_info->front = front;
+ 	swap_info->back = back;
+-	swap_info->pipe = pipe;
++	swap_info->type = DRI2_SWAP;
+ 
+ 	if (!i830_dri2_add_frame_event(swap_info)) {
+ 	    free(swap_info);
+@@ -1124,20 +1132,27 @@ I830DRI2ScheduleSwap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
+ 	if (ret)
+ 	    goto blit_fallback;
+ 
+-	/* Flips need to be submitted one frame before */
++	/*
++	 * If we can, schedule the flip directly from here rather
++	 * than waiting for an event from the kernel for the current
++	 * (or a past) MSC.
++	 */
++	if (divisor == 0 &&
++	    current_msc >= *target_msc &&
++	    queue_flip(intel, draw, swap_info))
++		return TRUE;
++
+ 	if (can_exchange(draw, front, back)) {
+-	    swap_type = DRI2_FLIP;
+-	    flip = 1;
++		swap_info->type = DRI2_FLIP;
++		/* Flips need to be submitted one frame before */
++		if (*target_msc > 0)
++			--*target_msc;
++		flip = 1;
+ 	}
+ 
+-	swap_info->type = swap_type;
+-
+-	/* Correct target_msc by 'flip' if swap_type == DRI2_FLIP.
+-	 * Do it early, so handling of different timing constraints
+-	 * for divisor, remainder and msc vs. target_msc works.
+-	 */
+-	if (*target_msc > 0)
+-		*target_msc -= flip;
++#if DRI2INFOREC_VERSION >= 6
++	DRI2SwapLimit(draw, 1);
++#endif
+ 
+ 	/*
+ 	 * If divisor is zero, or current_msc is smaller than target_msc
+@@ -1145,15 +1160,6 @@ I830DRI2ScheduleSwap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
+ 	 * the swap.
+ 	 */
+ 	if (divisor == 0 || current_msc < *target_msc) {
+-		/*
+-		 * If we can, schedule the flip directly from here rather
+-		 * than waiting for an event from the kernel for the current
+-		 * (or a past) MSC.
+-		 */
+-		if (flip && divisor == 0 && current_msc >= *target_msc &&
+-		    I830DRI2ScheduleFlip(intel, draw, swap_info))
+-			return TRUE;
+-
+ 		vbl.request.type =
+ 			DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT | pipe_select(pipe);
+ 
+@@ -1168,7 +1174,7 @@ I830DRI2ScheduleSwap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
+ 		 * current_msc to ensure we return a reasonable value back
+ 		 * to the caller. This makes swap_interval logic more robust.
+ 		 */
+-		if (current_msc >= *target_msc)
++		if (current_msc > *target_msc)
+ 			*target_msc = current_msc;
+ 
+                 seq = intel_drm_queue_alloc(scrn, crtc, swap_info, intel_dri2_vblank_handler, intel_dri2_vblank_abort);
+@@ -1183,6 +1189,8 @@ I830DRI2ScheduleSwap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
+ 			xf86DrvMsg(scrn->scrnIndex, X_WARNING,
+ 				   "divisor 0 get vblank counter failed: %s\n",
+ 				   strerror(errno));
++			intel_drm_abort_seq(intel->scrn, seq);
++			swap_info = NULL;
+ 			goto blit_fallback;
+ 		}
+ 
+@@ -1332,7 +1340,6 @@ I830DRI2ScheduleWaitMSC(ClientPtr client, DrawablePtr draw, CARD64 target_msc,
+ 
+ 	if (!i830_dri2_add_frame_event(wait_info)) {
+ 	    free(wait_info);
+-	    wait_info = NULL;
+ 	    goto out_complete;
+ 	}
+ 
+@@ -1374,7 +1381,8 @@ I830DRI2ScheduleWaitMSC(ClientPtr client, DrawablePtr draw, CARD64 target_msc,
+ 					   strerror(errno));
+ 				limit--;
+ 			}
+-			goto out_free;
++			intel_drm_abort_seq(intel->scrn, seq);
++			goto out_complete;
+ 		}
+ 
+ 		wait_info->frame = intel_sequence_to_crtc_msc(crtc, vbl.reply.sequence);
+@@ -1417,7 +1425,8 @@ I830DRI2ScheduleWaitMSC(ClientPtr client, DrawablePtr draw, CARD64 target_msc,
+ 				   strerror(errno));
+ 			limit--;
+ 		}
+-		goto out_free;
++		intel_drm_abort_seq(intel->scrn, seq);
++		goto out_complete;
+ 	}
+ 
+ 	wait_info->frame = intel_sequence_to_crtc_msc(crtc, vbl.reply.sequence);
+@@ -1440,13 +1449,92 @@ static int has_i830_dri(void)
+ 	return access(DRI_DRIVER_PATH "/i830_dri.so", R_OK) == 0;
+ }
+ 
+-static const char *dri_driver_name(intel_screen_private *intel)
++/* Compare two option strings ignoring case, '_', ' ' and '\t'. Returns 0 when
++ * equal; a NULL or empty string only equals another NULL or empty string. */
++static int namecmp(const char *s1, const char *s2)
++{
++	char c1, c2;
++
++	if (!s1 || *s1 == 0)
++		return (s2 && *s2) ? 1 : 0;
++	if (!s2 || *s2 == 0)
++		return -1;	/* s2 used to be dereferenced without this check */
++
++	while (*s1 == '_' || *s1 == ' ' || *s1 == '\t')
++		s1++;
++	while (*s2 == '_' || *s2 == ' ' || *s2 == '\t')
++		s2++;
++
++	/* <ctype.h> needs unsigned char values; a plain char may be negative. */
++	c1 = tolower((unsigned char)*s1);
++	c2 = tolower((unsigned char)*s2);
++	while (c1 == c2) {
++		if (c1 == '\0')
++			return 0;
++
++		s1++;
++		while (*s1 == '_' || *s1 == ' ' || *s1 == '\t')
++			s1++;
++
++		s2++;
++		while (*s2 == '_' || *s2 == ' ' || *s2 == '\t')
++			s2++;
++
++		c1 = tolower((unsigned char)*s1);
++		c2 = tolower((unsigned char)*s2);
++	}
++
++	/* Difference of the first pair of folded characters that disagree. */
++	return c1 - c2;
++}
++
++static Bool is_level(const char **str) /* TRUE when *str is a boolean/numeric level, not a name */
++{
++	const char *s = *str;
++	char *end;
++	unsigned val;
++
++	if (s == NULL || *s == '\0') /* unset or empty option counts as a level */
++		return TRUE;
++
++	if (namecmp(s, "on") == 0)
++		return TRUE;
++	if (namecmp(s, "true") == 0)
++		return TRUE;
++	if (namecmp(s, "yes") == 0)
++		return TRUE;
++
++	if (namecmp(s, "0") == 0) /* zero is matched by name; strtoul below only accepts non-zero */
++		return TRUE;
++	if (namecmp(s, "off") == 0)
++		return TRUE;
++	if (namecmp(s, "false") == 0)
++		return TRUE;
++	if (namecmp(s, "no") == 0)
++		return TRUE;
++
++	val = strtoul(s, &end, 0); /* base 0: accepts decimal, octal and hex */
++	if (val && *end == '\0') /* a bare non-zero number */
++		return TRUE;
++	if (val && *end == ':') /* "<number>:<rest>": advance *str to just past the colon */
++		*str = end + 1;
++	return FALSE;
++}
++
++static const char *options_get_dri(intel_screen_private *intel)
+ {
+ #if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,7,99,901,0)
+-	const char *s = xf86GetOptValString(intel->Options, OPTION_DRI);
+-	Bool dummy;
++	return xf86GetOptValString(intel->Options, OPTION_DRI);
++#else
++	return NULL;
++#endif
++}
+ 
+-	if (s == NULL || xf86getBoolValue(&dummy, s)) {
++static const char *dri_driver_name(intel_screen_private *intel)
++{
++	const char *s = options_get_dri(intel);
++
++	if (is_level(&s)) {
+ 		if (INTEL_INFO(intel)->gen < 030)
+ 			return has_i830_dri() ? "i830" : "i915";
+ 		else if (INTEL_INFO(intel)->gen < 040)
+@@ -1456,14 +1544,6 @@ static const char *dri_driver_name(intel_screen_private *intel)
+ 	}
+ 
+ 	return s;
+-#else
+-	if (INTEL_INFO(intel)->gen < 030)
+-		return has_i830_dri() ? "i830" : "i915";
+-	else if (INTEL_INFO(intel)->gen < 040)
+-		return "i915";
+-	else
+-		return "i965";
+-#endif
+ }
+ 
+ Bool I830DRI2ScreenInit(ScreenPtr screen)
+@@ -1544,7 +1624,7 @@ Bool I830DRI2ScreenInit(ScreenPtr screen)
+ 	info.numDrivers = 2;
+ 	info.driverNames = driverNames;
+ 	driverNames[0] = info.driverName;
+-	driverNames[1] = info.driverName;
++	driverNames[1] = "va_gl";
+ #endif
+ 
+ 	return DRI2ScreenInit(screen, &info);
+diff --git a/src/uxa/intel_driver.c b/src/uxa/intel_driver.c
+index 2793da5d..3703c412 100644
+--- a/src/uxa/intel_driver.c
++++ b/src/uxa/intel_driver.c
+@@ -237,24 +237,17 @@ static Bool I830GetEarlyOptions(ScrnInfoPtr scrn)
+ 	return TRUE;
+ }
+ 
+-static Bool intel_option_cast_string_to_bool(intel_screen_private *intel,
+-					     int id, Bool val)
+-{
+-#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,7,99,901,0)
+-	xf86getBoolValue(&val, xf86GetOptValString(intel->Options, id));
+-	return val;
+-#else
+-	return val;
+-#endif
+-}
+-
+ static void intel_check_dri_option(ScrnInfoPtr scrn)
+ {
+ 	intel_screen_private *intel = intel_get_screen_private(scrn);
++	unsigned level;
+ 
+ 	intel->dri2 = intel->dri3 = DRI_NONE;
+-	if (!intel_option_cast_string_to_bool(intel, OPTION_DRI, TRUE))
+-		intel->dri2 = intel->dri3 = DRI_DISABLED;
++	level = intel_option_cast_to_unsigned(intel->Options, OPTION_DRI, DEFAULT_DRI_LEVEL);
++	if (level < 3 || INTEL_INFO(intel)->gen < 040)
++		intel->dri3 = DRI_DISABLED;
++	if (level < 2)
++		intel->dri2 = DRI_DISABLED;
+ 
+ 	if (scrn->depth != 16 && scrn->depth != 24 && scrn->depth != 30) {
+ 		xf86DrvMsg(scrn->scrnIndex, X_CONFIG,
+@@ -371,8 +364,8 @@ static Bool can_accelerate_blt(struct intel_screen_private *intel)
+ 	if (INTEL_INFO(intel)->gen == -1)
+ 		return FALSE;
+ 
+-	if (xf86ReturnOptValBool(intel->Options, OPTION_ACCEL_DISABLE, FALSE) ||
+-	    !intel_option_cast_string_to_bool(intel, OPTION_ACCEL_METHOD, TRUE)) {
++	if (!xf86ReturnOptValBool(intel->Options, OPTION_ACCEL_ENABLE, TRUE) ||
++	    !intel_option_cast_to_bool(intel->Options, OPTION_ACCEL_METHOD, TRUE)) {
+ 		xf86DrvMsg(intel->scrn->scrnIndex, X_CONFIG,
+ 			   "Disabling hardware acceleration.\n");
+ 		return FALSE;
+@@ -659,8 +652,9 @@ redisplay_dirty(ScreenPtr screen, PixmapDirtyUpdatePtr dirty)
+ }
+ 
+ static void
+-intel_dirty_update(ScreenPtr screen)
++intel_dirty_update(intel_screen_private *intel)
+ {
++	ScreenPtr screen = xf86ScrnToScreen(intel->scrn);
+ 	RegionPtr region;
+ 	PixmapDirtyUpdatePtr ent;
+ 
+@@ -677,6 +671,7 @@ intel_dirty_update(ScreenPtr screen)
+ }
+ #endif
+ 
++#if !HAVE_NOTIFY_FD
+ static void
+ I830BlockHandler(BLOCKHANDLER_ARGS_DECL)
+ {
+@@ -694,9 +689,22 @@ I830BlockHandler(BLOCKHANDLER_ARGS_DECL)
+ 	intel_uxa_block_handler(intel);
+ 	intel_video_block_handler(intel);
+ #ifdef INTEL_PIXMAP_SHARING
+-	intel_dirty_update(screen);
++	intel_dirty_update(intel);
+ #endif
+ }
++#else
++static void
++I830BlockHandler(void *data, void *timeout)
++{
++	intel_screen_private *intel = data;
++
++	intel_uxa_block_handler(intel);
++	intel_video_block_handler(intel);
++#ifdef INTEL_PIXMAP_SHARING
++	intel_dirty_update(intel);
++#endif
++}
++#endif
+ 
+ static Bool
+ intel_init_initial_framebuffer(ScrnInfoPtr scrn)
+@@ -735,6 +743,8 @@ intel_flush_callback(CallbackListPtr *list,
+ }
+ 
+ #if HAVE_UDEV
++#include <sys/stat.h>
++
+ static void
+ I830HandleUEvents(int fd, void *closure)
+ {
+@@ -771,6 +781,15 @@ I830HandleUEvents(int fd, void *closure)
+ 	udev_device_unref(dev);
+ }
+ 
++static int has_randr(void) /* non-zero when the rrPrivKey private key is set up; used to skip uevent setup without RR */
++{
++#if HAS_DIXREGISTERPRIVATEKEY
++	return dixPrivateKeyRegistered(rrPrivKey);
++#else
++	return *rrPrivKey; /* NOTE(review): presumably non-zero once RandR has initialised on older servers - confirm */
++#endif
++}
++
+ static void
+ I830UeventInit(ScrnInfoPtr scrn)
+ {
+@@ -780,6 +799,10 @@ I830UeventInit(ScrnInfoPtr scrn)
+ 	Bool hotplug;
+ 	MessageType from = X_CONFIG;
+ 
++	/* Without RR, nothing we can do here */
++	if (!has_randr())
++		return;
++
+ 	if (!xf86GetOptValBool(intel->Options, OPTION_HOTPLUG, &hotplug)) {
+ 		from = X_DEFAULT;
+ 		hotplug = TRUE;
+@@ -939,8 +962,14 @@ I830ScreenInit(SCREEN_INIT_ARGS_DECL)
+ 			   "Hardware cursor initialization failed\n");
+ 	}
+ 
++#if !HAVE_NOTIFY_FD
+ 	intel->BlockHandler = screen->BlockHandler;
+ 	screen->BlockHandler = I830BlockHandler;
++#else
++	RegisterBlockAndWakeupHandlers(I830BlockHandler,
++				       (ServerWakeupHandlerProcPtr)NoopDDA,
++				       intel);
++#endif
+ 
+ #ifdef INTEL_PIXMAP_SHARING
+ 	screen->StartPixmapTracking = PixmapStartDirtyTracking;
+@@ -1164,8 +1193,6 @@ static Bool I830CloseScreen(CLOSE_SCREEN_ARGS_DECL)
+ 
+ 	intel_sync_close(screen);
+ 
+-	xf86GARTCloseScreen(scrn->scrnIndex);
+-
+ 	scrn->vtSema = FALSE;
+ 	return TRUE;
+ }
+diff --git a/src/uxa/intel_hwmc.c b/src/uxa/intel_hwmc.c
+index 829cb8e0..78540600 100644
+--- a/src/uxa/intel_hwmc.c
++++ b/src/uxa/intel_hwmc.c
+@@ -193,7 +193,7 @@ Bool intel_xvmc_adaptor_init(ScreenPtr pScreen)
+ 	intel_screen_private *intel = intel_get_screen_private(scrn);
+ 	struct pci_device *pci;
+ 	static XF86MCAdaptorRec *pAdapt;
+-	char *name;
++	const char *name;
+ 	char buf[64];
+ 
+ 	if (!intel->XvMCEnabled)
+diff --git a/src/uxa/intel_memory.c b/src/uxa/intel_memory.c
+index 0c6cf30c..b2d7a367 100644
+--- a/src/uxa/intel_memory.c
++++ b/src/uxa/intel_memory.c
+@@ -42,7 +42,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  * This is the video memory allocator.  Our memory allocation is different from
+  * other graphics chips, where you have a fixed amount of graphics memory
+  * available that you want to put to the best use.  Instead, we have almost no
+- * memory pre-allocated, and we have to choose an appropriate amount of sytem
++ * memory pre-allocated, and we have to choose an appropriate amount of system
+  * memory to use.
+  *
+  * The allocations we might do:
+diff --git a/src/uxa/intel_present.c b/src/uxa/intel_present.c
+index d20043f3..ac028edd 100644
+--- a/src/uxa/intel_present.c
++++ b/src/uxa/intel_present.c
+@@ -244,6 +244,7 @@ intel_present_check_flip(RRCrtcPtr              crtc,
+ 	ScrnInfoPtr             scrn = xf86ScreenToScrn(screen);
+ 	intel_screen_private    *intel = intel_get_screen_private(scrn);
+         dri_bo                  *bo;
++	uint32_t		tiling, swizzle;
+ 
+ 	if (!scrn->vtSema)
+ 		return FALSE;
+@@ -266,6 +267,12 @@ intel_present_check_flip(RRCrtcPtr              crtc,
+         if (!bo)
+                 return FALSE;
+ 
++	if (drm_intel_bo_get_tiling(bo, &tiling, &swizzle))
++		return FALSE;
++
++	if (tiling == I915_TILING_Y)
++		return FALSE;
++
+ 	return TRUE;
+ }
+ 
+@@ -343,29 +350,33 @@ intel_present_unflip(ScreenPtr screen, uint64_t event_id)
+ {
+ 	ScrnInfoPtr                             scrn = xf86ScreenToScrn(screen);
+ 	intel_screen_private                    *intel = intel_get_screen_private(scrn);
+-	struct intel_present_vblank_event       *event;
+ 	PixmapPtr                               pixmap = screen->GetScreenPixmap(screen);
++	struct intel_present_vblank_event       *event = NULL;
+ 	dri_bo                                  *bo;
+-	Bool                                    ret;
+ 
+ 	if (!intel_present_check_flip(NULL, screen->root, pixmap, true))
+-		return;
++		goto fail;
+ 
+ 	bo = intel_get_pixmap_bo(pixmap);
+ 	if (!bo)
+-		return;
++		goto fail;
+ 
+ 	event = calloc(1, sizeof(struct intel_present_vblank_event));
+ 	if (!event)
+-		return;
++		goto fail;
+ 
+ 	event->event_id = event_id;
+ 
+-	ret = intel_do_pageflip(intel, bo, -1, FALSE, event, intel_present_flip_event, intel_present_flip_abort);
+-	if (!ret) {
+-		xf86DrvMsg(scrn->scrnIndex, X_ERROR,
+-			   "present unflip failed\n");
+-	}
++	if (!intel_do_pageflip(intel, bo, -1, FALSE, event,
++			       intel_present_flip_event,
++			       intel_present_flip_abort))
++		goto fail;
++
++	return;
++fail:
++	xf86SetDesiredModes(scrn);
++	present_event_notify(event_id, 0, 0);
++	free(event);
+ }
+ 
+ static present_screen_info_rec intel_present_screen_info = {
+diff --git a/src/uxa/intel_uxa.c b/src/uxa/intel_uxa.c
+index 590ff5d1..ec32a723 100644
+--- a/src/uxa/intel_uxa.c
++++ b/src/uxa/intel_uxa.c
+@@ -176,6 +176,24 @@ intel_uxa_check_solid(DrawablePtr drawable, int alu, Pixel planemask)
+ 	return TRUE;
+ }
+ 
++static Bool
++intel_uxa_check_bo_tiling(intel_screen_private *intel,
++			  PixmapPtr pixmap,
++			  unsigned *tiling_out) /* written only when TRUE is returned */
++{
++	struct intel_uxa_pixmap *priv;
++
++	priv = intel_uxa_get_pixmap_private(pixmap);
++	if (!priv) /* no UXA private attached to this pixmap: reject */
++		return FALSE;
++
++	if (priv->tiling == I915_TILING_Y && INTEL_INFO(intel)->gen < 060) /* gen is compared in octal; Y tiling is refused below 060 */
++		return FALSE;
++
++	*tiling_out = priv->tiling; /* report the pixmap's tiling mode to the caller */
++	return TRUE;
++}
++
+ /**
+  * Sets up hardware state for a series of solid fills.
+  */
+@@ -189,6 +207,9 @@ intel_uxa_prepare_solid(PixmapPtr pixmap, int alu, Pixel planemask, Pixel fg)
+ 		intel_uxa_get_pixmap_bo(pixmap),
+ 	};
+ 
++	if (!intel_uxa_check_bo_tiling(intel, pixmap, &intel->BR_tiling[0]))
++		return FALSE;
++
+ 	if (!intel_uxa_check_pitch_2d(pixmap))
+ 		return FALSE;
+ 
+@@ -236,7 +257,7 @@ static void intel_uxa_solid(PixmapPtr pixmap, int x1, int y1, int x2, int y2)
+ 
+ 	{
+ 		int len = INTEL_INFO(intel)->gen >= 0100 ? 7 : 6;
+-		BEGIN_BATCH_BLT(len);
++		BEGIN_BATCH_BLT_TILED(len);
+ 
+ 		cmd = XY_COLOR_BLT_CMD | (len - 2);
+ 
+@@ -310,6 +331,10 @@ intel_uxa_prepare_copy(PixmapPtr source, PixmapPtr dest, int xdir,
+ 		intel_uxa_get_pixmap_bo(dest),
+ 	};
+ 
++	if (!intel_uxa_check_bo_tiling(intel, dest, &intel->BR_tiling[0]) ||
++	    !intel_uxa_check_bo_tiling(intel, source, &intel->BR_tiling[1]))
++		return FALSE;
++
+ 	if (!intel_uxa_get_aperture_space(scrn, bo_table, ARRAY_SIZE(bo_table)))
+ 		return FALSE;
+ 
+@@ -375,7 +400,7 @@ intel_uxa_copy(PixmapPtr dest, int src_x1, int src_y1, int dst_x1,
+ 
+ 	{
+ 		int len = INTEL_INFO(intel)->gen >= 0100 ? 10 : 8;
+-		BEGIN_BATCH_BLT(len);
++		BEGIN_BATCH_BLT_TILED(len);
+ 
+ 		cmd = XY_SRC_COPY_BLT_CMD | (len - 2);
+ 
+@@ -1068,7 +1093,7 @@ Bool intel_uxa_create_screen_resources(ScreenPtr screen)
+ 	ScrnInfoPtr scrn = xf86ScreenToScrn(screen);
+ 	PixmapPtr pixmap;
+ 	intel_screen_private *intel = intel_get_screen_private(scrn);
+-	dri_bo *bo = intel->front_buffer;
++	dri_bo *bo = intel->front_buffer, *old_bo;
+ 	int old_width, old_height, old_pitch;
+ 
+ 	if (!uxa_resources_init(screen))
+@@ -1081,6 +1106,7 @@ Bool intel_uxa_create_screen_resources(ScreenPtr screen)
+ 	old_width = pixmap->drawable.width;
+ 	old_height = pixmap->drawable.height;
+ 	old_pitch = pixmap->devKind;
++	old_bo = intel_uxa_get_pixmap_bo(pixmap);
+ 
+ 	if (!screen->ModifyPixmapHeader(pixmap,
+ 					scrn->virtualX,
+@@ -1102,6 +1128,9 @@ Bool intel_uxa_create_screen_resources(ScreenPtr screen)
+ err:
+ 	screen->ModifyPixmapHeader(pixmap,
+ 				   old_width, old_height, -1, -1, old_pitch, NULL);
++	if (old_bo)
++		intel_uxa_set_pixmap_bo(pixmap, old_bo);
++
+ 	return FALSE;
+ }
+ 
+diff --git a/test/Makefile.am b/test/Makefile.am
+index 66ed8ebb..12b5d5d8 100644
+--- a/test/Makefile.am
++++ b/test/Makefile.am
+@@ -5,6 +5,7 @@ stress_TESTS = \
+ 	basic-rectangle \
+ 	basic-string \
+ 	basic-copyarea \
++	basic-copyplane \
+ 	basic-copyarea-size \
+ 	basic-putimage \
+ 	basic-lines \
+@@ -12,8 +13,10 @@ stress_TESTS = \
+ 	DrawSegments \
+ 	cursor-test \
+ 	render-fill \
++	render-glyphs \
+ 	render-trapezoid \
+ 	render-trapezoid-image \
++	render-triangle \
+ 	render-fill-copy \
+ 	render-composite-solid \
+ 	render-composite-solid-mask \
+@@ -25,9 +28,16 @@ stress_TESTS = \
+ 	shm-test \
+ 	$(NULL)
+ 
++if X11_VM
++stress_TESTS += \
++	xvidmode \
++	$(NULL)
++endif
++
+ if DRI2
+ stress_TESTS += \
+ 	dri2-race \
++	dri2-speed \
+ 	dri2-swap \
+ 	dri2-test \
+ 	$(NULL)
+@@ -36,8 +46,11 @@ endif
+ if X11_DRI3
+ stress_TESTS += \
+ 	dri3-test \
++	present-race \
++	present-speed \
+ 	present-test \
+ 	$(NULL)
++present_speed_CFLAGS = ${AM_CFLAGS} -pthread
+ endif
+ check_PROGRAMS = $(stress_TESTS)
+ 
+diff --git a/test/basic-copyplane.c b/test/basic-copyplane.c
+new file mode 100644
+index 00000000..f049b82b
+--- /dev/null
++++ b/test/basic-copyplane.c
+@@ -0,0 +1,99 @@
++#include <stdint.h>
++#include <stdio.h>
++#include <stdlib.h>
++
++#include <X11/Xutil.h> /* for XDestroyImage */
++#include <pixman.h> /* for pixman blt functions */
++
++#include "test.h"
++
++static uint8_t clock_bits[] = {0x3C, 0x5E, 0xEF, 0xF7, 0x87, 0xFF, 0x7E, 0x3C};
++
++/* https://bugs.freedesktop.org/show_bug.cgi?id=91499 */
++static void draw_clock(struct test_display *t, Drawable d,
++		       uint8_t alu, int x, int y, uint32_t fg, uint32_t bg)
++{
++	Pixmap pixmap;	/* 8x8 bitmap built from clock_bits */
++	XGCValues val;
++	GC gc;
++
++	val.graphics_exposures = 0;
++	val.function = alu;
++	val.foreground = fg;	/* colour for the set bits of the plane */
++	val.background = bg;	/* colour for the clear bits; was fg, which left bg unused */
++
++	gc = XCreateGC(t->dpy, d,
++		       GCGraphicsExposures | GCForeground | GCBackground | GCFunction,
++		       &val);
++	pixmap = XCreateBitmapFromData(t->dpy, d, (char *)clock_bits, 8, 8);
++
++	/* Expand plane 1 of the bitmap onto d at (x, y) through the GC above. */
++	XCopyPlane(t->dpy, pixmap, d, gc, 0, 0, 8, 8, x, y, 1);
++
++	XFreePixmap(t->dpy, pixmap);
++	XFreeGC(t->dpy, gc);
++}
++
++static void clear(struct test_display *dpy, struct test_target *tt) /* PictOpClear over the whole target */
++{
++	XRenderColor render_color = {0}; /* all channels zero */
++	XRenderFillRectangle(dpy->dpy, PictOpClear, tt->picture, &render_color,
++			     0, 0, tt->width, tt->height); /* covers (0,0) to (width,height) of tt */
++}
++
++static void clock_tests(struct test *t, int reps, int sets, enum target target)
++{
++	struct test_target out, ref;
++	int r, s;
++
++	printf("Testing clock (%s): ", test_target_name(target));
++	fflush(stdout);
++
++	test_target_create_render(&t->out, target, &out);
++	clear(&t->out, &out);
++
++	test_target_create_render(&t->ref, target, &ref);
++	clear(&t->ref, &ref);
++
++	for (s = 0; s < sets; s++) {
++		for (r = 0; r < reps; r++) {
++			int x = rand() % (out.width - 8);
++			int y = rand() % (out.height - 8);
++			uint8_t alu = rand() % (GXset + 1);
++			uint32_t bg = rand();
++			uint32_t fg = rand();
++
++			draw_clock(&t->out, out.draw, alu, x, y, fg, bg);
++			draw_clock(&t->ref, ref.draw, alu, x, y, fg, bg);
++		}
++
++		test_compare(t,
++			     out.draw, out.format,
++			     ref.draw, ref.format,
++			     0, 0, out.width, out.height,
++			     "");
++	}
++
++	printf("passed [%d iterations x %d]\n", reps, sets);
++
++	test_target_destroy_render(&t->out, &out);
++	test_target_destroy_render(&t->ref, &ref);
++}
++
++int main(int argc, char **argv)
++{
++	struct test test;
++	int i;
++
++	test_init(&test, argc, argv);
++
++	for (i = 0; i <= DEFAULT_ITERATIONS; i++) {
++		int reps = REPS(i), sets = SETS(i);
++		enum target t;
++
++		for (t = TARGET_FIRST; t <= TARGET_LAST; t++) {
++			clock_tests(&test, reps, sets, t);
++		}
++	}
++
++	return 0;
++}
+diff --git a/test/dri2-race.c b/test/dri2-race.c
+index 8862c84c..ece624f6 100644
+--- a/test/dri2-race.c
++++ b/test/dri2-race.c
+@@ -5,6 +5,11 @@
+ #include <X11/Xlib.h>
+ #include <X11/Xutil.h>
+ #include <X11/extensions/Xfixes.h>
++#include <X11/extensions/Xcomposite.h>
++#include <X11/Xlib-xcb.h>
++#include <xcb/xcb.h>
++#include <xcb/xcbext.h>
++#include <xcb/dri2.h>
+ #include <unistd.h>
+ #include <fcntl.h>
+ #include <string.h>
+@@ -12,11 +17,49 @@
+ 
+ #include <xf86drm.h>
+ #include <drm.h>
++#include <setjmp.h>
+ 
+ #include "dri2.h"
+ 
+ #define COUNT 60
+ 
++#define N_DIVISORS 3
++static const int divisors[N_DIVISORS] = { 0, 1, 16 };
++
++static jmp_buf error_handler[4];
++static int have_error_handler;
++
++#define error_get() \
++	setjmp(error_handler[have_error_handler++])
++
++#define error_put() \
++	have_error_handler--
++
++static int (*saved_io_error)(Display *dpy);
++
++static int io_error(Display *dpy) /* NOTE(review): presumably installed as the Xlib I/O error handler - confirm at the call site */
++{
++	if (have_error_handler) /* a jmp_buf saved by error_get() is still pending */
++		longjmp(error_handler[--have_error_handler], 0); /* pops it; a longjmp value of 0 makes setjmp return 1 */
++
++	return saved_io_error(dpy); /* nothing pending: defer to the saved handler */
++}
++
++static int x_error(Display *dpy, XErrorEvent *e)
++{
++	return Success;
++}
++
++static uint32_t upper_32_bits(uint64_t val)
++{
++	return val >> 32;
++}
++
++static uint32_t lower_32_bits(uint64_t val)
++{
++	return val & 0xffffffff;
++}
++
+ static int dri2_open(Display *dpy)
+ {
+ 	drm_auth_t auth;
+@@ -41,45 +84,701 @@ static int dri2_open(Display *dpy)
+ 	return fd;
+ }
+ 
+-static void run(Display *dpy, int width, int height,
+-		unsigned int *attachments, int nattachments,
+-		const char *name)
++static void swap_buffers(Display *dpy, Window win, int divisor,
++			 unsigned int *attachments, int nattachments)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	unsigned int seq[2]; /* sequence numbers of the two requests below */
++
++	seq[0] = xcb_dri2_swap_buffers_unchecked(c, win,
++						 0, 0, 0, divisor, 0, 0).sequence;
++
++	/* Queue a GetBuffers for the same window right behind the swap. */
++	seq[1] = xcb_dri2_get_buffers_unchecked(c, win,
++						nattachments, nattachments,
++						attachments).sequence;
++
++	xcb_flush(c);
++	xcb_discard_reply(c, seq[0]); /* neither reply is read: both are discarded */
++	xcb_discard_reply(c, seq[1]);
++}
++
++#define COMPOSITE 1
++
++static int has_composite(Display *dpy) /* non-zero when Composite reports major > 0 or minor >= 4 */
++{
++	Display *dummy = NULL;	/* connection opened here, if any, and closed before returning */
++	int event, error;
++	int major = -1, minor = -1;
++
++	if (dpy == NULL)
++		dummy = dpy = XOpenDisplay(NULL);
++
++	if (dpy && XCompositeQueryExtension(dpy, &event, &error)) /* dpy stays NULL if XOpenDisplay failed */
++		XCompositeQueryVersion(dpy, &major, &minor);
++
++	if (dummy)
++		XCloseDisplay(dummy);
++
++	return major > 0 || minor >= 4; /* the -1 defaults yield 0 when the query never ran */
++}
++
++static void race_window(Display *dpy, int width, int height,
++			unsigned int *attachments, int nattachments,
++			unsigned flags, const char *name)
+ {
+ 	Window win;
+ 	XSetWindowAttributes attr;
+-	int count, loop;
++	int count, loop, n;
+ 	DRI2Buffer *buffers;
+ 
++	if (flags & COMPOSITE && !has_composite(dpy))
++		return;
++
++	printf("%s(%s)\n", __func__, name);
++
+ 	/* Be nasty and install a fullscreen window on top so that we
+ 	 * can guarantee we do not get clipped by children.
+ 	 */
+ 	attr.override_redirect = 1;
+-	loop = 100;
+-	do {
++	for (n = 0; n < N_DIVISORS; n++) {
++		loop = 256 >> ffs(divisors[n]);
++		printf("DRI2SwapBuffers(divisor=%d), loop=%d", divisors[n], loop);
++		fflush(stdout);
++		do {
++			win = XCreateWindow(dpy, DefaultRootWindow(dpy),
++					0, 0, width, height, 0,
++					DefaultDepth(dpy, DefaultScreen(dpy)),
++					InputOutput,
++					DefaultVisual(dpy, DefaultScreen(dpy)),
++					CWOverrideRedirect, &attr);
++			if (flags & COMPOSITE)
++				XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++			XMapWindow(dpy, win);
++
++			DRI2CreateDrawable(dpy, win);
++
++			buffers = DRI2GetBuffers(dpy, win, &width, &height,
++					attachments, nattachments, &count);
++			if (count != nattachments)
++				return;
++
++			free(buffers);
++			for (count = 0; count < loop; count++)
++				DRI2SwapBuffers(dpy, win, 0, divisors[n], count & (divisors[n]-1));
++			XDestroyWindow(dpy, win);
++			printf("."); fflush(stdout);
++		} while (--loop);
++		printf("*\n");
++	}
++
++	for (n = 0; n < N_DIVISORS; n++) {
++		loop = 256 >> ffs(divisors[n]);
++		printf("xcb_dri2_swap_buffers(divisor=%d), loops=%d", divisors[n], loop);
++		fflush(stdout);
++		do {
++			win = XCreateWindow(dpy, DefaultRootWindow(dpy),
++					0, 0, width, height, 0,
++					DefaultDepth(dpy, DefaultScreen(dpy)),
++					InputOutput,
++					DefaultVisual(dpy, DefaultScreen(dpy)),
++					CWOverrideRedirect, &attr);
++			if (flags & COMPOSITE)
++				XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++			XMapWindow(dpy, win);
++
++			DRI2CreateDrawable(dpy, win);
++
++			buffers = DRI2GetBuffers(dpy, win, &width, &height,
++					attachments, nattachments, &count);
++			if (count != nattachments)
++				return;
++
++			free(buffers);
++			for (count = 0; count < loop; count++)
++				swap_buffers(dpy, win, divisors[n], attachments, nattachments);
++			XDestroyWindow(dpy, win);
++			printf("."); fflush(stdout);
++		} while (--loop);
++		printf("*\n");
++	}
++
++	for (n = 0; n < N_DIVISORS; n++) {
++		loop = 256 >> ffs(divisors[n]);
++		printf("DRI2WaitMsc(divisor=%d), loop=%d", divisors[n], loop);
++		fflush(stdout);
++		do {
++			uint64_t ignore, msc;
++			xcb_connection_t *c = XGetXCBConnection(dpy);
++
++			win = XCreateWindow(dpy, DefaultRootWindow(dpy),
++					0, 0, width, height, 0,
++					DefaultDepth(dpy, DefaultScreen(dpy)),
++					InputOutput,
++					DefaultVisual(dpy, DefaultScreen(dpy)),
++					CWOverrideRedirect, &attr);
++			if (flags & COMPOSITE)
++				XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++			XMapWindow(dpy, win);
++
++			DRI2CreateDrawable(dpy, win);
++			DRI2GetMSC(dpy, win, &ignore, &msc, &ignore);
++			msc++;
++			for (count = 0; count < loop; count++) {
++				xcb_discard_reply(c,
++						xcb_dri2_wait_msc(c, win,
++							upper_32_bits(msc),
++							lower_32_bits(msc),
++							0, 0, 0, 0).sequence);
++				msc += divisors[n];
++			}
++			XFlush(dpy);
++			XDestroyWindow(dpy, win);
++			printf("."); fflush(stdout);
++		} while (--loop);
++		printf("*\n");
++	}
++
++	XSync(dpy, 1);
++	sleep(2);
++	XSync(dpy, 1);
++}
++
++static int rand_size(int max) /* random size in [1, max - 1]; 1 when max leaves no such range */
++{
++	return max > 1 ? 1 + (rand() % (max - 1)) : 1; /* max <= 1 used to give a zero or negative modulus */
++}
++
++static void race_resize(Display *dpy, int width, int height,
++			unsigned int *attachments, int nattachments,
++			unsigned flags, const char *name)
++{	/* Three passes over divisors[]: issue a burst of requests on a window, then resize it randomly. */
++	Window win;
++	XSetWindowAttributes attr;
++	int count, loop, n;
++	DRI2Buffer *buffers;
++
++	if (flags & COMPOSITE && !has_composite(dpy))
++		return;
++
++	printf("%s(%s)\n", __func__, name);
++
++	attr.override_redirect = 1;
++	for (n = 0; n < N_DIVISORS; n++) {
++		win = XCreateWindow(dpy, DefaultRootWindow(dpy),
++				    0, 0, width, height, 0,
++				    DefaultDepth(dpy, DefaultScreen(dpy)),
++				    InputOutput,
++				    DefaultVisual(dpy, DefaultScreen(dpy)),
++				    CWOverrideRedirect, &attr);
++		if (flags & COMPOSITE)
++			XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++		XMapWindow(dpy, win);
++
++		DRI2CreateDrawable(dpy, win);
++		/* Iteration count shrinks with the position of the divisor's lowest set bit. */
++		loop = 256 >> ffs(divisors[n]);
++		printf("DRI2SwapBuffers(divisor=%d), loop=%d", divisors[n], loop);
++		fflush(stdout);
++		do {
++			int w, h;
++
++			buffers = DRI2GetBuffers(dpy, win, &w, &h,
++					attachments, nattachments, &count);
++			if (count != nattachments)
++				return;
++
++			free(buffers);
++			for (count = 0; count < loop; count++)
++				DRI2SwapBuffers(dpy, win, 0, divisors[n], count & (divisors[n]-1));
++			XResizeWindow(dpy, win, rand_size(width), rand_size(height));
++			printf("."); fflush(stdout);
++		} while (--loop);
++		XDestroyWindow(dpy, win);
++		XSync(dpy, True);
++		printf("*\n");
++	}
++	/* Second pass: same shape, but the burst goes through swap_buffers() (defined outside this view). */
++	for (n = 0; n < N_DIVISORS; n++) {
+ 		win = XCreateWindow(dpy, DefaultRootWindow(dpy),
+ 				    0, 0, width, height, 0,
+ 				    DefaultDepth(dpy, DefaultScreen(dpy)),
+ 				    InputOutput,
+ 				    DefaultVisual(dpy, DefaultScreen(dpy)),
+ 				    CWOverrideRedirect, &attr);
++		if (flags & COMPOSITE)
++			XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
+ 		XMapWindow(dpy, win);
+ 
+ 		DRI2CreateDrawable(dpy, win);
+ 
+-		buffers = DRI2GetBuffers(dpy, win, &width, &height,
+-					 attachments, nattachments, &count);
+-		if (count != nattachments)
+-			return;
++		loop = 256 >> ffs(divisors[n]);
++		printf("xcb_dri2_swap_buffers(divisor=%d), loops=%d", divisors[n], loop);
++		fflush(stdout);
++		do {
++			int w, h;
++
++			buffers = DRI2GetBuffers(dpy, win, &w, &h,
++					attachments, nattachments, &count);
++			if (count != nattachments)
++				return;
+ 
+-		free(buffers);
+-		for (count = 0; count < loop; count++)
+-			DRI2SwapBuffers(dpy, win, 0, 0, 0);
++			free(buffers);
++			for (count = 0; count < loop; count++)
++				swap_buffers(dpy, win, divisors[n], attachments, nattachments);
++			XResizeWindow(dpy, win, rand_size(width), rand_size(height));
++			printf("."); fflush(stdout);
++		} while (--loop);
+ 		XDestroyWindow(dpy, win);
+-	} while (--loop);
++		XSync(dpy, True);
++		printf("*\n");
++	}
++	/* Third pass: queue xcb_dri2_wait_msc requests (replies discarded) before each resize. */
++	for (n = 0; n < N_DIVISORS; n++) {
++		win = XCreateWindow(dpy, DefaultRootWindow(dpy),
++				    0, 0, width, height, 0,
++				    DefaultDepth(dpy, DefaultScreen(dpy)),
++				    InputOutput,
++				    DefaultVisual(dpy, DefaultScreen(dpy)),
++				    CWOverrideRedirect, &attr);
++		if (flags & COMPOSITE)
++			XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++		XMapWindow(dpy, win);
++
++		DRI2CreateDrawable(dpy, win);
++
++		loop = 256 >> ffs(divisors[n]);
++		printf("DRI2WaitMsc(divisor=%d), loop=%d", divisors[n], loop);
++		fflush(stdout);
++		do {
++			uint64_t ignore, msc;
++			xcb_connection_t *c = XGetXCBConnection(dpy);
++
++			DRI2GetMSC(dpy, win, &ignore, &msc, &ignore);
++			msc++;
++			for (count = 0; count < loop; count++) {
++				xcb_discard_reply(c,
++						xcb_dri2_wait_msc(c, win,
++							upper_32_bits(msc),
++							lower_32_bits(msc),
++							0, 0, 0, 0).sequence);
++				msc += divisors[n];
++			}
++			XFlush(dpy);
++			XResizeWindow(dpy, win, rand_size(width), rand_size(height));
++			printf("."); fflush(stdout);
++		} while (--loop);
++		XDestroyWindow(dpy, win);
++		XSync(dpy, True);
++		printf("*\n");
++	}
++
++	XSync(dpy, 1);
++	sleep(2);
++	XSync(dpy, 1);
++}
++
++static void race_manager(Display *dpy, int width, int height,
++			 unsigned int *attachments, int nattachments,
++			 unsigned flags, const char *name)
++{	/* Windows are created and driven through dpy, but destroyed through the second connection mgr. */
++	Display *mgr;
++	Window win;
++	XSetWindowAttributes attr;
++	int count, loop, n;
++	DRI2Buffer *buffers;
++
++	if (flags & COMPOSITE && !has_composite(dpy))
++		return;
++	mgr = XOpenDisplay(NULL); /* opened after the early-out above so it cannot leak */
++	printf("%s(%s)\n", __func__, name);
++
++	/* Be nasty and install a fullscreen window on top so that we
++	 * can guarantee we do not get clipped by children.
++	 */
++	attr.override_redirect = 1;
++	for (n = 0; n < N_DIVISORS; n++) {
++		printf("DRI2SwapBuffers(divisor=%d)", divisors[n]);
++		fflush(stdout);
++		loop = 256 >> ffs(divisors[n]);
++		do {
++			win = XCreateWindow(dpy, DefaultRootWindow(dpy),
++					0, 0, width, height, 0,
++					DefaultDepth(dpy, DefaultScreen(dpy)),
++					InputOutput,
++					DefaultVisual(dpy, DefaultScreen(dpy)),
++					CWOverrideRedirect, &attr);
++			if (flags & COMPOSITE)
++				XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++			XMapWindow(dpy, win);
++
++			DRI2CreateDrawable(dpy, win);
++
++			buffers = DRI2GetBuffers(dpy, win, &width, &height,
++					attachments, nattachments, &count);
++			free(buffers); /* only count is needed; release before any exit */
++			if (count != nattachments)
++				goto out; /* close mgr instead of leaking it */
++
++			for (count = 0; count < loop; count++)
++				DRI2SwapBuffers(dpy, win, 0, divisors[n], count & (divisors[n]-1));
++			XFlush(dpy);
++			XDestroyWindow(mgr, win);
++			XFlush(mgr);
++			printf("."); fflush(stdout);
++		} while (--loop);
++		printf("*\n");
++	}
++
++	for (n = 0; n < N_DIVISORS; n++) {
++		printf("xcb_dri2_swap_buffers(divisor=%d)", divisors[n]);
++		fflush(stdout);
++		loop = 256 >> ffs(divisors[n]);
++		do {
++			win = XCreateWindow(dpy, DefaultRootWindow(dpy),
++					0, 0, width, height, 0,
++					DefaultDepth(dpy, DefaultScreen(dpy)),
++					InputOutput,
++					DefaultVisual(dpy, DefaultScreen(dpy)),
++					CWOverrideRedirect, &attr);
++			if (flags & COMPOSITE)
++				XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++			XMapWindow(dpy, win);
++
++			DRI2CreateDrawable(dpy, win);
++
++			buffers = DRI2GetBuffers(dpy, win, &width, &height,
++					attachments, nattachments, &count);
++			free(buffers); /* only count is needed; release before any exit */
++			if (count != nattachments)
++				goto out; /* close mgr instead of leaking it */
++
++			for (count = 0; count < loop; count++)
++				swap_buffers(dpy, win, divisors[n], attachments, nattachments);
++			XFlush(dpy);
++			XDestroyWindow(mgr, win);
++			XFlush(mgr);
++			printf("."); fflush(stdout);
++		} while (--loop);
++		printf("*\n");
++	}
++
++	for (n = 0; n < N_DIVISORS; n++) {
++		printf("DRI2WaitMsc(divisor=%d)", divisors[n]);
++		fflush(stdout);
++		loop = 256 >> ffs(divisors[n]);
++		do {
++			uint64_t ignore, msc;
++			xcb_connection_t *c = XGetXCBConnection(dpy);
++
++			win = XCreateWindow(dpy, DefaultRootWindow(dpy),
++					0, 0, width, height, 0,
++					DefaultDepth(dpy, DefaultScreen(dpy)),
++					InputOutput,
++					DefaultVisual(dpy, DefaultScreen(dpy)),
++					CWOverrideRedirect, &attr);
++			if (flags & COMPOSITE)
++				XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++			XMapWindow(dpy, win);
++
++			DRI2CreateDrawable(dpy, win);
++			DRI2GetMSC(dpy, win, &ignore, &msc, &ignore);
++			msc++;
++			for (count = 0; count < loop; count++) {
++				xcb_discard_reply(c,
++						xcb_dri2_wait_msc(c, win,
++							upper_32_bits(msc),
++							lower_32_bits(msc),
++							0, 0, 0, 0).sequence);
++				msc += divisors[n];
++			}
++			XFlush(dpy);
++			XDestroyWindow(mgr, win);
++			XFlush(mgr);
++			printf("."); fflush(stdout);
++		} while (--loop);
++		printf("*\n");
++	}
+ 
+ 	XSync(dpy, 1);
++	XSync(mgr, 1);
+ 	sleep(2);
+ 	XSync(dpy, 1);
++	XSync(mgr, 1);
++out:	/* bail-outs from the loops above land here so mgr is always closed */
++	XCloseDisplay(mgr);
++}
++
++static void race_close(int width, int height,
++		       unsigned int *attachments, int nattachments,
++		       unsigned flags, const char *name)
++{	/* Each iteration opens its own connection, issues a burst of requests, then closes the connection. */
++	XSetWindowAttributes attr;
++	int count, loop, n;
++
++	if (flags & COMPOSITE && !has_composite(NULL))
++		return;
++
++	printf("%s(%s)\n", __func__, name);
++
++	/* Be nasty and install a fullscreen window on top so that we
++	 * can guarantee we do not get clipped by children.
++	 */
++	attr.override_redirect = 1;
++	for (n = 0; n < N_DIVISORS; n++) {
++		printf("DRI2SwapBuffers(divisor=%d)", divisors[n]);
++		fflush(stdout);
++		loop = 256 >> ffs(divisors[n]);
++		do {
++			Display *dpy = XOpenDisplay(NULL);
++			Window win = XCreateWindow(dpy, DefaultRootWindow(dpy),
++					0, 0, width, height, 0,
++					DefaultDepth(dpy, DefaultScreen(dpy)),
++					InputOutput,
++					DefaultVisual(dpy, DefaultScreen(dpy)),
++					CWOverrideRedirect, &attr);
++			if (flags & COMPOSITE)
++				XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++			XMapWindow(dpy, win);
++			DRI2CreateDrawable(dpy, win);
++			free(DRI2GetBuffers(dpy, win, &width, &height,
++						attachments, nattachments, &count));
++			if (count != nattachments) {
++				XCloseDisplay(dpy); /* do not leak this iteration's connection */
++				return;
++			}
++			for (count = 0; count < loop; count++)
++				DRI2SwapBuffers(dpy, win, 0, divisors[n], count & (divisors[n]-1));
++			XCloseDisplay(dpy);
++			printf("."); fflush(stdout);
++		} while (--loop);
++		printf("*\n");
++	}
++
++	for (n = 0; n < N_DIVISORS; n++) {
++		printf("xcb_dri2_swap_buffers(divisor=%d)", divisors[n]);
++		fflush(stdout);
++		loop = 256 >> ffs(divisors[n]);
++		do {
++			Display *dpy = XOpenDisplay(NULL);
++			Window win = XCreateWindow(dpy, DefaultRootWindow(dpy),
++					0, 0, width, height, 0,
++					DefaultDepth(dpy, DefaultScreen(dpy)),
++					InputOutput,
++					DefaultVisual(dpy, DefaultScreen(dpy)),
++					CWOverrideRedirect, &attr);
++			if (flags & COMPOSITE)
++				XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++			XMapWindow(dpy, win);
++			DRI2CreateDrawable(dpy, win);
++			free(DRI2GetBuffers(dpy, win, &width, &height,
++						attachments, nattachments, &count));
++			if (count != nattachments) {
++				XCloseDisplay(dpy); /* do not leak this iteration's connection */
++				return;
++			}
++			for (count = 0; count < loop; count++)
++				swap_buffers(dpy, win, divisors[n], attachments, nattachments);
++			XCloseDisplay(dpy);
++			printf("."); fflush(stdout);
++		} while (--loop);
++		printf("*\n");
++	}
++
++	for (n = 0; n < N_DIVISORS; n++) {
++		printf("DRI2WaitMsc(divisor=%d)", divisors[n]);
++		fflush(stdout);
++		loop = 256 >> ffs(divisors[n]);
++		do {
++			uint64_t ignore, msc;
++			Display *dpy = XOpenDisplay(NULL);
++			xcb_connection_t *c = XGetXCBConnection(dpy);
++			Window win = XCreateWindow(dpy, DefaultRootWindow(dpy),
++					0, 0, width, height, 0,
++					DefaultDepth(dpy, DefaultScreen(dpy)),
++					InputOutput,
++					DefaultVisual(dpy, DefaultScreen(dpy)),
++					CWOverrideRedirect, &attr);
++			if (flags & COMPOSITE)
++				XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++			XMapWindow(dpy, win);
++
++			DRI2CreateDrawable(dpy, win);
++			DRI2GetMSC(dpy, win, &ignore, &msc, &ignore);
++			msc++;
++			for (count = 0; count < loop; count++) {
++				xcb_discard_reply(c,
++						xcb_dri2_wait_msc(c, win,
++							upper_32_bits(msc),
++							lower_32_bits(msc),
++							0, 0, 0, 0).sequence);
++				msc += divisors[n];
++			}
++			XFlush(dpy);
++			XCloseDisplay(dpy);
++			printf("."); fflush(stdout);
++		} while (--loop);
++		printf("*\n");
++	}
++}
++
++static void race_client(int width, int height,
++			unsigned int *attachments, int nattachments,
++			unsigned flags, const char *name)
++{	/* Each iteration opens a fresh client, issues requests, then has mgr XKillClient() it before closing. */
++	Display *mgr;
++	XSetWindowAttributes attr;
++	int count, loop, n;
++
++	if (flags & COMPOSITE && !has_composite(NULL))
++		return;
++	mgr = XOpenDisplay(NULL); /* opened after the early-out above so it cannot leak */
++	printf("%s(%s)\n", __func__, name);
++
++	/* Be nasty and install a fullscreen window on top so that we
++	 * can guarantee we do not get clipped by children.
++	 */
++	attr.override_redirect = 1;
++	for (n = 0; n < N_DIVISORS; n++) {
++		printf("DRI2SwapBuffers(divisor=%d)", divisors[n]);
++		fflush(stdout);
++		loop = 256 >> ffs(divisors[n]);
++		do {
++			Display *dpy = XOpenDisplay(NULL);
++			Window win;
++
++			if (error_get()) {
++				XCloseDisplay(dpy);
++				printf("+"); fflush(stdout);
++				continue;
++			}
++
++			win = XCreateWindow(dpy, DefaultRootWindow(dpy),
++					    0, 0, width, height, 0,
++					    DefaultDepth(dpy, DefaultScreen(dpy)),
++					    InputOutput,
++					    DefaultVisual(dpy, DefaultScreen(dpy)),
++					    CWOverrideRedirect, &attr);
++			if (flags & COMPOSITE)
++				XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++			XMapWindow(dpy, win);
++
++			DRI2CreateDrawable(dpy, win);
++			free(DRI2GetBuffers(dpy, win, &width, &height,
++					    attachments, nattachments, &count));
++			if (count == nattachments) {
++				for (count = 0; count < loop; count++)
++					DRI2SwapBuffers(dpy, win, 0, divisors[n], count & (divisors[n]-1));
++			}
++
++			XFlush(dpy);
++			XKillClient(mgr, win);
++			XFlush(mgr);
++
++			XCloseDisplay(dpy);
++			printf("."); fflush(stdout);
++
++			error_put();
++		} while (--loop);
++		printf("*\n");
++	}
++
++	for (n = 0; n < N_DIVISORS; n++) {
++		printf("xcb_dri2_swap_buffers(divisor=%d)", divisors[n]);
++		fflush(stdout);
++		loop = 256 >> ffs(divisors[n]);
++		do {
++			Display *dpy = XOpenDisplay(NULL);
++			Window win;
++
++			if (error_get()) {
++				XCloseDisplay(dpy);
++				printf("+"); fflush(stdout);
++				continue;
++			}
++
++			win = XCreateWindow(dpy, DefaultRootWindow(dpy),
++					    0, 0, width, height, 0,
++					    DefaultDepth(dpy, DefaultScreen(dpy)),
++					    InputOutput,
++					    DefaultVisual(dpy, DefaultScreen(dpy)),
++					    CWOverrideRedirect, &attr);
++			if (flags & COMPOSITE)
++				XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++			XMapWindow(dpy, win);
++
++			DRI2CreateDrawable(dpy, win);
++			free(DRI2GetBuffers(dpy, win, &width, &height,
++					    attachments, nattachments, &count));
++			if (count == nattachments) {
++				for (count = 0; count < loop; count++)
++					swap_buffers(dpy, win, divisors[n], attachments, nattachments);
++			}
++
++			XFlush(dpy);
++			XKillClient(mgr, win);
++			XFlush(mgr);
++
++			XCloseDisplay(dpy);
++			printf("."); fflush(stdout);
++
++			error_put();
++		} while (--loop);
++		printf("*\n");
++	}
++
++	for (n = 0; n < N_DIVISORS; n++) {
++		printf("DRI2WaitMsc(divisor=%d)", divisors[n]);
++		fflush(stdout);
++		loop = 256 >> ffs(divisors[n]);
++		do {
++			Display *dpy = XOpenDisplay(NULL);
++			uint64_t ignore, msc;
++			xcb_connection_t *c;
++			Window win;
++
++			if (error_get()) {
++				XCloseDisplay(dpy);
++				printf("+"); fflush(stdout);
++				continue;
++			}
++
++			win = XCreateWindow(dpy, DefaultRootWindow(dpy),
++					    0, 0, width, height, 0,
++					    DefaultDepth(dpy, DefaultScreen(dpy)),
++					    InputOutput,
++					    DefaultVisual(dpy, DefaultScreen(dpy)),
++					    CWOverrideRedirect, &attr);
++			if (flags & COMPOSITE)
++				XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++			XMapWindow(dpy, win);
++
++			DRI2CreateDrawable(dpy, win);
++			DRI2GetMSC(dpy, win, &ignore, &msc, &ignore);
++			c = XGetXCBConnection(dpy);
++			msc++;
++			for (count = 0; count < loop; count++) {
++				xcb_discard_reply(c,
++						  xcb_dri2_wait_msc(c, win,
++								    upper_32_bits(msc),
++								    lower_32_bits(msc),
++								    0, 0, 0, 0).sequence);
++				msc += divisors[n];
++			}
++
++			XFlush(dpy);
++			XKillClient(mgr, win);
++			XFlush(mgr);
++
++			XCloseDisplay(dpy);
++			printf("."); fflush(stdout);
++
++			error_put();
++		} while (--loop);
++		printf("*\n");
++	}
++
++	XCloseDisplay(mgr);
+ }
+ 
+ int main(void)
+@@ -91,7 +790,10 @@ int main(void)
+ 		DRI2BufferFrontLeft,
+ 	};
+ 
+-	dpy = XOpenDisplay (NULL);
++	saved_io_error = XSetIOErrorHandler(io_error);
++	XSetErrorHandler(x_error);
++
++	dpy = XOpenDisplay(NULL);
+ 	if (dpy == NULL)
+ 		return 77;
+ 
+@@ -101,13 +803,52 @@ int main(void)
+ 
+ 	width = WidthOfScreen(DefaultScreenOfDisplay(dpy));
+ 	height = HeightOfScreen(DefaultScreenOfDisplay(dpy));
+-	run(dpy, width, height, attachments, 1, "fullscreen");
+-	run(dpy, width, height, attachments, 2, "fullscreen (with front)");
++	race_window(dpy, width, height, attachments, 1, 0, "fullscreen");
++	race_window(dpy, width, height, attachments, 1, COMPOSITE, "composite fullscreen");
++	race_window(dpy, width, height, attachments, 2, 0, "fullscreen (with front)");
++	race_window(dpy, width, height, attachments, 2, COMPOSITE, "composite fullscreen (with front)");
++
++	race_resize(dpy, width, height, attachments, 1, 0, "");
++	race_resize(dpy, width, height, attachments, 1, COMPOSITE, "composite");
++	race_resize(dpy, width, height, attachments, 2, 0, "with front");
++	race_resize(dpy, width, height, attachments, 2, COMPOSITE, "composite with front");
++
++	race_manager(dpy, width, height, attachments, 1, 0, "fullscreen");
++	race_manager(dpy, width, height, attachments, 1, COMPOSITE, "composite fullscreen");
++	race_manager(dpy, width, height, attachments, 2, 0, "fullscreen (with front)");
++	race_manager(dpy, width, height, attachments, 2, COMPOSITE, "composite fullscreen (with front)");
++
++	race_close(width, height, attachments, 1, 0, "fullscreen");
++	race_close(width, height, attachments, 1, COMPOSITE, "composite fullscreen");
++	race_close(width, height, attachments, 2, 0, "fullscreen (with front)");
++	race_close(width, height, attachments, 2, COMPOSITE, "composite fullscreen (with front)");
++
++	race_client(width, height, attachments, 1, 0, "fullscreen");
++	race_client(width, height, attachments, 1, COMPOSITE, "composite fullscreen");
++	race_client(width, height, attachments, 2, 0, "fullscreen (with front)");
++	race_client(width, height, attachments, 2, COMPOSITE, "composite fullscreen (with front)");
+ 
+ 	width /= 2;
+ 	height /= 2;
+-	run(dpy, width, height, attachments, 1, "windowed");
+-	run(dpy, width, height, attachments, 2, "windowed (with front)");
++	race_window(dpy, width, height, attachments, 1, 0, "windowed");
++	race_window(dpy, width, height, attachments, 1, COMPOSITE, "composite windowed");
++	race_window(dpy, width, height, attachments, 2, 0, "windowed (with front)");
++	race_window(dpy, width, height, attachments, 2, COMPOSITE, "composite windowed (with front)");
++
++	race_manager(dpy, width, height, attachments, 1, 0, "windowed");
++	race_manager(dpy, width, height, attachments, 1, COMPOSITE, "composite windowed");
++	race_manager(dpy, width, height, attachments, 2, 0, "windowed (with front)");
++	race_manager(dpy, width, height, attachments, 2, COMPOSITE, "composite windowed (with front)");
++
++	race_close(width, height, attachments, 1, 0, "windowed");
++	race_close(width, height, attachments, 1, COMPOSITE, "composite windowed");
++	race_close(width, height, attachments, 2, 0, "windowed (with front)");
++	race_close(width, height, attachments, 2, COMPOSITE, "composite windowed (with front)");
++
++	race_client(width, height, attachments, 1, 0, "windowed");
++	race_client(width, height, attachments, 1, COMPOSITE, "composite windowed");
++	race_client(width, height, attachments, 2, 0, "windowed (with front)");
++	race_client(width, height, attachments, 2, COMPOSITE, "composite windowed (with front)");
+ 
+ 	return 0;
+ }
+diff --git a/test/dri2-speed.c b/test/dri2-speed.c
+new file mode 100644
+index 00000000..87b9d0b6
+--- /dev/null
++++ b/test/dri2-speed.c
+@@ -0,0 +1,342 @@
++/*
++ * Copyright (c) 2015 Intel Corporation
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ *
++ */
++
++#ifdef HAVE_CONFIG_H
++#include "config.h"
++#endif
++
++#include <X11/Xlib.h>
++#include <X11/Xatom.h>
++#include <X11/Xlib-xcb.h>
++#include <X11/Xutil.h>
++#include <X11/Xlibint.h>
++#include <X11/extensions/dpms.h>
++#include <X11/extensions/randr.h>
++#include <X11/extensions/Xcomposite.h>
++#include <X11/extensions/Xdamage.h>
++#include <X11/extensions/Xrandr.h>
++#include <xcb/xcb.h>
++#include <xcb/dri2.h>
++#include <xf86drm.h>
++
++#include <stdio.h>
++#include <string.h>
++#include <fcntl.h>
++#include <unistd.h>
++#include <assert.h>
++#include <errno.h>
++#include <setjmp.h>
++#include <signal.h>
++
++#include "dri2.h"
++
++static int _x_error_occurred;
++
++static int
++_check_error_handler(Display     *display,
++		     XErrorEvent *event)
++{	/* Xlib error handler: print the error and count it in _x_error_occurred. */
++	printf("X11 error from display %s, serial=%lu, error=%d, req=%d.%d\n",
++	       DisplayString(display),
++	       event->serial, /* unsigned long in XErrorEvent, hence %lu */
++	       event->error_code,
++	       event->request_code,
++	       event->minor_code);
++	_x_error_occurred++;
++	return False; /* ignored */
++}
++
++static double elapsed(const struct timespec *start,
++		      const struct timespec *end)
++{	/* Microseconds from *start to *end; divides in floating point to keep the sub-us fraction. */
++	return 1e6*(end->tv_sec - start->tv_sec) + (end->tv_nsec - start->tv_nsec)/1000.;
++}
++
++static void run(Display *dpy, Window win, const char *name)
++{
++	/* Benchmark: issue swap + get-buffers request pairs for roughly ten seconds. */
++	xcb_connection_t *conn = XGetXCBConnection(dpy);
++	unsigned int attachments[] = { DRI2BufferBackLeft };
++	struct timespec start, end;
++	int batch, completed = 0;
++	double usec;
++
++	_x_error_occurred = 0;
++	clock_gettime(CLOCK_MONOTONIC, &start);
++	do {
++		for (batch = 1000; batch > 0; batch--) {
++			unsigned int swap_seq, bufs_seq;
++
++			swap_seq = xcb_dri2_swap_buffers_unchecked(conn, win,
++								   0, 0, 0, 0, 0, 0).sequence;
++			bufs_seq = xcb_dri2_get_buffers_unchecked(conn, win,
++								  1, 1, attachments).sequence;
++
++			/* Flush both requests, then discard their replies unread. */
++			xcb_flush(conn);
++			xcb_discard_reply(conn, swap_seq);
++			xcb_discard_reply(conn, bufs_seq);
++			completed++;
++		}
++		clock_gettime(CLOCK_MONOTONIC, &end);
++	} while (end.tv_sec < start.tv_sec + 10);
++
++	XSync(dpy, True);
++	if (_x_error_occurred)
++		abort();
++
++	usec = elapsed(&start, &end);
++	printf("%s: Completed %d swaps in %.1fs, %.3fus each (%.1f FPS)\n",
++	       name, completed, usec / 1000000, usec / completed,
++	       completed / (usec / 1000000));
++}
++
++static inline XRRScreenResources *_XRRGetScreenResourcesCurrent(Display *dpy, Window window)
++{
++	/* Try XRRGetScreenResourcesCurrent() first; fall back to XRRGetScreenResources() on NULL. */
++	XRRScreenResources *current = XRRGetScreenResourcesCurrent(dpy, window);
++
++	if (current != NULL)
++		return current;
++
++	return XRRGetScreenResources(dpy, window);
++}
++
++static XRRModeInfo *lookup_mode(XRRScreenResources *res, int id)
++{
++	/* Linear search of res->modes for the first entry whose id equals id; NULL if none. */
++	XRRModeInfo *mode = res->modes;
++	int left;
++
++	for (left = res->nmode; left > 0; left--, mode++)
++		if (mode->id == id)
++			return mode;
++	return NULL;
++}
++
++static int dri2_open(Display *dpy)
++{
++	drm_auth_t auth;
++	char *driver, *device;
++	int fd;
++	/* DRI2Connect on the root window, open the reported device, authenticate; returns fd or -1. */
++	if (!DRI2Connect(dpy, DefaultRootWindow(dpy), &driver, &device))
++		return -1;
++
++	printf ("Connecting to %s driver on %s\n", driver, device);
++
++	fd = open(device, O_RDWR);
++	if (fd < 0)
++		return -1;
++
++	/* From here on a failure must close fd again, or the descriptor leaks. */
++	if (drmIoctl(fd, DRM_IOCTL_GET_MAGIC, &auth) ||
++	    !DRI2Authenticate(dpy, DefaultRootWindow(dpy), auth.magic)) {
++		close(fd);
++		return -1;
++	}
++	return fd;
++}
++
++static void fullscreen(Display *dpy, Window win)
++{	/* Replace win's _NET_WM_STATE property with the single atom _NET_WM_STATE_FULLSCREEN. */
++	Atom value = XInternAtom(dpy, "_NET_WM_STATE_FULLSCREEN", False);
++	Atom state = XInternAtom(dpy, "_NET_WM_STATE", False);
++
++	XChangeProperty(dpy, win, state, XA_ATOM, 32, PropModeReplace,
++			(unsigned char *)&value, 1);
++}
++
++static int has_composite(Display *dpy)
++{
++	/* Nonzero when Damage and Composite both answer and Composite reports major > 0 or minor >= 4. */
++	int ev_base, err_base;
++	int major, minor;
++
++	if (!XDamageQueryExtension(dpy, &ev_base, &err_base) ||
++	    !XCompositeQueryExtension(dpy, &ev_base, &err_base))
++		return 0;
++
++	XCompositeQueryVersion(dpy, &major, &minor);
++	if (major > 0)
++		return 1;
++	return minor >= 4;
++}
++
++int main(void)
++{
++	Display *dpy;
++	Window root, win;
++	XRRScreenResources *res;
++	XRRCrtcInfo **original_crtc;
++	XSetWindowAttributes attr;
++	int i, j, fd;
++	/* Benchmarks run() with CRTCs off, then per output/CRTC on several window setups; restores CRTCs at the end. */
++	attr.override_redirect = 1;
++
++	dpy = XOpenDisplay(NULL);
++	if (dpy == NULL)
++		return 77;
++
++	fd = dri2_open(dpy);
++	if (fd < 0)
++		return 77;
++
++	if (DPMSQueryExtension(dpy, &i, &i))
++		DPMSDisable(dpy);
++
++	root = DefaultRootWindow(dpy);
++
++	signal(SIGALRM, SIG_IGN);
++	XSetErrorHandler(_check_error_handler);
++
++	res = NULL;
++	if (XRRQueryVersion(dpy, &i, &i))
++		res = _XRRGetScreenResourcesCurrent(dpy, root);
++	if (res == NULL)
++		return 77;
++
++	original_crtc = malloc(sizeof(XRRCrtcInfo *)*res->ncrtc);
++	for (i = 0; i < res->ncrtc; i++)
++		original_crtc[i] = XRRGetCrtcInfo(dpy, res, res->crtcs[i]);
++
++	printf("noutput=%d, ncrtc=%d\n", res->noutput, res->ncrtc);
++	for (i = 0; i < res->ncrtc; i++)
++		XRRSetCrtcConfig(dpy, res, res->crtcs[i], CurrentTime,
++				 0, 0, None, RR_Rotate_0, NULL, 0);
++
++	DRI2CreateDrawable(dpy, root);
++	DRI2SwapInterval(dpy, root, 0);
++	run(dpy, root, "off");
++	XSync(dpy, True);
++
++	for (i = 0; i < res->noutput; i++) {
++		XRROutputInfo *output;
++		XRRModeInfo *mode;
++
++		output = XRRGetOutputInfo(dpy, res, res->outputs[i]);
++		if (output == NULL)
++			continue;
++		/* Only read output->modes[0] when this output actually lists a mode. */
++		mode = NULL;
++		if (output->nmode)
++			mode = lookup_mode(res, output->modes[0]);
++
++		for (j = 0; mode && j < 2*output->ncrtc; j++) {
++			int c = j;
++			if (c >= output->ncrtc)
++				c = 2*output->ncrtc - j - 1;
++
++			printf("[%d, %d] -- OUTPUT:%ld, CRTC:%ld: %dx%d\n",
++			       i, c, (long)res->outputs[i], (long)output->crtcs[c],
++			       mode->width, mode->height);
++			XRRSetCrtcConfig(dpy, res, output->crtcs[c], CurrentTime,
++					 0, 0, output->modes[0], RR_Rotate_0, &res->outputs[i], 1);
++
++			run(dpy, root, "root");
++			XSync(dpy, True);
++
++			win = XCreateWindow(dpy, root,
++					    0, 0, mode->width, mode->height, 0,
++					    DefaultDepth(dpy, DefaultScreen(dpy)),
++					    InputOutput,
++					    DefaultVisual(dpy, DefaultScreen(dpy)),
++					    CWOverrideRedirect, &attr);
++			DRI2CreateDrawable(dpy, win);
++			DRI2SwapInterval(dpy, win, 0);
++			fullscreen(dpy, win);
++			XMapWindow(dpy, win);
++			run(dpy, win, "fullscreen");
++			XDestroyWindow(dpy, win);
++			XSync(dpy, True);
++
++			win = XCreateWindow(dpy, root,
++					    0, 0, mode->width, mode->height, 0,
++					    DefaultDepth(dpy, DefaultScreen(dpy)),
++					    InputOutput,
++					    DefaultVisual(dpy, DefaultScreen(dpy)),
++					    CWOverrideRedirect, &attr);
++			DRI2CreateDrawable(dpy, win);
++			DRI2SwapInterval(dpy, win, 0);
++			XMapWindow(dpy, win);
++			run(dpy, win, "windowed");
++			XDestroyWindow(dpy, win);
++			XSync(dpy, True);
++
++			if (has_composite(dpy)) {
++				Damage damage;
++
++				_x_error_occurred = 0;
++				win = XCreateWindow(dpy, root,
++						    0, 0, mode->width, mode->height, 0,
++						    DefaultDepth(dpy, DefaultScreen(dpy)),
++						    InputOutput,
++						    DefaultVisual(dpy, DefaultScreen(dpy)),
++						    CWOverrideRedirect, &attr);
++				XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++				damage = XDamageCreate(dpy, win, XDamageReportRawRectangles);
++				DRI2CreateDrawable(dpy, win);
++				DRI2SwapInterval(dpy, win, 0);
++				XMapWindow(dpy, win);
++				XSync(dpy, True);
++				if (!_x_error_occurred)
++					run(dpy, win, "composited");
++				XDamageDestroy(dpy, damage);
++				XDestroyWindow(dpy, win);
++				XSync(dpy, True);
++			}
++
++			win = XCreateWindow(dpy, root,
++					    0, 0, mode->width/2, mode->height/2, 0,
++					    DefaultDepth(dpy, DefaultScreen(dpy)),
++					    InputOutput,
++					    DefaultVisual(dpy, DefaultScreen(dpy)),
++					    CWOverrideRedirect, &attr);
++			DRI2CreateDrawable(dpy, win);
++			DRI2SwapInterval(dpy, win, 0);
++			XMapWindow(dpy, win);
++			run(dpy, win, "half");
++			XDestroyWindow(dpy, win);
++			XSync(dpy, True);
++
++			XRRSetCrtcConfig(dpy, res, output->crtcs[c], CurrentTime,
++					 0, 0, None, RR_Rotate_0, NULL, 0);
++		}
++
++		XRRFreeOutputInfo(output);
++	}
++
++	for (i = 0; i < res->ncrtc; i++)
++		XRRSetCrtcConfig(dpy, res, res->crtcs[i], CurrentTime,
++				 original_crtc[i]->x,
++				 original_crtc[i]->y,
++				 original_crtc[i]->mode,
++				 original_crtc[i]->rotation,
++				 original_crtc[i]->outputs,
++				 original_crtc[i]->noutput);
++
++	if (DPMSQueryExtension(dpy, &i, &i))
++		DPMSEnable(dpy);
++	return 0;
++}
+diff --git a/test/dri2-test.c b/test/dri2-test.c
+index dd4179f3..bdf01f38 100644
+--- a/test/dri2-test.c
++++ b/test/dri2-test.c
+@@ -6,6 +6,10 @@
+ #include <X11/Xutil.h>
+ #include <X11/extensions/Xfixes.h>
+ #include <X11/extensions/Xrandr.h>
++#include <X11/Xlib-xcb.h>
++#include <xcb/xcb.h>
++#include <xcb/xcbext.h>
++#include <xcb/dri2.h>
+ #include <unistd.h>
+ #include <fcntl.h>
+ #include <string.h>
+@@ -18,6 +22,8 @@
+ 
+ #define COUNT 60
+ 
++static int prime[] = { 0, 1, 2, 3, 5, 7, 11, 13, 17, 19, 23, 27, 29, 31, 37, 41, 43, 47, 51, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131 }; /* remainders for the wait-MSC tests; NOTE(review): 27 and 51 are not prime - confirm intent */
++
+ static inline XRRScreenResources *_XRRGetScreenResourcesCurrent(Display *dpy, Window window)
+ {
+ 	XRRScreenResources *res;
+@@ -101,16 +107,41 @@ static uint64_t check_msc(Display *dpy, Window win, uint64_t last_msc)
+ 	return current_msc;
+ }
+ 
++static void wait_next_vblank(Display *dpy, Window win)
++{	/* One DRI2WaitMSC with target 0, divisor 1, remainder 0; the returned counters are discarded. */
++	uint64_t ust_out, msc_out, sbc_out;
++	DRI2WaitMSC(dpy, win, 0, 1, 0, &ust_out, &msc_out, &sbc_out);
++}
++
++static void swap_buffers(xcb_connection_t *c, Window win,
++		unsigned int *attachments, int nattachments)
++{
++	/* Issue a swap followed by a get-buffers request; neither reply is read. */
++	unsigned int swap_seq, bufs_seq;
++
++	swap_seq = xcb_dri2_swap_buffers_unchecked(c, win,
++						   0, 0, 0, 0, 0, 0).sequence;
++	bufs_seq = xcb_dri2_get_buffers_unchecked(c, win,
++						  nattachments, nattachments,
++						  attachments).sequence;
++
++	/* Flush both requests, then discard their replies. */
++	xcb_flush(c);
++	xcb_discard_reply(c, swap_seq);
++	xcb_discard_reply(c, bufs_seq);
++}
++
+ static void run(Display *dpy, int width, int height,
+ 		unsigned int *attachments, int nattachments,
+ 		const char *name)
+ {
++	xcb_connection_t *c = XGetXCBConnection(dpy);
+ 	Window win;
+ 	XSetWindowAttributes attr;
+-	int count;
+ 	DRI2Buffer *buffers;
+ 	struct timespec start, end;
+-	uint64_t msc;
++	uint64_t start_msc, end_msc;
++	int modulus, remainder, count;
+ 
+ 	/* Be nasty and install a fullscreen window on top so that we
+ 	 * can guarantee we do not get clipped by children.
+@@ -125,42 +156,99 @@ static void run(Display *dpy, int width, int height,
+ 	XMapWindow(dpy, win);
+ 
+ 	DRI2CreateDrawable(dpy, win);
+-	msc = check_msc(dpy, win, 0);
++	DRI2SwapInterval(dpy, win, 1);
++	start_msc = check_msc(dpy, win, 0);
+ 
+ 	buffers = DRI2GetBuffers(dpy, win, &width, &height,
+ 				 attachments, nattachments, &count);
+ 	if (count != nattachments)
+ 		return;
+ 
+-	msc = check_msc(dpy, win, msc);
++	swap_buffers(c, win, attachments, nattachments);
++	start_msc = check_msc(dpy, win, start_msc);
+ 	clock_gettime(CLOCK_MONOTONIC, &start);
+ 	for (count = 0; count < COUNT; count++)
+-		DRI2SwapBuffers(dpy, win, 0, 0, 0);
+-	msc = check_msc(dpy, win, msc);
++		swap_buffers(c, win, attachments, nattachments);
++	end_msc = check_msc(dpy, win, start_msc);
+ 	clock_gettime(CLOCK_MONOTONIC, &end);
+-	printf("%d %s (%dx%d) swaps in %fs.\n",
+-	       count, name, width, height, elapsed(&start, &end));
++	printf("%d [%ld] %s (%dx%d) swaps in %fs.\n",
++	       count, (long)(end_msc - start_msc),
++	       name, width, height, elapsed(&start, &end));
+ 
+-	msc = check_msc(dpy, win, msc);
++	swap_buffers(c, win, attachments, nattachments);
++	start_msc = check_msc(dpy, win, end_msc);
+ 	clock_gettime(CLOCK_MONOTONIC, &start);
+ 	for (count = 0; count < COUNT; count++)
+ 		dri2_copy_swap(dpy, win, width, height, nattachments == 2);
+-	msc = check_msc(dpy, win, msc);
++	end_msc = check_msc(dpy, win, start_msc);
+ 	clock_gettime(CLOCK_MONOTONIC, &end);
+ 
+-	printf("%d %s (%dx%d) blits in %fs.\n",
+-	       count, name, width, height, elapsed(&start, &end));
++	printf("%d [%ld] %s (%dx%d) blits in %fs.\n",
++	       count, (long)(end_msc - start_msc),
++	       name, width, height, elapsed(&start, &end));
+ 
+ 	DRI2SwapInterval(dpy, win, 0);
++	wait_next_vblank(dpy, win);
++
++	swap_buffers(c, win, attachments, nattachments);
++	start_msc = check_msc(dpy, win, end_msc);
++	clock_gettime(CLOCK_MONOTONIC, &start);
++	for (count = 0; count < COUNT; count++)
++		swap_buffers(c, win, attachments, nattachments);
++	end_msc = check_msc(dpy, win, start_msc);
++	clock_gettime(CLOCK_MONOTONIC, &end);
++	printf("%d [%ld] %s (%dx%d) vblank=0 swaps in %fs.\n",
++	       count, (long)(end_msc - start_msc),
++	       name, width, height, elapsed(&start, &end));
+ 
+-	msc = check_msc(dpy, win, msc);
++	start_msc = check_msc(dpy, win, end_msc);
+ 	clock_gettime(CLOCK_MONOTONIC, &start);
+ 	for (count = 0; count < COUNT; count++)
+-		DRI2SwapBuffers(dpy, win, 0, 0, 0);
+-	msc = check_msc(dpy, win, msc);
++		wait_next_vblank(dpy, win);
++	end_msc = check_msc(dpy, win, start_msc);
+ 	clock_gettime(CLOCK_MONOTONIC, &end);
+-	printf("%d %s (%dx%d) vblank=0 swaps in %fs.\n",
+-	       count, name, width, height, elapsed(&start, &end));
++	printf("%d [%ld] %s waits in %fs.\n",
++	       count, (long)(end_msc - start_msc),
++	       name, elapsed(&start, &end));
++
++	printf("Testing past & future waits\n");
++	for (modulus = 1; modulus <= 128; modulus <<= 1) {
++		for (count = 0;  prime[count] < modulus; count++) {
++			uint64_t msc, ust, sbc;
++			uint64_t target;
++
++			remainder = prime[count];
++
++			DRI2WaitMSC(dpy, win, 0, 1, 0, &ust, &msc, &sbc);
++
++			target = msc + modulus + 1;
++			target &= -modulus;
++			target += remainder;
++
++			DRI2WaitMSC(dpy, win, target, modulus, remainder,
++				    &ust, &msc, &sbc);
++			if (msc != target) {
++				printf("Missed future MSC (%d, %d): expected=%lld, found=%lld\n",
++				       modulus, remainder,
++				       (long long)target, (long long)msc);
++			}
++
++			target = msc;
++			target &= -modulus;
++			target += remainder;
++			if (target <= msc)
++				target += modulus;
++
++			DRI2WaitMSC(dpy, win, msc, modulus, remainder,
++				    &ust, &msc, &sbc);
++
++			if (msc != target) {
++				printf("Missed past MSC (%d, %d): expected=%lld, found=%lld\n",
++				       modulus, remainder,
++				       (long long)target, (long long)msc);
++			}
++		}
++	}
+ 
+ 	XDestroyWindow(dpy, win);
+ 	free(buffers);
+diff --git a/test/dri3-test.c b/test/dri3-test.c
+index c66da313..78e105a8 100644
+--- a/test/dri3-test.c
++++ b/test/dri3-test.c
+@@ -93,14 +93,9 @@ static const struct pci_id_match ids[] = {
+ 	INTEL_IVB_D_IDS(070),
+ 	INTEL_IVB_M_IDS(070),
+ 
+-	INTEL_HSW_D_IDS(075),
+-	INTEL_HSW_M_IDS(075),
+-
+-	INTEL_VLV_D_IDS(071),
+-	INTEL_VLV_M_IDS(071),
+-
+-	INTEL_BDW_D_IDS(0100),
+-	INTEL_BDW_M_IDS(0100),
++	INTEL_HSW_IDS(075),
++	INTEL_VLV_IDS(071),
++	INTEL_BDW_IDS(0100),
+ };
+ 
+ static int i915_gen(int device)
+@@ -1020,6 +1015,67 @@ fail:
+ 	return 1;
+ }
+ 
++static int gem_set_tiling(int fd, uint32_t handle, int tiling, int stride)
++{
++	struct drm_i915_gem_set_tiling set_tiling;
++
++	set_tiling.handle = handle;
++	set_tiling.tiling_mode = tiling;
++	set_tiling.stride = stride;
++
++	return drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling) == 0;
++}
++
++static int test_tiling(Display *dpy, int device)
++{
++	Window root = RootWindow(dpy, DefaultScreen(dpy));
++	const int tiling[] = { I915_TILING_NONE, I915_TILING_X, I915_TILING_Y };
++	int line = -1;
++	int t;
++
++	_x_error_occurred = 0;
++
++	for (t = 0; t < sizeof(tiling)/sizeof(tiling[0]); t++) {
++		uint32_t src;
++		int src_fd;
++		Pixmap src_pix;
++
++		src = gem_create(device, 4*4096);
++		if (!src) {
++			line = __LINE__;
++			goto fail;
++		}
++
++		gem_set_tiling(device, src, tiling[t], 512);
++
++		src_fd = gem_export(device, src);
++		if (src_fd < 0) {
++			line = __LINE__;
++			goto fail;
++		}
++
++		src_pix = dri3_create_pixmap(dpy, root,
++					     128, 32, 32,
++					     src_fd, 32, 512, 4*4096);
++		XSync(dpy, True);
++		if (_x_error_occurred) {
++			line = __LINE__;
++			goto fail;
++		}
++		XFreePixmap(dpy, src_pix);
++		_x_error_occurred = 0;
++
++		close(src_fd);
++		gem_close(device, src);
++	}
++
++	return 0;
++
++fail:
++	printf("%s failed with tiling %d, line %d\n", __func__, tiling[t], line);
++	return 1;
++}
++
+ static int
+ _check_error_handler(Display     *display,
+ 		     XErrorEvent *event)
+@@ -1060,6 +1116,7 @@ int main(void)
+ 
+ 	error += test_bad_size(dpy, device);
+ 	error += test_bad_pitch(dpy, device);
++	error += test_tiling(dpy, device);
+ 
+ 	error += test_shm(dpy, device, 400, 300);
+ 	error += test_shm(dpy, device, 300, 400);
+diff --git a/test/dri3.c b/test/dri3.c
+index 45f3285c..e5644629 100644
+--- a/test/dri3.c
++++ b/test/dri3.c
+@@ -29,6 +29,7 @@
+ #include <xcb/dri3.h>
+ #include <xcb/sync.h>
+ #include <unistd.h>
++#include <stdlib.h>
+ 
+ #include "dri3.h"
+ 
+@@ -109,12 +110,45 @@ void dri3_fence_free(Display *dpy, struct dri3_fence *fence)
+ 	xcb_sync_destroy_fence(c, fence->xid);
+ }
+ 
++static void dri3_query_version(xcb_connection_t *c, int *major, int *minor)
++{
++	xcb_dri3_query_version_reply_t *reply;
++
++	reply = xcb_dri3_query_version_reply(c,
++					     xcb_dri3_query_version(c,
++								    XCB_DRI3_MAJOR_VERSION,
++								    XCB_DRI3_MINOR_VERSION),
++					     NULL);
++	if (reply != NULL) {
++		*major = reply->major_version;
++		*minor = reply->minor_version;
++		free(reply);
++	}
++}
++
++static int dri3_exists(xcb_connection_t *c)
++{
++	const xcb_query_extension_reply_t *ext;
++	int major, minor;
++
++	major = minor = -1;
++
++	ext = xcb_get_extension_data(c, &xcb_dri3_id);
++	if (ext != NULL && ext->present)
++		dri3_query_version(c, &major, &minor);
++
++	return major >= 0;
++}
++
+ int dri3_open__full(Display *dpy, Window root, unsigned provider)
+ {
+ 	xcb_connection_t *c = XGetXCBConnection(dpy);
+ 	xcb_dri3_open_cookie_t cookie;
+ 	xcb_dri3_open_reply_t *reply;
+ 
++	if (!dri3_exists(c))
++		return -1;
++
+ 	cookie = xcb_dri3_open(c, root, provider);
+ 	reply = xcb_dri3_open_reply(c, cookie, NULL);
+ 
+diff --git a/test/present-race.c b/test/present-race.c
+new file mode 100644
+index 00000000..b2b6aa2b
+--- /dev/null
++++ b/test/present-race.c
+@@ -0,0 +1,484 @@
++/*
++ * Copyright (c) 2014 Intel Corporation
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ *
++ */
++
++#ifdef HAVE_CONFIG_H
++#include "config.h"
++#endif
++
++#include <X11/Xlib.h>
++#include <X11/Xlib-xcb.h>
++#include <X11/xshmfence.h>
++#include <X11/Xutil.h>
++#include <X11/Xlibint.h>
++#include <X11/extensions/dpms.h>
++#include <X11/extensions/randr.h>
++#include <X11/extensions/Xcomposite.h>
++#include <X11/extensions/Xrandr.h>
++#include <X11/extensions/Xrender.h>
++#include <X11/extensions/XShm.h>
++#if HAVE_X11_EXTENSIONS_SHMPROTO_H
++#include <X11/extensions/shmproto.h>
++#elif HAVE_X11_EXTENSIONS_SHMSTR_H
++#include <X11/extensions/shmstr.h>
++#else
++#error Failed to find the right header for X11 MIT-SHM protocol definitions
++#endif
++#include <xcb/xcb.h>
++#include <xcb/present.h>
++#include <xcb/xfixes.h>
++#include <xcb/dri3.h>
++#include <xf86drm.h>
++#include <i915_drm.h>
++
++#include <stdio.h>
++#include <string.h>
++#include <fcntl.h>
++#include <unistd.h>
++#include <assert.h>
++#include <errno.h>
++#include <setjmp.h>
++#include <signal.h>
++
++#include <sys/mman.h>
++#include <sys/ipc.h>
++#include <sys/shm.h>
++#include <pciaccess.h>
++
++#include "dri3.h"
++
++static int _x_error_occurred;
++static uint32_t stamp;
++
++static int /* Xlib error hook: logs the error and counts it in _x_error_occurred */
++_check_error_handler(Display     *display,
++		     XErrorEvent *event)
++{
++	printf("X11 error from display %s, serial=%lu, error=%d, req=%d.%d\n",
++	       DisplayString(display),
++	       event->serial, /* unsigned long in XErrorEvent, hence %lu */
++	       event->error_code,
++	       event->request_code,
++	       event->minor_code);
++	_x_error_occurred++;
++	return False; /* ignored */
++}
++
++static int has_composite(Display *dpy)
++{
++	int event, error;
++	int major, minor;
++
++	if (!XCompositeQueryExtension(dpy, &event, &error))
++		return 0;
++
++	XCompositeQueryVersion(dpy, &major, &minor);
++
++	return major > 0 || minor >= 4;
++}
++
++static void *setup_msc(Display *dpy, Window win)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	xcb_void_cookie_t cookie;
++	uint32_t id = xcb_generate_id(c);
++	xcb_generic_error_t *error;
++	void *q;
++
++	cookie = xcb_present_select_input_checked(c, id, win, XCB_PRESENT_EVENT_MASK_COMPLETE_NOTIFY);
++	q = xcb_register_for_special_xge(c, &xcb_present_id, id, &stamp);
++
++	error = xcb_request_check(c, cookie);
++	assert(error == NULL);
++
++	return q;
++}
++
++static void teardown_msc(Display *dpy, void *q)
++{
++	xcb_unregister_for_special_event(XGetXCBConnection(dpy), q);
++}
++
++static uint64_t wait_vblank(Display *dpy, Window win)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	static uint32_t serial = 1;
++	uint64_t msc = 0;
++	int complete = 0;
++	void *q;
++
++	if (win == 0)
++		win = DefaultRootWindow(dpy);
++
++	q = setup_msc(dpy, win);
++
++	xcb_present_notify_msc(c, win, serial ^ 0xdeadbeef, 0, 1, 0);
++	xcb_flush(c);
++
++	do {
++		xcb_present_complete_notify_event_t *ce;
++		xcb_generic_event_t *ev;
++
++		ev = xcb_wait_for_special_event(c, q);
++		if (ev == NULL)
++			break;
++
++		ce = (xcb_present_complete_notify_event_t *)ev;
++		if (ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC &&
++		    ce->serial == (serial ^ 0xdeadbeef)) {
++			msc = ce->msc;
++			complete = 1;
++		}
++		free(ev);
++	} while (!complete);
++
++	if (++serial == 0)
++		serial = 1;
++
++	teardown_msc(dpy, q);
++
++	return msc;
++}
++
++static int test_basic(Display *dpy, int dummy)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	XSetWindowAttributes attr;
++	Visual *visual = DefaultVisual(dpy, DefaultScreen(dpy));
++	Pixmap pixmap;
++	struct dri3_fence fence;
++	Window root, win;
++	unsigned int width, height;
++	unsigned border, depth;
++	int x, y, ret = 1;
++	const char *phase;
++	uint64_t msc;
++
++	root = DefaultRootWindow(dpy);
++	XGetGeometry(dpy, root,
++		     &win, &x, &y,
++		     &width, &height, &border, &depth);
++
++	_x_error_occurred = 0;
++	attr.override_redirect = 1;
++	switch (dummy) {
++	case 0:
++		win = root;
++		phase = "root";
++		break;
++	case 1:
++		win = XCreateWindow(dpy, root,
++				    0, 0, width, height, 0, depth,
++				    InputOutput, visual,
++				    CWOverrideRedirect, &attr);
++		phase = "fullscreen";
++		break;
++	case 2:
++		width /= 2;
++		height /= 2;
++		win = XCreateWindow(dpy, root,
++				    0, 0, width, height, 0, depth,
++				    InputOutput, visual,
++				    CWOverrideRedirect, &attr);
++		phase = "window";
++		break;
++	case 3:
++		if (!has_composite(dpy))
++			return 0;
++
++		win = XCreateWindow(dpy, root,
++				    0, 0, width, height, 0,
++				    DefaultDepth(dpy, DefaultScreen(dpy)),
++				    InputOutput,
++				    DefaultVisual(dpy, DefaultScreen(dpy)),
++				    CWOverrideRedirect, &attr);
++		XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++		phase = "composite";
++		break;
++
++	default:
++		phase = "broken";
++		win = root;
++		abort();
++		break;
++	}
++
++	XMapWindow(dpy, win);
++	XSync(dpy, True);
++	if (_x_error_occurred)
++		return 1;
++
++	if (dri3_create_fence(dpy, win, &fence))
++		return 0;
++
++	printf("%s: Testing basic flip: %dx%d\n", phase, width, height);
++	fflush(stdout);
++	_x_error_occurred = 0;
++
++	xshmfence_reset(fence.addr);
++	msc = wait_vblank(dpy, win);
++
++	pixmap = XCreatePixmap(dpy, win, width, height, depth);
++	xcb_present_pixmap(c, win, pixmap, 0,
++			   0, /* valid */
++			   0, /* update */
++			   0, /* x_off */
++			   0, /* y_off */
++			   None,
++			   None, /* wait fence */
++			   fence.xid,
++			   XCB_PRESENT_OPTION_NONE,
++			   (msc + 64) & -64, /* target msc */
++			   64, /* divisor */
++			   32, /* remainder */
++			   0, NULL);
++	XFreePixmap(dpy, pixmap);
++
++	pixmap = XCreatePixmap(dpy, win, width, height, depth);
++	xcb_present_pixmap(c, win, pixmap, 0,
++			   0, /* valid */
++			   0, /* update */
++			   0, /* x_off */
++			   0, /* y_off */
++			   None,
++			   None, /* wait fence */
++			   None, /* sync fence */
++			   XCB_PRESENT_OPTION_NONE,
++			   (msc + 64) & -64, /* target msc */
++			   64, /* divisor */
++			   48, /* remainder */
++			   0, NULL);
++	XFreePixmap(dpy, pixmap);
++	XDestroyWindow(dpy, win);
++	XFlush(dpy);
++
++	ret = !!xshmfence_await(fence.addr);
++	dri3_fence_free(dpy, &fence);
++
++	XSync(dpy, True);
++	ret += !!_x_error_occurred;
++
++	return ret;
++}
++
++/* Present while a second connection destroys the window; 0 on pass/skip, non-zero on failure. */
++static int test_race(Display *dpy, int dummy)
++{
++	Display *mgr;
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	XSetWindowAttributes attr;
++	Visual *visual = DefaultVisual(dpy, DefaultScreen(dpy));
++	Pixmap pixmap;
++	struct dri3_fence fence;
++	Window root, win;
++	unsigned int width, height, border, depth;
++	int x, y, ret = 1;
++	const char *phase;
++	uint64_t msc;
++
++	root = DefaultRootWindow(dpy);
++	XGetGeometry(dpy, root, &win, &x, &y,
++		     &width, &height, &border, &depth);
++
++	_x_error_occurred = 0;
++	attr.override_redirect = 1;
++	switch (dummy) {
++	case 0:
++		win = root;
++		phase = "root";
++		break;
++	case 1:
++		win = XCreateWindow(dpy, root,
++				    0, 0, width, height, 0, depth,
++				    InputOutput, visual,
++				    CWOverrideRedirect, &attr);
++		phase = "fullscreen";
++		break;
++	case 2:
++		width /= 2;
++		height /= 2;
++		win = XCreateWindow(dpy, root,
++				    0, 0, width, height, 0, depth,
++				    InputOutput, visual,
++				    CWOverrideRedirect, &attr);
++		phase = "window";
++		break;
++	case 3:
++		if (!has_composite(dpy))
++			return 0;
++
++		win = XCreateWindow(dpy, root,
++				    0, 0, width, height, 0,
++				    DefaultDepth(dpy, DefaultScreen(dpy)),
++				    InputOutput,
++				    DefaultVisual(dpy, DefaultScreen(dpy)),
++				    CWOverrideRedirect, &attr);
++		XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++		phase = "composite";
++		break;
++
++	default:
++		phase = "broken";
++		win = root;
++		abort();
++		break;
++	}
++
++	XMapWindow(dpy, win);
++	XSync(dpy, True);
++	if (_x_error_occurred)
++		return 1;
++
++	if (dri3_create_fence(dpy, win, &fence))
++		return 0;
++
++	mgr = XOpenDisplay(NULL); /* opened only now so the early returns above cannot leak it */
++	printf("%s: Testing race with manager: %dx%d\n", phase, width, height);
++	fflush(stdout);
++	_x_error_occurred = 0;
++
++	xshmfence_reset(fence.addr);
++	msc = wait_vblank(dpy, win);
++
++	pixmap = XCreatePixmap(dpy, win, width, height, depth);
++	xcb_present_pixmap(c, win, pixmap, 0,
++			   0, /* valid */
++			   0, /* update */
++			   0, /* x_off */
++			   0, /* y_off */
++			   None,
++			   None, /* wait fence */
++			   fence.xid,
++			   XCB_PRESENT_OPTION_NONE,
++			   (msc + 64) & -64, /* target msc */
++			   64, /* divisor */
++			   32, /* remainder */
++			   0, NULL);
++	XFreePixmap(dpy, pixmap);
++
++	XFlush(dpy);
++	XDestroyWindow(mgr, win);
++	XFlush(mgr);
++
++	pixmap = XCreatePixmap(dpy, win, width, height, depth);
++	xcb_present_pixmap(c, win, pixmap, 0,
++			   0, /* valid */
++			   0, /* update */
++			   0, /* x_off */
++			   0, /* y_off */
++			   None,
++			   None, /* wait fence */
++			   None, /* sync fence */
++			   XCB_PRESENT_OPTION_NONE,
++			   (msc + 64) & -64, /* target msc */
++			   64, /* divisor */
++			   48, /* remainder */
++			   0, NULL);
++	XFreePixmap(dpy, pixmap);
++	XFlush(dpy);
++
++	ret = !!xshmfence_await(fence.addr);
++	dri3_fence_free(dpy, &fence);
++
++	XSync(dpy, True);
++	ret += !!_x_error_occurred;
++
++	XCloseDisplay(mgr);
++
++	return ret;
++}
++
++/* Returns 1 if XFixes, DRI3 and Present all answer a version query, else 0. */
++static int has_present(Display *dpy)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	xcb_generic_error_t *error = NULL;
++	void *reply;
++
++	reply = xcb_xfixes_query_version_reply(c,
++					       xcb_xfixes_query_version(c,
++									XCB_XFIXES_MAJOR_VERSION,
++									XCB_XFIXES_MINOR_VERSION),
++					       &error);
++	free(error);
++	if (reply == NULL) { /* test the pointer before it is freed */
++		fprintf(stderr, "XFixes not supported on %s\n", DisplayString(dpy));
++		return 0;
++	}
++	free(reply);
++
++	reply = xcb_dri3_query_version_reply(c,
++					     xcb_dri3_query_version(c,
++								    XCB_DRI3_MAJOR_VERSION,
++								    XCB_DRI3_MINOR_VERSION),
++					     &error);
++	free(error);
++	if (reply == NULL) {
++		fprintf(stderr, "DRI3 not supported on %s\n", DisplayString(dpy));
++		return 0;
++	}
++	free(reply);
++
++	reply = xcb_present_query_version_reply(c,
++						xcb_present_query_version(c,
++									  XCB_PRESENT_MAJOR_VERSION,
++									  XCB_PRESENT_MINOR_VERSION),
++						&error);
++	free(error);
++	if (reply == NULL) {
++		fprintf(stderr, "Present not supported on %s\n", DisplayString(dpy));
++		return 0;
++	}
++	free(reply);
++
++	return 1;
++}
++
++int main(void)
++{
++	Display *dpy;
++	int dummy;
++	int error = 0;
++
++	dpy = XOpenDisplay(NULL);
++	if (dpy == NULL)
++		return 77;
++
++	if (!has_present(dpy))
++		return 77;
++
++	if (DPMSQueryExtension(dpy, &dummy, &dummy))
++		DPMSDisable(dpy);
++
++	signal(SIGALRM, SIG_IGN);
++	XSetErrorHandler(_check_error_handler);
++
++	for (dummy = 0; dummy <= 3; dummy++) {
++		error += test_basic(dpy, dummy);
++		error += test_race(dpy, dummy);
++	}
++
++	if (DPMSQueryExtension(dpy, &dummy, &dummy))
++		DPMSEnable(dpy);
++	return !!error;
++}
+diff --git a/test/present-speed.c b/test/present-speed.c
+new file mode 100644
+index 00000000..eccde931
+--- /dev/null
++++ b/test/present-speed.c
+@@ -0,0 +1,1015 @@
++/*
++ * Copyright (c) 2015 Intel Corporation
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ *
++ */
++
++#ifdef HAVE_CONFIG_H
++#include "config.h"
++#endif
++
++#include <X11/Xlib.h>
++#include <X11/Xatom.h>
++#include <X11/Xlib-xcb.h>
++#include <X11/xshmfence.h>
++#include <X11/Xutil.h>
++#include <X11/Xlibint.h>
++#include <X11/extensions/Xcomposite.h>
++#include <X11/extensions/Xdamage.h>
++#include <X11/extensions/dpms.h>
++#include <X11/extensions/randr.h>
++#include <X11/extensions/Xrandr.h>
++#include <xcb/xcb.h>
++#include <xcb/present.h>
++#include <xcb/dri3.h>
++#include <xcb/xfixes.h>
++#include <xf86drm.h>
++#include <i915_drm.h>
++
++#include <stdio.h>
++#include <string.h>
++#include <fcntl.h>
++#include <unistd.h>
++#include <assert.h>
++#include <errno.h>
++#include <setjmp.h>
++#include <signal.h>
++#include <sys/wait.h>
++
++#include "dri3.h"
++
++static int _x_error_occurred;
++static uint32_t stamp;
++
++struct list {
++    struct list *next, *prev;
++};
++
++static void
++list_init(struct list *list)
++{
++    list->next = list->prev = list;
++}
++
++static inline void
++__list_add(struct list *entry,
++	    struct list *prev,
++	    struct list *next)
++{
++    next->prev = entry;
++    entry->next = next;
++    entry->prev = prev;
++    prev->next = entry;
++}
++
++static inline void
++list_add(struct list *entry, struct list *head)
++{
++    __list_add(entry, head, head->next);
++}
++
++static inline void
++__list_del(struct list *prev, struct list *next)
++{
++	next->prev = prev;
++	prev->next = next;
++}
++
++static inline void
++_list_del(struct list *entry)
++{
++    __list_del(entry->prev, entry->next);
++}
++
++static inline void
++list_move(struct list *list, struct list *head)
++{
++	if (list->prev != head) {
++		_list_del(list);
++		list_add(list, head);
++	}
++}
++
++#define __container_of(ptr, sample, member)				\
++    (void *)((char *)(ptr) - ((char *)&(sample)->member - (char *)(sample)))
++
++#define list_for_each_entry(pos, head, member)				\
++    for (pos = __container_of((head)->next, pos, member);		\
++	 &pos->member != (head);					\
++	 pos = __container_of(pos->member.next, pos, member))
++
++static int /* Xlib error hook: logs and counts errors unless the counter is negative */
++_check_error_handler(Display     *display,
++		     XErrorEvent *event)
++{
++	if (_x_error_occurred < 0) /* negative counter: return without logging or counting */
++		return True;
++
++	printf("X11 error from display %s, serial=%lu, error=%d, req=%d.%d\n",
++	       DisplayString(display),
++	       event->serial, /* unsigned long in XErrorEvent, hence %lu */
++	       event->error_code,
++	       event->request_code,
++	       event->minor_code);
++	_x_error_occurred++;
++	return False; /* ignored */
++}
++
++static double elapsed(const struct timespec *start,
++		      const struct timespec *end)
++{
++	return 1e6*(end->tv_sec - start->tv_sec) + (end->tv_nsec - start->tv_nsec)/1000;
++}
++
++struct buffer {
++	struct list link;
++	Pixmap pixmap;
++	struct dri3_fence fence;
++	int fd;
++	int busy;
++	int id;
++};
++
++#define DRI3 1
++#define NOCOPY 2
++#define ASYNC 4
++static void run(Display *dpy, Window win, const char *name, unsigned options)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	struct timespec start, end;
++#define N_BACK 8
++	char test_name[128];
++	struct buffer buffer[N_BACK];
++	struct list mru;
++	Window root;
++	unsigned int width, height;
++	unsigned border, depth;
++	unsigned present_flags = 0;
++	xcb_xfixes_region_t update = 0;
++	int completed = 0;
++	int queued = 0;
++	uint32_t eid = 0;
++	void *Q = NULL;
++	int i, n;
++
++	list_init(&mru);
++
++	XGetGeometry(dpy, win,
++		     &root, &i, &n, &width, &height, &border, &depth);
++
++	_x_error_occurred = 0;
++
++	for (n = 0; n < N_BACK; n++) {
++		buffer[n].pixmap = xcb_generate_id(c);
++		xcb_create_pixmap(c, depth, buffer[n].pixmap, win,
++				  width, height);
++		buffer[n].fence.xid = 0;
++		buffer[n].fd = -1;
++		buffer[n].id = n;
++		if (options & DRI3) {
++			xcb_dri3_buffer_from_pixmap_reply_t *reply;
++			int *fds;
++
++			if (dri3_create_fence(dpy, win, &buffer[n].fence))
++				return;
++
++			reply = xcb_dri3_buffer_from_pixmap_reply (c,
++								   xcb_dri3_buffer_from_pixmap(c, buffer[n].pixmap),
++								   NULL);
++			if (reply == NULL)
++				return;
++
++			fds = xcb_dri3_buffer_from_pixmap_reply_fds (c, reply);
++			buffer[n].fd = fds[0];
++			free(reply);
++
++			/* start idle */
++			xshmfence_trigger(buffer[n].fence.addr);
++		}
++		buffer[n].busy = 0;
++		list_add(&buffer[n].link, &mru);
++	}
++	if (options & ASYNC)
++		present_flags |= XCB_PRESENT_OPTION_ASYNC;
++	if (options & NOCOPY) {
++		update = xcb_generate_id(c);
++		xcb_xfixes_create_region(c, update, 0, NULL);
++		present_flags |= XCB_PRESENT_OPTION_COPY;
++	}
++
++	if (!(options & DRI3)) {
++		eid = xcb_generate_id(c);
++		xcb_present_select_input(c, eid, win,
++					 (options & NOCOPY ? 0 : XCB_PRESENT_EVENT_MASK_IDLE_NOTIFY) |
++					 XCB_PRESENT_EVENT_MASK_COMPLETE_NOTIFY);
++		Q = xcb_register_for_special_xge(c, &xcb_present_id, eid, &stamp);
++	}
++
++	clock_gettime(CLOCK_MONOTONIC, &start);
++	do {
++		for (n = 0; n < 1000; n++) {
++			struct buffer *tmp, *b = NULL;
++retry:
++			list_for_each_entry(tmp, &mru, link) {
++				if (tmp->fence.xid)
++					tmp->busy = !xshmfence_query(tmp->fence.addr);
++				if (!tmp->busy) {
++					b = tmp;
++					break;
++				}
++			}
++			if (options & DRI3) {
++				if (b == NULL)
++					goto retry;
++
++				xshmfence_reset(b->fence.addr);
++				queued--;
++				completed++;
++			} else while (b == NULL) {
++				xcb_present_generic_event_t *ev;
++
++				ev = (xcb_present_generic_event_t *)
++					xcb_wait_for_special_event(c, Q);
++				if (ev == NULL)
++					abort();
++
++				do {
++					switch (ev->evtype) {
++					case XCB_PRESENT_COMPLETE_NOTIFY:
++						completed++;
++						queued--;
++						break;
++
++					case XCB_PRESENT_EVENT_IDLE_NOTIFY:
++						{
++							xcb_present_idle_notify_event_t *ie = (xcb_present_idle_notify_event_t *)ev;
++							assert(ie->serial < N_BACK);
++							buffer[ie->serial].busy = 0;
++							if (b == NULL)
++								b = &buffer[ie->serial];
++							break;
++						}
++					}
++					free(ev);
++				} while ((ev = (xcb_present_generic_event_t *)xcb_poll_for_special_event(c, Q)));
++			}
++
++			b->busy = (options & NOCOPY) == 0;
++			xcb_present_pixmap(c, win, b->pixmap, b->id,
++					   0, /* valid */
++					   update, /* update */
++					   0, /* x_off */
++					   0, /* y_off */
++					   None,
++					   None, /* wait fence */
++					   b->fence.xid,
++					   present_flags,
++					   0, /* target msc */
++					   0, /* divisor */
++					   0, /* remainder */
++					   0, NULL);
++			list_move(&b->link, &mru);
++			queued++;
++			xcb_flush(c);
++		}
++		clock_gettime(CLOCK_MONOTONIC, &end);
++	} while (end.tv_sec < start.tv_sec + 10);
++
++	if (options & DRI3) {
++		struct buffer *b;
++		XID pixmap;
++
++		pixmap = xcb_generate_id(c);
++		xcb_create_pixmap(c, depth, pixmap, win, width, height);
++		xcb_present_pixmap(c, win, pixmap, 0xdeadbeef,
++				   0, /* valid */
++				   None, /* update */
++				   0, /* x_off */
++				   0, /* y_off */
++				   None,
++				   None, /* wait fence */
++				   None,
++				   0,
++				   0, /* target msc */
++				   0, /* divisor */
++				   0, /* remainder */
++				   0, NULL);
++		xcb_flush(c);
++
++		list_for_each_entry(b, &mru, link)
++			xshmfence_await(b->fence.addr);
++
++		xcb_free_pixmap(c, pixmap);
++		completed += queued;
++	} else while (queued) {
++		xcb_present_generic_event_t *ev;
++
++		ev = (xcb_present_generic_event_t *)
++			xcb_wait_for_special_event(c, Q);
++		if (ev == NULL)
++			abort();
++
++		do {
++			switch (ev->evtype) {
++			case XCB_PRESENT_COMPLETE_NOTIFY:
++				completed++;
++				queued--;
++				break;
++
++			case XCB_PRESENT_EVENT_IDLE_NOTIFY:
++				break;
++			}
++			free(ev);
++		} while ((ev = (xcb_present_generic_event_t *)xcb_poll_for_special_event(c, Q)));
++	}
++	clock_gettime(CLOCK_MONOTONIC, &end);
++
++	if (update)
++		xcb_xfixes_destroy_region(c, update);
++	for (n = 0; n < N_BACK; n++) {
++		if (buffer[n].fence.xid)
++			dri3_fence_free(dpy, &buffer[n].fence);
++		if (buffer[n].fd != -1)
++			close(buffer[n].fd);
++		xcb_free_pixmap(c, buffer[n].pixmap);
++	}
++
++	if (Q) {
++		xcb_discard_reply(c, xcb_present_select_input_checked(c, eid, win, 0).sequence);
++		XSync(dpy, True);
++		xcb_unregister_for_special_event(c, Q);
++	}
++
++	test_name[0] = '\0';
++	if (options) {
++		snprintf(test_name, sizeof(test_name), "(%s%s%s )",
++			 options & NOCOPY ? " no-copy" : "",
++			 options & DRI3 ? " dri3" : "",
++			 options & ASYNC ? " async" : "");
++	}
++	printf("%s%s: Completed %d presents in %.1fs, %.3fus each (%.1f FPS)\n",
++	       name, test_name,
++	       completed, elapsed(&start, &end) / 1000000,
++	       elapsed(&start, &end) / completed,
++	       completed / (elapsed(&start, &end) / 1000000));
++}
++
++struct perpixel {
++	Window win;
++	struct buffer buffer[N_BACK];
++	struct list mru;
++	uint32_t eid;
++	void *Q;
++	int queued;
++};
++
++static void perpixel(Display *dpy,
++		     int max_width, int max_height, unsigned options)
++{
++	//const int sz = max_width * max_height;
++	const int sz = 1048;
++	struct perpixel *pp;
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	struct timespec start, end;
++	char test_name[128];
++	unsigned present_flags = 0;
++	xcb_xfixes_region_t update = 0;
++	int completed = 0;
++	int i, n;
++
++	pp = calloc(sz, sizeof(*pp));
++	if (!pp)
++		return;
++
++	for (i = 0; i < sz; i++) {
++		XSetWindowAttributes attr = { .override_redirect = 1 };
++		int depth = DefaultDepth(dpy, DefaultScreen(dpy));
++		pp[i].win = XCreateWindow(dpy, DefaultRootWindow(dpy),
++					  i % max_width, i / max_width, 1, 1, 0, depth,
++					  InputOutput,
++					  DefaultVisual(dpy, DefaultScreen(dpy)),
++					  CWOverrideRedirect, &attr);
++		XMapWindow(dpy, pp[i].win);
++		list_init(&pp[i].mru);
++		for (n = 0; n < N_BACK; n++) {
++			pp[i].buffer[n].pixmap = xcb_generate_id(c);
++			xcb_create_pixmap(c, depth, pp[i].buffer[n].pixmap,
++					  pp[i].win, 1, 1);
++			pp[i].buffer[n].fence.xid = 0;
++			pp[i].buffer[n].fd = -1;
++			pp[i].buffer[n].id = n;
++			if (options & DRI3) {
++				xcb_dri3_buffer_from_pixmap_reply_t *reply;
++				int *fds;
++
++				if (dri3_create_fence(dpy, pp[i].win, &pp[i].buffer[n].fence))
++					return;
++
++				reply = xcb_dri3_buffer_from_pixmap_reply(c,
++									  xcb_dri3_buffer_from_pixmap(c, pp[i].buffer[n].pixmap),
++									  NULL);
++				if (reply == NULL)
++					return;
++
++				fds = xcb_dri3_buffer_from_pixmap_reply_fds(c, reply);
++				pp[i].buffer[n].fd = fds[0];
++				free(reply);
++
++				/* start idle */
++				xshmfence_trigger(pp[i].buffer[n].fence.addr);
++			}
++			pp[i].buffer[n].busy = 0;
++			list_add(&pp[i].buffer[n].link, &pp[i].mru);
++		}
++
++		if (!(options & DRI3)) {
++			pp[i].eid = xcb_generate_id(c);
++			xcb_present_select_input(c, pp[i].eid, pp[i].win,
++						 (options & NOCOPY ? 0 : XCB_PRESENT_EVENT_MASK_IDLE_NOTIFY) |
++						 XCB_PRESENT_EVENT_MASK_COMPLETE_NOTIFY);
++			pp[i].Q = xcb_register_for_special_xge(c, &xcb_present_id, pp[i].eid, &stamp);
++		}
++		pp[i].queued = 0;
++	}
++
++	XSync(dpy, True);
++	_x_error_occurred = 0;
++
++	if (options & ASYNC)
++		present_flags |= XCB_PRESENT_OPTION_ASYNC;
++	if (options & NOCOPY) {
++		update = xcb_generate_id(c);
++		xcb_xfixes_create_region(c, update, 0, NULL);
++		present_flags |= XCB_PRESENT_OPTION_COPY;
++	}
++
++	clock_gettime(CLOCK_MONOTONIC, &start);
++	do {
++		for (i = 0; i < sz; i++) {
++			struct buffer *tmp, *b = NULL;
++retry:
++			list_for_each_entry(tmp, &pp[i].mru, link) {
++				if (tmp->fence.xid)
++					tmp->busy = !xshmfence_query(tmp->fence.addr);
++				if (!tmp->busy) {
++					b = tmp;
++					break;
++				}
++			}
++			if (options & DRI3) {
++				if (b == NULL)
++					goto retry;
++
++				xshmfence_reset(b->fence.addr);
++				pp[i].queued--;
++				completed++;
++			} else while (b == NULL) {
++				xcb_present_generic_event_t *ev;
++
++				ev = (xcb_present_generic_event_t *)
++					xcb_wait_for_special_event(c, pp[i].Q);
++				if (ev == NULL)
++					abort();
++
++				do {
++					switch (ev->evtype) {
++					case XCB_PRESENT_COMPLETE_NOTIFY:
++						completed++;
++						pp[i].queued--;
++						break;
++
++					case XCB_PRESENT_EVENT_IDLE_NOTIFY:
++						{
++							xcb_present_idle_notify_event_t *ie = (xcb_present_idle_notify_event_t *)ev;
++							assert(ie->serial < N_BACK);
++							pp[i].buffer[ie->serial].busy = 0;
++							if (b == NULL)
++								b = &pp[i].buffer[ie->serial];
++							break;
++						}
++					}
++					free(ev);
++				} while ((ev = (xcb_present_generic_event_t *)xcb_poll_for_special_event(c, pp[i].Q)));
++			}
++
++			b->busy = (options & NOCOPY) == 0;
++			xcb_present_pixmap(c, pp[i].win, b->pixmap, b->id,
++					   0, /* valid */
++					   update, /* update */
++					   0, /* x_off */
++					   0, /* y_off */
++					   None,
++					   None, /* wait fence */
++					   b->fence.xid,
++					   present_flags,
++					   0, /* target msc */
++					   0, /* divisor */
++					   0, /* remainder */
++					   0, NULL);
++			list_move(&b->link, &pp[i].mru);
++			pp[i].queued++;
++		}
++		xcb_flush(c);
++		clock_gettime(CLOCK_MONOTONIC, &end);
++	} while (end.tv_sec < start.tv_sec + 10);
++
++	for (i = 0; i < sz; i++) {
++		if (options & DRI3) {
++			int depth = DefaultDepth(dpy, DefaultScreen(dpy));
++			struct buffer *b;
++			XID pixmap;
++
++			pixmap = xcb_generate_id(c);
++			xcb_create_pixmap(c, depth, pixmap, pp[i].win, 1, 1);
++			xcb_present_pixmap(c, pp[i].win, pixmap, 0xdeadbeef,
++					   0, /* valid */
++					   None, /* update */
++					   0, /* x_off */
++					   0, /* y_off */
++					   None,
++					   None, /* wait fence */
++					   None,
++					   0,
++					   0, /* target msc */
++					   0, /* divisor */
++					   0, /* remainder */
++					   0, NULL);
++			xcb_flush(c);
++
++			list_for_each_entry(b, &pp[i].mru, link)
++				xshmfence_await(b->fence.addr);
++
++			xcb_free_pixmap(c, pixmap);
++			completed += pp[i].queued;
++		} else while (pp[i].queued) {
++			xcb_present_generic_event_t *ev;
++
++			ev = (xcb_present_generic_event_t *)
++				xcb_wait_for_special_event(c, pp[i].Q);
++			if (ev == NULL)
++				abort();
++
++			do {
++				switch (ev->evtype) {
++				case XCB_PRESENT_COMPLETE_NOTIFY:
++					completed++;
++					pp[i].queued--;
++					break;
++
++				case XCB_PRESENT_EVENT_IDLE_NOTIFY:
++					break;
++				}
++				free(ev);
++			} while ((ev = (xcb_present_generic_event_t *)xcb_poll_for_special_event(c, pp[i].Q)));
++		}
++	}
++	clock_gettime(CLOCK_MONOTONIC, &end);
++
++	if (update)
++		xcb_xfixes_destroy_region(c, update);
++
++	for (i = 0; i < sz; i++) {
++		for (n = 0; n < N_BACK; n++) {
++			if (pp[i].buffer[n].fence.xid)
++				dri3_fence_free(dpy, &pp[i].buffer[n].fence);
++			if (pp[i].buffer[n].fd != -1)
++				close(pp[i].buffer[n].fd);
++			xcb_free_pixmap(c, pp[i].buffer[n].pixmap);
++		}
++
++		if (pp[i].Q) {
++			xcb_discard_reply(c, xcb_present_select_input_checked(c, pp[i].eid, pp[i].win, 0).sequence);
++			XSync(dpy, True);
++			xcb_unregister_for_special_event(c, pp[i].Q);
++		}
++
++		XDestroyWindow(dpy, pp[i].win);
++	}
++	free(pp);
++
++	test_name[0] = '\0';
++	if (options) {
++		snprintf(test_name, sizeof(test_name), "(%s%s%s )",
++			 options & NOCOPY ? " no-copy" : "",
++			 options & DRI3 ? " dri3" : "",
++			 options & ASYNC ? " async" : "");
++	}
++	printf("%s%s: Completed %d presents in %.1fs, %.3fus each (%.1f FPS)\n",
++	       __func__, test_name,
++	       completed, elapsed(&start, &end) / 1000000,
++	       elapsed(&start, &end) / completed,
++	       completed / (elapsed(&start, &end) / 1000000));
++}
++/* Smallest integer >= 2 whose square is at least x (ceil(sqrt(x)), never below 2). */
++static int isqrt(int x)
++{
++	int root = 2;
++
++	while (root * root < x)
++		root++;
++	return root;
++}
++/* Per-thread arguments handed to sibling() by siblings(). */
++struct sibling {
++	pthread_t thread; /* written by pthread_create(), joined in siblings() */
++	Display *dpy; /* connection the thread creates its window on */
++	int x, y; /* window origin passed to XCreateWindow() */
++	int width, height; /* window size passed to XCreateWindow() */
++	unsigned options; /* forwarded unchanged to run() */
++};
++/* Thread body: map an override-redirect window described by *arg and run the benchmark on it. */
++static void *sibling(void *arg)
++{
++	struct sibling *self = arg;
++	Display *dpy = self->dpy;
++	XSetWindowAttributes attr = { .override_redirect = 1 };
++	Window win = XCreateWindow(dpy, DefaultRootWindow(dpy),
++				   self->x, self->y, self->width, self->height, 0,
++				   DefaultDepth(dpy, DefaultScreen(dpy)), InputOutput,
++				   DefaultVisual(dpy, DefaultScreen(dpy)),
++				   CWOverrideRedirect, &attr);
++	XMapWindow(dpy, win);
++	run(dpy, win, "sibling", self->options);
++	return NULL;
++}
++/* Run the benchmark from ncpus threads sharing dpy, one window per thread, tiled on an isqrt(ncpus)-wide grid. */
++static void siblings(Display *dpy,
++		     int max_width, int max_height, int ncpus, unsigned options)
++{
++	/* Bail out before sizing the VLA: the caller passes sysconf(), which is -1 on failure. */
++	if (ncpus <= 1)
++		return;
++
++	struct sibling s[ncpus];
++	int sq_ncpus = isqrt(ncpus);
++	int width = max_width / sq_ncpus, height = max_height / sq_ncpus;
++	int child;
++
++	for (child = 0; child < ncpus; child++) {
++		s[child].dpy = dpy;
++		s[child].x = (child % sq_ncpus) * width;
++		s[child].y = (child / sq_ncpus) * height;
++		s[child].width = width;
++		s[child].height = height;
++		s[child].options = options;
++		if (pthread_create(&s[child].thread, NULL, sibling, &s[child]))
++			break; /* s[child].thread is unset: join only the threads started so far */
++	}
++	while (child--)
++		pthread_join(s[child].thread, NULL);
++}
++/* Run the benchmark from ncpus forked children, each with its own connection and a window tiled on a grid. */
++static void cousins(int max_width, int max_height, int ncpus, unsigned options)
++{
++	int sq_ncpus = isqrt(ncpus);
++	int width = max_width / sq_ncpus;
++	int height = max_height/ sq_ncpus;
++	int child, running = 0;
++
++	if (ncpus <= 1)
++		return;
++
++	for (child = 0; child < ncpus; child++) {
++		pid_t pid = fork();
++		if (pid == 0) {
++			int x = (child % sq_ncpus) * width;
++			int y = (child / sq_ncpus) * height;
++			XSetWindowAttributes attr = { .override_redirect = 1 };
++			Display *dpy = XOpenDisplay(NULL);
++			Window win;
++			if (dpy == NULL)
++				exit(1); /* no connection: nothing to benchmark in this child */
++			win = XCreateWindow(dpy, DefaultRootWindow(dpy),
++					    x, y, width, height, 0,
++					    DefaultDepth(dpy, DefaultScreen(dpy)), InputOutput,
++					    DefaultVisual(dpy, DefaultScreen(dpy)),
++					    CWOverrideRedirect, &attr);
++			XMapWindow(dpy, win);
++			run(dpy, win, "cousin", options);
++			exit(0);
++		}
++		running += pid > 0; /* a failed fork() leaves no child to reap */
++	}
++
++	while (running)
++		running -= wait(NULL) != -1; /* keep waiting if wait() itself failed */
++}
++/* Return 1 if the server answers a Present QueryVersion request; otherwise print a diagnostic and return 0. */
++static int has_present(Display *dpy)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	xcb_generic_error_t *error = NULL;
++	void *reply;
++
++	reply = xcb_present_query_version_reply(c,
++						xcb_present_query_version(c,
++									  XCB_PRESENT_MAJOR_VERSION,
++									  XCB_PRESENT_MINOR_VERSION),
++						&error);
++
++	free(error);
++	if (reply == NULL) {
++		fprintf(stderr, "Present not supported on %s\n", DisplayString(dpy));
++		return 0;
++	}
++
++	free(reply); /* released only after the NULL test; only its existence matters */
++	return 1;
++}
++/* Non-zero when both DAMAGE and COMPOSITE are available and COMPOSITE reports major > 0 or minor >= 4. */
++static int has_composite(Display *dpy)
++{
++	int event, error;
++	int major, minor;
++
++	if (!XDamageQueryExtension(dpy, &event, &error) ||
++	    !XCompositeQueryExtension(dpy, &event, &error))
++		return 0;
++
++	XCompositeQueryVersion(dpy, &major, &minor);
++	if (major > 0)
++		return 1;
++
++	return minor >= 4;
++}
++/* Query the DRI3 version: returns 0 and fills *major/*minor, or -1 (both left at -1) when no reply arrives. */
++static int dri3_query_version(Display *dpy, int *major, int *minor)
++{
++	xcb_connection_t *conn = XGetXCBConnection(dpy);
++	xcb_dri3_query_version_cookie_t cookie;
++	xcb_dri3_query_version_reply_t *version;
++	xcb_generic_error_t *err;
++
++	*minor = -1;
++	*major = -1;
++
++	cookie = xcb_dri3_query_version(conn,
++					XCB_DRI3_MAJOR_VERSION,
++					XCB_DRI3_MINOR_VERSION);
++	version = xcb_dri3_query_version_reply(conn, cookie, &err);
++	free(err); /* error detail is discarded; a NULL reply is the failure signal */
++	if (version == NULL)
++		return -1;
++
++	*major = version->major_version;
++	*minor = version->minor_version;
++	free(version);
++	return 0;
++}
++/* Non-zero when the DRI3 extension is present and answers a version query with major >= 0. */
++static int has_dri3(Display *dpy)
++{
++	xcb_connection_t *conn = XGetXCBConnection(dpy);
++	const xcb_query_extension_reply_t *info;
++	int major, minor;
++
++	info = xcb_get_extension_data(conn, &xcb_dri3_id);
++	if (info != NULL && info->present &&
++	    dri3_query_version(dpy, &major, &minor) >= 0)
++		return major >= 0;
++
++	/* extension missing, or the version query got no reply */
++	return 0;
++}
++/* Return 1 when the XFIXES extension is present and replies to a version query, else 0. */
++static int has_xfixes(Display *dpy)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	const xcb_query_extension_reply_t *ext;
++	void *reply;
++
++	ext = xcb_get_extension_data(c, &xcb_xfixes_id);
++	if (ext == NULL || !ext->present)
++		return 0;
++	reply = xcb_xfixes_query_version_reply(c,
++					       xcb_xfixes_query_version(c,
++									XCB_XFIXES_MAJOR_VERSION,
++									XCB_XFIXES_MINOR_VERSION),
++					       NULL);
++	if (reply == NULL)
++		return 0;
++	free(reply); /* the pointer is tested before it is freed, never after */
++	return 1;
++}
++/* Fetch screen resources via XRRGetScreenResourcesCurrent(), falling back to XRRGetScreenResources() on NULL. */
++static inline XRRScreenResources *_XRRGetScreenResourcesCurrent(Display *dpy, Window window)
++{
++	XRRScreenResources *current = XRRGetScreenResourcesCurrent(dpy, window);
++
++	if (current != NULL)
++		return current;
++
++	/* The "current" query yielded nothing; try the plain request instead. */
++	return XRRGetScreenResources(dpy, window);
++}
++/* Find the entry of res->modes whose id equals id; NULL when none matches. */
++static XRRModeInfo *lookup_mode(XRRScreenResources *res, int id)
++{
++	XRRModeInfo *mode = res->modes;
++	int left;
++
++	for (left = res->nmode; left > 0; left--, mode++) {
++		if (mode->id == id)
++			return mode;
++	}
++	return NULL;
++}
++/* Replace win's _NET_WM_STATE property with the single atom _NET_WM_STATE_FULLSCREEN. */
++static void fullscreen(Display *dpy, Window win)
++{
++	Atom state_fullscreen = XInternAtom(dpy, "_NET_WM_STATE_FULLSCREEN", False);
++	Atom state = XInternAtom(dpy, "_NET_WM_STATE", False);
++
++	XChangeProperty(dpy, win, state, XA_ATOM, 32, PropModeReplace,
++			(unsigned char *)&state_fullscreen, 1);
++}
++/* For every output and each of its CRTCs, set the output's first mode and run the benchmark on a series of windows. */
++static void loop(Display *dpy, XRRScreenResources *res, unsigned options)
++{
++	Window root = DefaultRootWindow(dpy);
++	Window win;
++	XSetWindowAttributes attr;
++	int i, j;
++
++	attr.override_redirect = 1;
++	/* First pass on the root window, labelled "off", before this function configures any CRTC. */
++	run(dpy, root, "off", options);
++	XSync(dpy, True);
++
++	for (i = 0; i < res->noutput; i++) {
++		XRROutputInfo *output;
++		XRRModeInfo *mode;
++
++		output = XRRGetOutputInfo(dpy, res, res->outputs[i]);
++		if (output == NULL)
++			continue;
++		/* Only read output->modes[0] when the output lists at least one mode. */
++		mode = NULL;
++		if (res->nmode && output->nmode)
++			mode = lookup_mode(res, output->modes[0]);
++		/* j walks the output's CRTCs forwards and then backwards, so each CRTC is used twice. */
++		for (j = 0; mode && j < 2*output->ncrtc; j++) {
++			int c = j;
++			if (c >= output->ncrtc)
++				c = 2*output->ncrtc - j - 1;
++
++			printf("[%d, %d] -- OUTPUT:%ld, CRTC:%ld: %dx%d\n",
++			       i, c, (long)res->outputs[i], (long)output->crtcs[c],
++			       mode->width, mode->height);
++			XRRSetCrtcConfig(dpy, res, output->crtcs[c], CurrentTime,
++					 0, 0, output->modes[0], RR_Rotate_0, &res->outputs[i], 1);
++
++			run(dpy, root, "root", options);
++			XSync(dpy, True);
++
++			win = XCreateWindow(dpy, root,
++					    0, 0, mode->width, mode->height, 0,
++					    DefaultDepth(dpy, DefaultScreen(dpy)),
++					    InputOutput,
++					    DefaultVisual(dpy, DefaultScreen(dpy)),
++					    CWOverrideRedirect, &attr);
++			fullscreen(dpy, win);
++			XMapWindow(dpy, win);
++			run(dpy, win, "fullscreen", options);
++			XDestroyWindow(dpy, win);
++			XSync(dpy, True);
++
++			win = XCreateWindow(dpy, root,
++					    0, 0, mode->width, mode->height, 0,
++					    DefaultDepth(dpy, DefaultScreen(dpy)),
++					    InputOutput,
++					    DefaultVisual(dpy, DefaultScreen(dpy)),
++					    CWOverrideRedirect, &attr);
++			XMapWindow(dpy, win);
++			run(dpy, win, "windowed", options);
++			XDestroyWindow(dpy, win);
++			XSync(dpy, True);
++
++			if (has_composite(dpy)) {
++				Damage damage;
++
++				_x_error_occurred = 0;
++				win = XCreateWindow(dpy, root,
++						    0, 0, mode->width, mode->height, 0,
++						    DefaultDepth(dpy, DefaultScreen(dpy)),
++						    InputOutput,
++						    DefaultVisual(dpy, DefaultScreen(dpy)),
++						    CWOverrideRedirect, &attr);
++				XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++				damage = XDamageCreate(dpy, win, XDamageReportNonEmpty);
++				XMapWindow(dpy, win);
++				XSync(dpy, True);
++				if (!_x_error_occurred) /* skip the run if the redirect/damage setup raised an X error */
++					run(dpy, win, "composited", options);
++				XDamageDestroy(dpy, damage);
++				XDestroyWindow(dpy, win);
++				XSync(dpy, True);
++			}
++
++			win = XCreateWindow(dpy, root,
++					    0, 0, mode->width/2, mode->height/2, 0,
++					    DefaultDepth(dpy, DefaultScreen(dpy)),
++					    InputOutput,
++					    DefaultVisual(dpy, DefaultScreen(dpy)),
++					    CWOverrideRedirect, &attr);
++			XMapWindow(dpy, win);
++			run(dpy, win, "half", options);
++			XDestroyWindow(dpy, win);
++			XSync(dpy, True);
++
++			perpixel(dpy, mode->width, mode->height, options);
++
++			siblings(dpy, mode->width, mode->height,
++				 sysconf(_SC_NPROCESSORS_ONLN),
++				 options);
++
++			cousins(mode->width, mode->height,
++				sysconf(_SC_NPROCESSORS_ONLN),
++				options);
++			/* Reset the CRTC to no mode and no outputs before moving on. */
++			XRRSetCrtcConfig(dpy, res, output->crtcs[c], CurrentTime,
++					 0, 0, None, RR_Rotate_0, NULL, 0);
++		}
++
++		XRRFreeOutputInfo(output);
++	}
++
++}
++/* Entry point: runs loop() for each option set; exits 77 when the display, Present or RandR resources are missing. */
++int main(void)
++{
++	Display *dpy;
++	XRRScreenResources *res;
++	XRRCrtcInfo **original_crtc;
++	int i;
++
++	XInitThreads();
++
++	dpy = XOpenDisplay(NULL);
++	if (dpy == NULL || !has_present(dpy))
++		return 77;
++
++	if (DPMSQueryExtension(dpy, &i, &i))
++		DPMSDisable(dpy);
++
++	signal(SIGALRM, SIG_IGN);
++	XSetErrorHandler(_check_error_handler);
++
++	res = NULL;
++	if (XRRQueryVersion(dpy, &i, &i))
++		res = _XRRGetScreenResourcesCurrent(dpy, DefaultRootWindow(dpy));
++	if (res == NULL)
++		return 77;
++
++	original_crtc = malloc(sizeof(XRRCrtcInfo *)*res->ncrtc); /* saved so the layout can be restored below */
++	if (original_crtc == NULL && res->ncrtc)
++		return 1;
++	for (i = 0; i < res->ncrtc; i++)
++		original_crtc[i] = XRRGetCrtcInfo(dpy, res, res->crtcs[i]);
++
++	printf("noutput=%d, ncrtc=%d\n", res->noutput, res->ncrtc);
++	for (i = 0; i < res->ncrtc; i++)
++		XRRSetCrtcConfig(dpy, res, res->crtcs[i], CurrentTime,
++				 0, 0, None, RR_Rotate_0, NULL, 0); /* mode None, no outputs */
++
++	loop(dpy, res, 0);
++	loop(dpy, res, ASYNC);
++	if (has_xfixes(dpy))
++		loop(dpy, res, NOCOPY);
++	if (has_dri3(dpy)) {
++		loop(dpy, res, DRI3);
++		loop(dpy, res, DRI3 | ASYNC);
++	}
++
++	for (i = 0; i < res->ncrtc; i++)
++		if (original_crtc[i]) /* XRRGetCrtcInfo() may have returned NULL for this CRTC */
++			XRRSetCrtcConfig(dpy, res, res->crtcs[i], CurrentTime,
++					 original_crtc[i]->x,
++					 original_crtc[i]->y,
++					 original_crtc[i]->mode,
++					 original_crtc[i]->rotation,
++					 original_crtc[i]->outputs,
++					 original_crtc[i]->noutput);
++
++	if (DPMSQueryExtension(dpy, &i, &i))
++		DPMSEnable(dpy);
++	return 0;
++}
+diff --git a/test/present-test.c b/test/present-test.c
+index 6b562eb0..5a12a24f 100644
+--- a/test/present-test.c
++++ b/test/present-test.c
+@@ -31,7 +31,9 @@
+ #include <X11/xshmfence.h>
+ #include <X11/Xutil.h>
+ #include <X11/Xlibint.h>
++#include <X11/extensions/dpms.h>
+ #include <X11/extensions/randr.h>
++#include <X11/extensions/Xcomposite.h>
+ #include <X11/extensions/Xrandr.h>
+ #include <X11/extensions/Xrender.h>
+ #include <X11/extensions/XShm.h>
+@@ -44,6 +46,8 @@
+ #endif
+ #include <xcb/xcb.h>
+ #include <xcb/present.h>
++#include <xcb/xfixes.h>
++#include <xcb/dri3.h>
+ #include <xf86drm.h>
+ #include <i915_drm.h>
+ 
+@@ -134,12 +138,14 @@ static void *setup_msc(Display *dpy,  Window win)
+ 	return q;
+ }
+ 
+-static uint64_t check_msc(Display *dpy, Window win, void *q, uint64_t last_msc)
++static uint64_t check_msc(Display *dpy, Window win, void *q, uint64_t last_msc, uint64_t *ust)
+ {
+ 	xcb_connection_t *c = XGetXCBConnection(dpy);
++	static uint32_t serial = 1;
+ 	uint64_t msc = 0;
++	int complete = 0;
+ 
+-	xcb_present_notify_msc(c, win, 0, 0, 0, 0);
++	xcb_present_notify_msc(c, win, serial ^ 0xcc00ffee, 0, 0, 0);
+ 	xcb_flush(c);
+ 
+ 	do {
+@@ -151,82 +157,1268 @@ static uint64_t check_msc(Display *dpy, Window win, void *q, uint64_t last_msc)
+ 			break;
+ 
+ 		ce = (xcb_present_complete_notify_event_t *)ev;
+-		if (ce->kind != XCB_PRESENT_COMPLETE_KIND_PIXMAP)
++		if (ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC &&
++		    ce->serial == (serial ^ 0xcc00ffee)) {
++			msc = ce->msc;
++			if (ust)
++				*ust = ce->ust;
++			complete = 1;
++		}
++		free(ev);
++	} while (!complete);
++
++	if ((int64_t)(msc - last_msc) < 0) {
++		printf("Invalid MSC: was %llu, now %llu\n",
++		       (long long)last_msc, (long long)msc);
++	}
++
++	if (++serial == 0)
++		serial = 1;
++
++	return msc;
++}
++/* Send a NotifyMSC request (target 0, divisor 1, remainder 0) and return the msc of its completion, or 0 if the wait fails. */
++static uint64_t wait_vblank(Display *dpy, Window win, void *q)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	static uint32_t serial = 1;
++	uint64_t msc = 0;
++	int complete = 0;
++	/* The request is tagged serial ^ 0xdeadbeef so its completion can be told apart from other events on q. */
++	xcb_present_notify_msc(c, win, serial ^ 0xdeadbeef, 0, 1, 0);
++	xcb_flush(c);
++
++	do {
++		xcb_present_complete_notify_event_t *ce;
++		xcb_generic_event_t *ev;
++
++		ev = xcb_wait_for_special_event(c, q);
++		if (ev == NULL)
++			break;
++
++		ce = (xcb_present_complete_notify_event_t *)ev;
++		if (ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC &&
++		    ce->serial == (serial ^ 0xdeadbeef)) {
++			msc = ce->msc;
++			complete = 1;
++		}
++		free(ev);
++	} while (!complete);
++	/* Advance the tag for the next call, skipping 0 on wrap-around. */
++	if (++serial == 0)
++		serial = 1;
++
++	return msc;
++}
++/* Return the rounded ust delta per msc between NotifyMSC events for msc and msc + 10, or 0 if msc did not advance. */
++static uint64_t msc_interval(Display *dpy, Window win, void *q)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	uint64_t msc, ust;
++	int complete = 0;
++
++	msc = check_msc(dpy, win, q, 0, NULL);
++	/* Queue two notifications ten frames apart, tagged 0xc0ffee00 and 0xc0ffee01. */
++	xcb_present_notify_msc(c, win, 0xc0ffee00, msc, 0, 0);
++	xcb_present_notify_msc(c, win, 0xc0ffee01, msc + 10, 0, 0);
++	xcb_flush(c);
++	/* Accumulate (second - first) in msc and ust; unsigned wrap-around makes the arrival order irrelevant. */
++	ust = msc = 0;
++	do {
++		xcb_present_complete_notify_event_t *ce;
++		xcb_generic_event_t *ev;
++
++		ev = xcb_wait_for_special_event(c, q);
++		if (ev == NULL)
++			break;
++
++		ce = (xcb_present_complete_notify_event_t *)ev;
++		if (ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC &&
++		    ce->serial == 0xc0ffee00) {
++			msc -= ce->msc;
++			ust -= ce->ust;
++			complete++;
++		}
++		if (ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC &&
++		    ce->serial == 0xc0ffee01) {
++			msc += ce->msc;
++			ust += ce->ust;
++			complete++;
++		}
++		free(ev);
++	} while (complete != 2);
++
++	printf("10 frame interval: msc=%lld, ust=%lld\n",
++	       (long long)msc, (long long)ust);
++	XSync(dpy, True);
++	if (msc == 0)
++		return 0;
++
++	return (ust + msc/2) / msc; /* adding msc/2 rounds to nearest */
++}
++/* Unregister the special-event queue q from dpy's xcb connection. */
++static void teardown_msc(Display *dpy, void *q)
++{
++	xcb_unregister_for_special_event(XGetXCBConnection(dpy), q);
++}
++/* Present two fresh window-sized pixmaps back to back; returns 0 on success or if no fence can be created, else a failure count. */
++static int test_whole(Display *dpy, Window win, const char *phase)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	Pixmap pixmap;
++	struct dri3_fence fence;
++	Window root;
++	unsigned int width, height;
++	unsigned border, depth;
++	int x, y, ret = 1;
++
++	XGetGeometry(dpy, win,
++		     &root, &x, &y, &width, &height, &border, &depth);
++
++	if (dri3_create_fence(dpy, win, &fence))
++		return 0;
++
++	printf("%s: Testing simple flip: %dx%d\n", phase, width, height);
++	_x_error_occurred = 0;
++	/* Re-arm the fence; it is attached to the first present only. */
++	xshmfence_reset(fence.addr);
++
++	pixmap = XCreatePixmap(dpy, win, width, height, depth);
++	xcb_present_pixmap(c, win, pixmap, 0,
++			   0, /* valid */
++			   0, /* update */
++			   0, /* x_off */
++			   0, /* y_off */
++			   None,
++			   None, /* wait fence */
++			   fence.xid,
++			   XCB_PRESENT_OPTION_NONE,
++			   0, /* target msc */
++			   0, /* divisor */
++			   0, /* remainder */
++			   0, NULL);
++	XFreePixmap(dpy, pixmap);
++	/* The first pixmap is freed as soon as it is queued; a second present follows immediately. */
++	pixmap = XCreatePixmap(dpy, win, width, height, depth);
++	xcb_present_pixmap(c, win, pixmap, 0,
++			   0, /* valid */
++			   0, /* update */
++			   0, /* x_off */
++			   0, /* y_off */
++			   None,
++			   None, /* wait fence */
++			   None, /* sync fence */
++			   XCB_PRESENT_OPTION_NONE,
++			   0, /* target msc */
++			   0, /* divisor */
++			   0, /* remainder */
++			   0, NULL);
++	XFreePixmap(dpy, pixmap);
++	XFlush(dpy);
++	/* Wait on the fence attached to the first present; a non-zero result counts as one failure. */
++	ret = !!xshmfence_await(fence.addr);
++	dri3_fence_free(dpy, &fence);
++
++	XSync(dpy, True);
++	ret += !!_x_error_occurred;
++
++	return ret;
++}
++/* Present pixmap 60 frames past the current msc, wait for its completion, then return a fresh msc (and ust via *ust if non-NULL). */
++static uint64_t flush_flips(Display *dpy, Window win, Pixmap pixmap, void *Q, uint64_t *ust)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	uint64_t msc;
++	int complete;
++	/* Sample the current msc, then queue a present tagged 0xdeadbeef sixty frames later. */
++	msc = check_msc(dpy, win, Q, 0, NULL);
++	xcb_present_pixmap(c, win, pixmap,
++			   0xdeadbeef, /* serial */
++			   0, /* valid */
++			   0, /* update */
++			   0, /* x_off */
++			   0, /* y_off */
++			   None,
++			   None, /* wait fence */
++			   None,
++			   XCB_PRESENT_OPTION_NONE,
++			   msc + 60, /* target msc */
++			   0, /* divisor */
++			   0, /* remainder */
++			   0, NULL);
++	xcb_flush(c);
++	complete = 0;
++	do {
++		xcb_present_complete_notify_event_t *ce;
++		xcb_generic_event_t *ev;
++
++		ev = xcb_wait_for_special_event(c, Q);
++		if (ev == NULL)
++			break;
++
++		ce = (xcb_present_complete_notify_event_t *)ev;
++		complete = (ce->kind == XCB_PRESENT_COMPLETE_KIND_PIXMAP &&
++			    ce->serial == 0xdeadbeef); /* every other event on Q is dropped */
++		free(ev);
++	} while (!complete);
++	XSync(dpy, True);
++
++	return check_msc(dpy, win, Q, msc, ust);
++}
++/* Present one pixmap COUNT+1 times, waiting for each completion; returns the number of anomalies (X errors, msc drift). */
++static int test_double(Display *dpy, Window win, const char *phase, void *Q)
++{
++#define COUNT (15*60)
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	Pixmap pixmap;
++	Window root;
++	unsigned int width, height;
++	unsigned border, depth;
++	int x, y, n, ret;
++	struct {
++		uint64_t msc, ust;
++	} frame[COUNT+1];
++	int offset = 0;
++
++	XGetGeometry(dpy, win,
++		     &root, &x, &y, &width, &height, &border, &depth);
++
++	printf("%s: Testing flip double buffering: %dx%d\n", phase, width, height);
++	_x_error_occurred = 0;
++
++	pixmap = XCreatePixmap(dpy, win, width, height, depth);
++	flush_flips(dpy, win, pixmap, Q, NULL);
++	for (n = 0; n <= COUNT; n++) {
++		int complete;
++
++		xcb_present_pixmap(c, win, pixmap, n,
++				   0, /* valid */
++				   0, /* update */
++				   0, /* x_off */
++				   0, /* y_off */
++				   None,
++				   None, /* wait fence */
++				   None,
++				   XCB_PRESENT_OPTION_NONE,
++				   0, /* target msc */
++				   0, /* divisor */
++				   0, /* remainder */
++				   0, NULL);
++		xcb_flush(c);
++		/* NOTE(review): if the wait below returns NULL, frame[n] stays unset yet is still read after the loop. */
++		complete = 0;
++		do {
++			xcb_present_complete_notify_event_t *ce;
++			xcb_generic_event_t *ev;
++
++			ev = xcb_wait_for_special_event(c, Q);
++			if (ev == NULL)
++				break;
++
++			ce = (xcb_present_complete_notify_event_t *)ev;
++			if (ce->kind == XCB_PRESENT_COMPLETE_KIND_PIXMAP &&
++			    ce->serial == n) {
++				frame[n].msc = ce->msc;
++				frame[n].ust = ce->ust;
++				complete = 1;
++			}
++			free(ev);
++		} while (!complete);
++	}
++	XFreePixmap(dpy, pixmap);
++
++	XSync(dpy, True);
++	ret = !!_x_error_occurred;
++	/* Only when the whole run is off, list each frame where the drift relative to frame[0] changes. */
++	if (frame[COUNT].msc - frame[0].msc != COUNT) {
++		printf("Expected %d frames interval, %d elapsed instead\n",
++		       COUNT, (int)(frame[COUNT].msc - frame[0].msc));
++		for (n = 0; n <= COUNT; n++) {
++			if (frame[n].msc - frame[0].msc != n + offset) {
++				printf("frame[%d]: msc=%03lld, ust=%lld\n", n,
++				       (long long)(frame[n].msc - frame[0].msc),
++				       (long long)(frame[n].ust - frame[0].ust));
++				offset = frame[n].msc - frame[0].msc - n;
++				ret++;
++			}
++		}
++	}
++
++	return ret;
++}
++/* Queue ten presents 15*60 frames apart plus a sentinel and check when each completes; returns the failure count. */
++static int test_future(Display *dpy, Window win, const char *phase, void *Q)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	Pixmap pixmap;
++	struct dri3_fence fence;
++	Window root;
++	unsigned int width, height;
++	unsigned border, depth;
++	int x, y, ret = 0, n;
++	uint64_t msc, ust;
++	int complete, count;
++	int early = 0, late = 0;
++	int earliest = 0, latest = 0;
++	uint64_t interval;
++
++	XGetGeometry(dpy, win,
++		     &root, &x, &y, &width, &height, &border, &depth);
++	/* NOTE(review): the fence is never attached to a present here; it seems to serve only as a capability check. */
++	if (dri3_create_fence(dpy, win, &fence))
++		return 0;
++
++	printf("%s: Testing flips into the future: %dx%d\n", phase, width, height);
++	_x_error_occurred = 0;
++	interval = msc_interval(dpy, win, Q);
++	if (interval == 0) {
++		printf("Zero delay between frames\n");
++		dri3_fence_free(dpy, &fence); /* do not leak the fence on the early exit */
++		return 1;
++	}
++
++	pixmap = XCreatePixmap(dpy, win, width, height, depth);
++	msc = flush_flips(dpy, win, pixmap, Q, &ust);
++	for (n = 1; n <= 10; n++)
++		xcb_present_pixmap(c, win, pixmap,
++				   n, /* serial */
++				   0, /* valid */
++				   0, /* update */
++				   0, /* x_off */
++				   0, /* y_off */
++				   None,
++				   None, /* wait fence */
++				   None,
++				   XCB_PRESENT_OPTION_NONE,
++				   msc + 60 + n*15*60, /* target msc */
++				   0, /* divisor */
++				   0, /* remainder */
++				   0, NULL);
++	xcb_present_pixmap(c, win, pixmap,
++			   0xdeadbeef, /* serial */
++			   0, /* valid */
++			   0, /* update */
++			   0, /* x_off */
++			   0, /* y_off */
++			   None,
++			   None, /* wait fence */
++			   None,
++			   XCB_PRESENT_OPTION_NONE,
++			   msc + 60 + n*15*60, /* target msc; n == 11 here, one step past the last numbered flip */
++			   0, /* divisor */
++			   0, /* remainder */
++			   0, NULL);
++	xcb_flush(c);
++	/* Drain completions until the sentinel arrives, checking each numbered flip against its target msc. */
++	complete = 0;
++	count = 0;
++	do {
++		xcb_present_complete_notify_event_t *ce;
++		xcb_generic_event_t *ev;
++
++		ev = xcb_wait_for_special_event(c, Q);
++		if (ev == NULL)
++			break;
++
++		ce = (xcb_present_complete_notify_event_t *)ev;
++		assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_PIXMAP);
++
++		if (ce->serial == 0xdeadbeef) {
++			int64_t time;
++
++			time = ce->ust - (ust + (60 + 15*60*n) * interval);
++			if (time < -(int64_t)interval) {
++				fprintf(stderr,
++					"\tflips completed too early by %lldms\n",
++					(long long)(-time / 1000));
++			} else if (time > (int64_t)interval) {
++				fprintf(stderr,
++					"\tflips completed too late by %lldms\n",
++					(long long)(time / 1000));
++			}
++			complete = 1;
++		} else {
++			int diff = (int64_t)(ce->msc - (15*60*ce->serial + msc + 60));
++			if (diff < 0) {
++				if (-diff > earliest) {
++					fprintf(stderr, "\tframe %d displayed early by %d frames\n", ce->serial, -diff);
++					earliest = -diff;
++				}
++				early++;
++				ret++;
++			} else if (diff > 0) {
++				if (diff > latest) {
++					fprintf(stderr, "\tframe %d displayed late by %d frames\n", ce->serial, diff);
++					latest = diff;
++				}
++				late++;
++				ret++;
++			}
++			count++;
++		}
++		free(ev);
++	} while (!complete);
++
++	if (early)
++		printf("\t%d frames shown too early (worst %d)!\n", early, earliest);
++	if (late)
++		printf("\t%d frames shown too late (worst %d)!\n", late, latest);
++
++	if (count != 10) {
++		fprintf(stderr, "Sentinel frame received too early! %d frames outstanding\n", 10 - count);
++		ret++;
++
++		do {
++			xcb_present_complete_notify_event_t *ce;
++			xcb_generic_event_t *ev;
++
++			ev = xcb_wait_for_special_event(c, Q);
++			if (ev == NULL)
++				break;
++
++			ce = (xcb_present_complete_notify_event_t *)ev;
++			assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_PIXMAP);
++			free(ev);
++		} while (++count != 10);
++	}
++	XFreePixmap(dpy, pixmap); /* release what this test allocated */
++	dri3_fence_free(dpy, &fence);
++	ret += !!_x_error_occurred;
++	return ret;
++}
++/* Queue N_VBLANKS presents for one distant msc plus one per intervening frame, then check when the two fenced presents complete. */
++static int test_exhaustion(Display *dpy, Window win, const char *phase, void *Q)
++{
++#define N_VBLANKS 256 /* kernel event queue length: 128 vblanks */
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	Pixmap pixmap;
++	struct dri3_fence fence[2];
++	Window root;
++	xcb_xfixes_region_t region;
++	unsigned int width, height;
++	unsigned border, depth;
++	int x, y, ret = 0, n;
++	uint64_t target, final;
++
++	XGetGeometry(dpy, win,
++		     &root, &x, &y, &width, &height, &border, &depth);
++	/* NOTE(review): if only the second fence fails to be created, fence[0] is not freed before returning. */
++	if (dri3_create_fence(dpy, win, &fence[0]) ||
++	    dri3_create_fence(dpy, win, &fence[1]))
++		return 0;
++
++	printf("%s: Testing flips with long vblank queues: %dx%d\n", phase, width, height);
++	_x_error_occurred = 0;
++	/* A region created with zero rectangles is passed as the update/valid region of the presents below. */
++	region = xcb_generate_id(c);
++	xcb_xfixes_create_region(c, region, 0, NULL);
++
++	pixmap = XCreatePixmap(dpy, win, width, height, depth);
++	xshmfence_reset(fence[0].addr);
++	xshmfence_reset(fence[1].addr);
++	target = check_msc(dpy, win, Q, 0, NULL);
++	for (n = N_VBLANKS; n--; )
++		xcb_present_pixmap(c, win, pixmap, 0,
++				   0, /* valid */
++				   region, /* update */
++				   0, /* x_off */
++				   0, /* y_off */
++				   None,
++				   None, /* wait fence */
++				   None,
++				   XCB_PRESENT_OPTION_NONE,
++				   target + N_VBLANKS, /* target msc */
++				   1, /* divisor */
++				   0, /* remainder */
++				   0, NULL);
++	xcb_present_pixmap(c, win, pixmap, 0,
++			   region, /* valid */
++			   region, /* update */
++			   0, /* x_off */
++			   0, /* y_off */
++			   None,
++			   None, /* wait fence */
++			   fence[0].xid,
++			   XCB_PRESENT_OPTION_NONE,
++			   target, /* target msc */
++			   0, /* divisor */
++			   0, /* remainder */
++			   0, NULL);
++	for (n = 1; n < N_VBLANKS; n++)
++		xcb_present_pixmap(c, win, pixmap, 0,
++				   region, /* valid */
++				   region, /* update */
++				   0, /* x_off */
++				   0, /* y_off */
++				   None,
++				   None, /* wait fence */
++				   None,
++				   XCB_PRESENT_OPTION_NONE,
++				   target + n, /* target msc */
++				   0, /* divisor */
++				   0, /* remainder */
++				   0, NULL);
++	xcb_present_pixmap(c, win, pixmap, 0,
++			   region, /* valid */
++			   region, /* update */
++			   0, /* x_off */
++			   0, /* y_off */
++			   None,
++			   None, /* wait fence */
++			   fence[1].xid,
++			   XCB_PRESENT_OPTION_NONE,
++			   target + N_VBLANKS, /* target msc */
++			   0, /* divisor */
++			   0, /* remainder */
++			   0, NULL);
++	xcb_flush(c);
++	/* fence[0] belongs to the present at target: the msc read after it must be target or target + 1. */
++	ret += !!xshmfence_await(fence[0].addr);
++	final = check_msc(dpy, win, Q, 0, NULL);
++	if (final < target) {
++		printf("\tFirst flip too early, MSC was %llu, expected %llu\n",
++		       (long long)final, (long long)target);
++		ret++;
++	} else if (final > target + 1) {
++		printf("\tFirst flip too late, MSC was %llu, expected %llu\n",
++		       (long long)final, (long long)target);
++		ret++;
++	}
++	/* fence[1] belongs to the present at target + N_VBLANKS; the same one-frame tolerance applies. */
++	ret += !!xshmfence_await(fence[1].addr);
++	final = check_msc(dpy, win, Q, 0, NULL);
++	if (final < target + N_VBLANKS) {
++		printf("\tLast flip too early, MSC was %llu, expected %llu\n",
++		       (long long)final, (long long)(target + N_VBLANKS));
++		ret++;
++	} else if (final > target + N_VBLANKS + 1) {
++		printf("\tLast flip too late, MSC was %llu, expected %llu\n",
++		       (long long)final, (long long)(target + N_VBLANKS));
++		ret++;
++	}
++
++	flush_flips(dpy, win, pixmap, Q, NULL);
++
++	XFreePixmap(dpy, pixmap);
++	xcb_xfixes_destroy_region(c, region);
++	dri3_fence_free(dpy, &fence[1]);
++	dri3_fence_free(dpy, &fence[0]);
++
++	XSync(dpy, True);
++	ret += !!_x_error_occurred;
++
++	return ret;
++#undef N_VBLANKS
++}
++/* Queue one present for each of N_VBLANKS+1 consecutive target mscs plus a sentinel, and count completions that land early or late. */
++static int test_accuracy(Display *dpy, Window win, const char *phase, void *Q)
++{
++#define N_VBLANKS (60 * 120) /* ~2 minutes */
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	Pixmap pixmap;
++	Window root;
++	unsigned int width, height;
++	unsigned border, depth;
++	int x, y, ret = 0, n;
++	uint64_t target;
++	int early = 0, late = 0;
++	int earliest = 0, latest = 0;
++	int complete, count;
++
++	XGetGeometry(dpy, win,
++		     &root, &x, &y, &width, &height, &border, &depth);
++
++	printf("%s: Testing flip accuracy: %dx%d\n", phase, width, height);
++	_x_error_occurred = 0;
++
++	pixmap = XCreatePixmap(dpy, win, width, height, depth);
++	target = flush_flips(dpy, win, pixmap, Q, NULL);
++	for (n = 0; n <= N_VBLANKS; n++)
++		xcb_present_pixmap(c, win, pixmap,
++				   n, /* serial */
++				   0, /* valid */
++				   0, /* update */
++				   0, /* x_off */
++				   0, /* y_off */
++				   None,
++				   None, /* wait fence */
++				   None,
++				   XCB_PRESENT_OPTION_NONE,
++				   target + 60 + n, /* target msc */
++				   0, /* divisor */
++				   0, /* remainder */
++				   0, NULL);
++	xcb_present_pixmap(c, win, pixmap,
++			   0xdeadbeef, /* serial */
++			   0, /* valid */
++			   0, /* update */
++			   0, /* x_off */
++			   0, /* y_off */
++			   None,
++			   None, /* wait fence */
++			   None,
++			   XCB_PRESENT_OPTION_NONE,
++			   target + 60 + n, /* target msc */
++			   0, /* divisor */
++			   0, /* remainder */
++			   0, NULL);
++	xcb_flush(c);
++	/* Compare each completion's msc with its target (target + 60 + serial) until the 0xdeadbeef sentinel arrives. */
++	complete = 0;
++	count = 0;
++	do {
++		xcb_present_complete_notify_event_t *ce;
++		xcb_generic_event_t *ev;
++
++		ev = xcb_wait_for_special_event(c, Q);
++		if (ev == NULL)
++			break;
++
++		ce = (xcb_present_complete_notify_event_t *)ev;
++		assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_PIXMAP);
++
++		if (ce->serial != 0xdeadbeef) {
++			int diff = (int64_t)(ce->msc - (target + ce->serial + 60));
++			if (diff < 0) {
++				if (-diff > earliest) {
++					fprintf(stderr, "\tframe %d displayed early by %d frames\n", ce->serial, -diff);
++					earliest = -diff;
++				}
++				early++;
++				ret++;
++			} else if (diff > 0) {
++				if (diff > latest) {
++					fprintf(stderr, "\tframe %d displayed late by %d frames\n", ce->serial, diff);
++					latest = diff;
++				}
++				late++;
++				ret++;
++			}
++			count++;
++		} else
++			complete = 1;
+ 		free(ev);
+-	} while (msc == 0);
++	} while (!complete);
++
++	if (early)
++		printf("\t%d frames shown too early (worst %d)!\n", early, earliest);
++	if (late)
++		printf("\t%d frames shown too late (worst %d)!\n", late, latest);
++	/* If the sentinel arrived before every numbered frame, drain the remaining completions before returning. */
++	if (count != N_VBLANKS+1) {
++		fprintf(stderr, "Sentinel frame received too early! %d frames outstanding\n", N_VBLANKS+1 - count);
++		ret++;
++		do {
++			xcb_present_complete_notify_event_t *ce;
++			xcb_generic_event_t *ev;
++
++			ev = xcb_wait_for_special_event(c, Q);
++			if (ev == NULL)
++				break;
++
++			ce = (xcb_present_complete_notify_event_t *)ev;
++			assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_PIXMAP);
++			free(ev);
++		} while (++count != N_VBLANKS+1);
++	}
++
++	XFreePixmap(dpy, pixmap);
++
++	XSync(dpy, True);
++	ret += !!_x_error_occurred;
++
++	return ret;
++#undef N_VBLANKS
++}
++
++static int test_modulus(Display *dpy, Window win, const char *phase, void *Q)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	Pixmap pixmap;
++	Window root;
++	unsigned int width, height;
++	unsigned border, depth;
++	xcb_xfixes_region_t region;
++	int x, y, ret = 0;
++	uint64_t target;
++	int early = 0, late = 0;
++	int earliest = 0, latest = 0;
++	int complete, expect, count;
++	/* Flip once per (divisor x, remainder y) pair and check each completes on the msc it selects; returns the error count. */
++	XGetGeometry(dpy, win,
++		     &root, &x, &y, &width, &height, &border, &depth);
++
++	printf("%s: Testing flip modulus: %dx%d\n", phase, width, height);
++	_x_error_occurred = 0; /* re-read after the final XSync below */
++
++	region = xcb_generate_id(c);
++	xcb_xfixes_create_region(c, region, 0, NULL); /* region with no rectangles, passed as valid and update */
++
++	pixmap = XCreatePixmap(dpy, win, width, height, depth);
++	target = flush_flips(dpy, win, pixmap, Q, NULL); /* used below as the reference msc */
++	expect = 0; /* number of non-sentinel flips queued */
++	for (x = 1; x <= 7; x++) {
++		for (y = 0; y < x; y++) {
++			xcb_present_pixmap(c, win, pixmap,
++					   y << 16 | x, /* serial: remainder in the high half, divisor in the low */
++					   region, /* valid */
++					   region, /* update */
++					   0, /* x_off */
++					   0, /* y_off */
++					   None,
++					   None, /* wait fence */
++					   None,
++					   XCB_PRESENT_OPTION_NONE,
++					   0, /* target msc */
++					   x, /* divisor */
++					   y, /* remainder */
++					   0, NULL);
++			expect++;
++		}
++	}
++	xcb_present_pixmap(c, win, pixmap,
++			   0xdeadbeef, /* serial: sentinel that ends the wait loop */
++			   0, /* valid */
++			   0, /* update */
++			   0, /* x_off */
++			   0, /* y_off */
++			   None,
++			   None, /* wait fence */
++			   None,
++			   XCB_PRESENT_OPTION_NONE,
++			   target + 2*x, /* target msc; x == 8 once the loops above have finished */
++			   0, /* divisor */
++			   0, /* remainder */
++			   0, NULL);
++	xcb_flush(c);
++
++	complete = 0;
++	count = 0;
++	do {
++		xcb_present_complete_notify_event_t *ce;
++		xcb_generic_event_t *ev;
++
++		ev = xcb_wait_for_special_event(c, Q);
++		if (ev == NULL)
++			break;
++
++		ce = (xcb_present_complete_notify_event_t *)ev;
++		if (ce->kind != XCB_PRESENT_COMPLETE_KIND_PIXMAP)
++			break; /* NOTE(review): ev is not freed on this path */
++
++		assert(ce->serial); /* every serial queued above is non-zero (x >= 1) */
++		if (ce->serial != 0xdeadbeef) {
++			uint64_t msc;
++			int diff;
++
++			x = ce->serial & 0xffff; /* divisor */
++			y = ce->serial >> 16; /* remainder */
++
++			msc = target; /* expected msc: first value after target with msc % x == y */
++			msc -= target % x;
++			msc += y;
++			if (msc <= target)
++				msc += x;
++
++			diff = (int64_t)(ce->msc - msc); /* <0: completed early, >0: completed late */
++			if (diff < 0) {
++				if (-diff > earliest) { /* only log a new worst case */
++					fprintf(stderr, "\tframe (%d, %d) displayed early by %d frames\n", y, x, -diff);
++					earliest = -diff;
++				}
++				early++;
++				ret++;
++			} else if (diff > 0) {
++				if (diff > latest) { /* only log a new worst case */
++					fprintf(stderr, "\tframe (%d, %d) displayed late by %d frames\n", y, x, diff);
++					latest = diff;
++				}
++				late++;
++				ret++;
++			}
++			count++;
++		} else
++			complete = 1;
++		free(ev);
++	} while (!complete);
++
++	if (early)
++		printf("\t%d frames shown too early (worst %d)!\n", early, earliest);
++	if (late)
++		printf("\t%d frames shown too late (worst %d)!\n", late, latest);
++
++	if (count != expect) { /* sentinel overtook some flips: count it as an error and drain the rest */
++		fprintf(stderr, "Sentinel frame received too early! %d frames outstanding\n", expect - count);
++		ret++;
++		do {
++			xcb_present_complete_notify_event_t *ce;
++			xcb_generic_event_t *ev;
++
++			ev = xcb_wait_for_special_event(c, Q);
++			if (ev == NULL)
++				break;
++
++			ce = (xcb_present_complete_notify_event_t *)ev;
++			assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_PIXMAP); /* only flips were queued, so completions are PIXMAP events */
++			free(ev);
++		} while (++count != expect);
++	}
++
++	XFreePixmap(dpy, pixmap);
++	xcb_xfixes_destroy_region(c, region);
++
++	XSync(dpy, True);
++	ret += !!_x_error_occurred; /* any X error adds one more failure */
++
++	return ret;
++}
++
++static int test_future_msc(Display *dpy, void *Q)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	Window root = DefaultRootWindow(dpy);
++	int ret = 0, n;
++	uint64_t msc, ust;
++	int complete, count;
++	int early = 0, late = 0;
++	int earliest = 0, latest = 0;
++	uint64_t interval;
++	/* Queue ten notifies spaced 15*60 msc apart plus a sentinel, then check their order and msc; returns the error count. */
++	printf("Testing notifies into the future\n");
++	_x_error_occurred = 0;
++
++	interval = msc_interval(dpy, root, Q);
++	if (interval == 0) {
++		printf("Zero delay between frames\n");
++		return 1;
++	}
++	msc = check_msc(dpy, root, Q, 0, &ust); /* msc/ust pair used as the reference point below */
++	printf("Initial msc=%llx, interval between frames %lldus\n",
++	       (long long)msc, (long long)interval);
++
++	for (n = 1; n <= 10; n++)
++		xcb_present_notify_msc(c, root, n, msc + 60 + n*15*60, 0, 0); /* serial n identifies the notify */
++	xcb_present_notify_msc(c, root, 0xdeadbeef, msc + 60 + n*15*60, 0, 0); /* n == 11 here: sentinel one step after the last notify */
++	xcb_flush(c);
++
++	complete = 0;
++	count = 0;
++	do {
++		xcb_present_complete_notify_event_t *ce;
++		xcb_generic_event_t *ev;
++
++		ev = xcb_wait_for_special_event(c, Q);
++		if (ev == NULL)
++			break;
++
++		ce = (xcb_present_complete_notify_event_t *)ev;
++		assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC);
++
++		if (ce->serial == 0xdeadbeef) {
++			int64_t time, tolerance;
++
++			tolerance = 60 + 15*60*n/10;
++			if (tolerance < interval) /* never tighter than one frame interval */
++				tolerance = interval;
++
++			time = ce->ust - (ust + (60 + 15*60*n) * interval); /* reported ust minus the ust extrapolated from the start */
++			if (time < -(int64_t)tolerance) { /* sentinel timing is only reported; ret is not incremented */
++				fprintf(stderr,
++					"\tnotifies completed too early by %lldms, tolerance %lldus\n",
++					(long long)(-time / 1000), (long long)tolerance);
++			} else if (time > (int64_t)tolerance) {
++				fprintf(stderr,
++					"\tnotifies completed too late by %lldms, tolerance %lldus\n",
++					(long long)(time / 1000), (long long)tolerance);
++			}
++			complete = 1;
++		} else {
++			int diff = (int64_t)(ce->msc - (15*60*ce->serial + msc + 60)); /* reported msc minus the msc requested for this serial */
++
++			if (ce->serial != count + 1) { /* notifies must arrive in submission order */
++				fprintf(stderr, "vblank received out of order! expected %d, received %d\n",
++					count + 1, (int)ce->serial);
++				ret++;
++			}
++			count++;
++
++			if (diff < 0) {
++				if (-diff > earliest) { /* only log a new worst case */
++					fprintf(stderr, "\tnotify %d early by %d msc\n", ce->serial, -diff);
++					earliest = -diff;
++				}
++				early++;
++				ret++;
++			} else if (diff > 0) {
++				if (diff > latest) { /* only log a new worst case */
++					fprintf(stderr, "\tnotify %d late by %d msc\n", ce->serial, diff);
++					latest = diff;
++				}
++				late++;
++				ret++;
++			}
++		}
++		free(ev);
++	} while (!complete);
++
++	if (early)
++		printf("\t%d notifies too early (worst %d)!\n", early, earliest);
++	if (late)
++		printf("\t%d notifies too late (worst %d)!\n", late, latest);
++
++	if (count != 10) { /* sentinel arrived before all ten notifies: drain the remainder */
++		fprintf(stderr, "Sentinel vblank received too early! %d waits outstanding\n", 10 - count);
++		ret++;
++		do {
++			xcb_present_complete_notify_event_t *ce;
++			xcb_generic_event_t *ev;
++
++			ev = xcb_wait_for_special_event(c, Q);
++			if (ev == NULL)
++				break;
++
++			ce = (xcb_present_complete_notify_event_t *)ev;
++			assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC);
++			free(ev);
++		} while (++count != 10);
++	}
++
++	XSync(dpy, True);
++	ret += !!_x_error_occurred; /* any X error adds one more failure */
++
++	return ret;
++}
++
++static int test_wrap_msc(Display *dpy)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	Window root, win;
++	int x, y;
++	unsigned int width, height;
++	unsigned border, depth;
++	XSetWindowAttributes attr;
++	int ret = 0, n;
++	uint64_t msc, ust;
++	int complete;
++	uint64_t interval;
++	void *Q;
++	/* Queue notifies about n<<32 msc away; none may complete before the near-term sentinel. Returns the error count. */
++	XGetGeometry(dpy, DefaultRootWindow(dpy),
++		     &root, &x, &y, &width, &height, &border, &depth);
++
++	attr.override_redirect = 1;
++	win = XCreateWindow(dpy, root,
++			    0, 0, width, height, 0, depth,
++			    InputOutput, DefaultVisual(dpy, DefaultScreen(dpy)),
++			    CWOverrideRedirect, &attr);
++	XMapWindow(dpy, win);
++	XSync(dpy, True);
++	if (_x_error_occurred)
++		return 1;
+ 
+-	if (msc < last_msc) {
+-		printf("Invalid MSC: was %llu, now %llu\n",
+-		       (long long)last_msc, (long long)msc);
++	printf("Testing wraparound notifies\n");
++	_x_error_occurred = 0;
++
++	Q = setup_msc(dpy, win);
++	interval = msc_interval(dpy, win, Q);
++	if (interval == 0) {
++		printf("Zero delay between frames\n");
++		return 1; /* NOTE(review): returns without teardown_msc()/XDestroyWindow() */
+ 	}
++	msc = check_msc(dpy, win, Q, 0, &ust);
++	printf("Initial msc=%llx, interval between frames %lldus\n",
++	       (long long)msc, (long long)interval);
++
++	for (n = 1; n <= 10; n++)
++		xcb_present_notify_msc(c, win, n,
++				       msc + ((long long)n<<32) + n, /* absolute target beyond 32 bits of msc */
++				       0, 0);
++	for (n = 1; n <= 10; n++)
++		xcb_present_notify_msc(c, win, -n, /* negative serials mark the divisor-based requests */
++				       0, (long long)n << 32, 0); /* target 0 with a divisor of n<<32 */
++	xcb_present_notify_msc(c, win, 0xdeadbeef, msc + 60*10, 0, 0); /* sentinel 600 msc ahead */
++	xcb_flush(c);
+ 
+-	return msc;
++	complete = 0;
++	do {
++		xcb_present_complete_notify_event_t *ce;
++		xcb_generic_event_t *ev;
++
++		ev = xcb_wait_for_special_event(c, Q);
++		if (ev == NULL)
++			break;
++
++		ce = (xcb_present_complete_notify_event_t *)ev;
++		assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC);
++
++		if (ce->serial == 0xdeadbeef) {
++			complete = 1;
++		} else { /* any other notify seen before the sentinel is a failure */
++			fprintf(stderr,
++				"\tnotify %d received at +%llu\n",
++				(int)ce->serial, (unsigned long long)(ce->msc - msc)); /* casts match %d/%llu exactly */
++			ret++;
++		}
++		free(ev);
++	} while (!complete);
++
++	teardown_msc(dpy, Q);
++	XDestroyWindow(dpy, win);
++	XSync(dpy, True);
++
++	return ret;
+ }
+ 
+-static void teardown_msc(Display *dpy, void *q)
++static int test_exhaustion_msc(Display *dpy, void *Q)
+ {
+-	xcb_unregister_for_special_event(XGetXCBConnection(dpy), q);
++#define N_VBLANKS 256 /* kernel event queue length: 128 vblanks */
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	Window root = DefaultRootWindow(dpy);
++	int ret = 0, n, complete;
++	int earliest = 0, early = 0;
++	int latest = 0, late = 0;
++	uint64_t msc;
++	/* Queue 2*N_VBLANKS notifies in one batch and check each completes on the msc it asked for; returns the error count. */
++	printf("Testing notifies with long queues\n");
++	_x_error_occurred = 0;
++
++	msc = check_msc(dpy, root, Q, 0, NULL);
++	for (n = N_VBLANKS; n--; )
++		xcb_present_notify_msc(c, root, N_VBLANKS, msc + N_VBLANKS, 0, 0); /* N_VBLANKS requests for the same, furthest msc */
++	for (n = 1; n <= N_VBLANKS ; n++)
++		xcb_present_notify_msc(c, root, n, msc + n, 0, 0); /* one request per msc; serial == offset from msc */
++	xcb_flush(c);
++
++	complete = 2*N_VBLANKS; /* events still expected, one per request */
++	do {
++		xcb_present_complete_notify_event_t *ce;
++		xcb_generic_event_t *ev;
++		int diff;
++
++		ev = xcb_wait_for_special_event(c, Q);
++		if (ev == NULL)
++			break;
++
++		ce = (xcb_present_complete_notify_event_t *)ev;
++		assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC);
++
++		diff = (int64_t)(ce->msc - msc - ce->serial); /* serial is the requested offset, so 0 means on time */
++		if (diff < 0) {
++			if (-diff > earliest) { /* only log a new worst case */
++				fprintf(stderr, "\tnotify %d early by %d msc\n",(int)ce->serial, -diff);
++				earliest = -diff;
++			}
++			early++;
++			ret++;
++		} else if (diff > 0) {
++			if (diff > latest) { /* only log a new worst case */
++				fprintf(stderr, "\tnotify %d late by %d msc\n", (int)ce->serial, diff);
++				latest = diff;
++			}
++			late++;
++			ret++;
++		}
++		free(ev);
++	} while (--complete);
++
++	if (early)
++		printf("\t%d notifies too early (worst %d)!\n", early, earliest);
++	if (late)
++		printf("\t%d notifies too late (worst %d)!\n", late, latest);
++
++	XSync(dpy, True);
++	ret += !!_x_error_occurred; /* any X error adds one more failure */
++
++	return ret;
++#undef N_VBLANKS
+ }
+-static int test_whole(Display *dpy)
++
++static int test_accuracy_msc(Display *dpy, void *Q)
+ {
+-	Pixmap pixmap;
+-	struct dri3_fence fence;
+-	Window root;
+-	unsigned int width, height;
+-	unsigned border, depth;
+-	int x, y, ret = 1;
++#define N_VBLANKS (60 * 120) /* ~2 minutes of consecutive vblanks, one notify each */
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	Window root = DefaultRootWindow(dpy);
++	int ret = 0, n;
++	uint64_t msc;
++	int early = 0, late = 0;
++	int earliest = 0, latest = 0;
++	int complete, count;
+ 
+-	XGetGeometry(dpy, DefaultRootWindow(dpy),
+-		     &root, &x, &y, &width, &height, &border, &depth);
++	printf("Testing notify accuracy\n");
++	_x_error_occurred = 0;
+ 
+-	if (dri3_create_fence(dpy, root, &fence))
+-		return 0;
++	msc = check_msc(dpy, root, Q, 0, NULL);
++	for (n = 0; n <= N_VBLANKS; n++)
++		xcb_present_notify_msc(c, root, n, msc + 60 + n, 0, 0); /* N_VBLANKS+1 notifies, serial n for msc + 60 + n */
++	xcb_present_notify_msc(c, root, 0xdeadbeef, msc + 60 + n, 0, 0); /* sentinel on the msc after the last notify */
++	xcb_flush(c);
++	/* Drain completions until the sentinel; each notify must complete exactly on its msc. Returns the error count. */
++	complete = 0;
++	count = 0;
++	do {
++		xcb_present_complete_notify_event_t *ce;
++		xcb_generic_event_t *ev;
++
++		ev = xcb_wait_for_special_event(c, Q);
++		if (ev == NULL)
++			break;
++
++		ce = (xcb_present_complete_notify_event_t *)ev;
++		assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC);
++
++		if (ce->serial != 0xdeadbeef) {
++			int diff = (int64_t)(ce->msc - (msc + ce->serial + 60)); /* reported msc minus the msc requested for this serial */
++			if (diff < 0) {
++				if (-diff > earliest) { /* only log a new worst case */
++					fprintf(stderr, "\tnotify %d early by %d msc\n", ce->serial, -diff);
++					earliest = -diff;
++				}
++				early++;
++				ret++;
++			} else if (diff > 0) {
++				if (diff > latest) { /* only log a new worst case */
++					fprintf(stderr, "\tnotify %d late by %d msc\n", ce->serial, diff);
++					latest = diff;
++				}
++				late++;
++				ret++;
++			}
++			count++;
++		} else
++			complete = 1;
++		free(ev);
++	} while (!complete);
++
++	if (early)
++		printf("\t%d notifies too early (worst %d)!\n", early, earliest);
++	if (late)
++		printf("\t%d notifies too late (worst %d)!\n", late, latest);
++
++	if (count != N_VBLANKS+1) { /* sentinel arrived before every notify: drain the remainder */
++		fprintf(stderr, "Sentinel vblank received too early! %d waits outstanding\n", N_VBLANKS+1 - count);
++		ret++;
++		do {
++			xcb_present_complete_notify_event_t *ce;
++			xcb_generic_event_t *ev;
++
++			ev = xcb_wait_for_special_event(c, Q);
++			if (ev == NULL)
++				break;
++
++			ce = (xcb_present_complete_notify_event_t *)ev;
++			assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC);
++			free(ev);
++		} while (++count != N_VBLANKS+1);
++	}
++
++	XSync(dpy, True);
++	ret += !!_x_error_occurred; /* any X error adds one more failure */
++
++	return ret;
++#undef N_VBLANKS
++}
+ 
+-	printf("Testing whole screen flip: %dx%d\n", width, height);
++static int test_modulus_msc(Display *dpy, void *Q)
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	Window root = DefaultRootWindow(dpy);
++	xcb_present_complete_notify_event_t *ce;
++	xcb_generic_event_t *ev;
++	int x, y, ret = 0;
++	uint64_t target;
++	int early = 0, late = 0;
++	int earliest = 0, latest = 0;
++	int complete, count, expect;
++
++	printf("Testing notify modulus\n");
+ 	_x_error_occurred = 0;
+ 
+-	xshmfence_reset(fence.addr);
++	target = wait_vblank(dpy, root, Q);
+ 
+-	pixmap = XCreatePixmap(dpy, root, width, height, depth);
+-	xcb_present_pixmap(XGetXCBConnection(dpy),
+-			   root, pixmap,
+-			   0, /* sbc */
+-			   0, /* valid */
+-			   0, /* update */
+-			   0, /* x_off */
+-			   0, /* y_off */
+-			   None,
+-			   None, /* wait fence */
+-			   fence.xid,
+-			   XCB_PRESENT_OPTION_NONE,
+-			   0, /* target msc */
+-			   0, /* divisor */
+-			   0, /* remainder */
+-			   0, NULL);
+-	XFreePixmap(dpy, pixmap);
++	expect = 0;
++	xcb_present_notify_msc(c, root, 0, 0, 0, 0);
++	for (x = 1; x <= 19; x++) {
++		for (y = 0; y < x; y++) {
++			xcb_present_notify_msc(c, root, y << 16 | x, 0, x, y);
++			expect++;
++		}
++	}
++	xcb_present_notify_msc(c, root, 0xdeadbeef, target + 2*x, 0, 0);
++	xcb_flush(c);
+ 
+-	pixmap = XCreatePixmap(dpy, root, width, height, depth);
+-	xcb_present_pixmap(XGetXCBConnection(dpy),
+-			   root, pixmap,
+-			   0, /* sbc */
+-			   0, /* valid */
+-			   0, /* update */
+-			   0, /* x_off */
+-			   0, /* y_off */
+-			   None,
+-			   None, /* wait fence */
+-			   None, /* sync fence */
+-			   XCB_PRESENT_OPTION_NONE,
+-			   0, /* target msc */
+-			   0, /* divisor */
+-			   0, /* remainder */
+-			   0, NULL);
+-	XFreePixmap(dpy, pixmap);
+-	XFlush(dpy);
++	ev = xcb_wait_for_special_event(c, Q);
++	if (ev) {
++		ce = (xcb_present_complete_notify_event_t *)ev;
++		assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC);
++		assert(ce->serial == 0);
++		assert(target == ce->msc);
++		target = ce->msc;
++	}
+ 
+-	ret = !!xshmfence_await(fence.addr);
+-	dri3_fence_free(dpy, &fence);
++	complete = 0;
++	count = 0;
++	do {
++		ev = xcb_wait_for_special_event(c, Q);
++		if (ev == NULL)
++			break;
++
++		ce = (xcb_present_complete_notify_event_t *)ev;
++		assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC);
++
++		assert(ce->serial);
++		if (ce->serial != 0xdeadbeef) {
++			uint64_t msc;
++			int diff;
++
++			x = ce->serial & 0xffff;
++			y = ce->serial >> 16;
++
++			msc = target;
++			msc -= target % x;
++			msc += y;
++			if (msc <= target)
++				msc += x;
++
++			diff = (int64_t)(ce->msc - msc);
++			if (diff < 0) {
++				if (-diff > earliest) {
++					fprintf(stderr, "\tnotify (%d, %d) early by %d msc (target %lld, reported %lld)\n", y, x, -diff, (long long)msc, (long long)ce->msc);
++					earliest = -diff;
++				}
++				early++;
++				ret++;
++			} else if (diff > 0) {
++				if (diff > latest) {
++					fprintf(stderr, "\tnotify (%d, %d) late by %d msc (target %lld, reported %lld)\n", y, x, diff, (long long)msc, (long long)ce->msc);
++					latest = diff;
++				}
++				late++;
++				ret++;
++			}
++			count++;
++		} else
++			complete = 1;
++		free(ev);
++	} while (!complete);
++
++	if (early)
++		printf("\t%d notifies too early (worst %d)!\n", early, earliest);
++	if (late)
++		printf("\t%d notifies too late (worst %d)!\n", late, latest);
++
++	if (count != expect) {
++		fprintf(stderr, "Sentinel vblank received too early! %d waits outstanding\n", expect - count);
++		ret++;
++		do {
++			ev = xcb_wait_for_special_event(c, Q);
++			if (ev == NULL)
++				break;
++
++			ce = (xcb_present_complete_notify_event_t *)ev;
++			assert(ce->kind == XCB_PRESENT_COMPLETE_KIND_NOTIFY_MSC);
++			free(ev);
++		} while (++count != expect);
++	}
+ 
+ 	XSync(dpy, True);
+ 	ret += !!_x_error_occurred;
+@@ -279,8 +1471,6 @@ static int for_each_crtc(Display *dpy,
+ 	for (i = 0; i < res->ncrtc; i++)
+ 		original_crtc[i] = XRRGetCrtcInfo(dpy, res, res->crtcs[i]);
+ 
+-	printf("noutput=%d, ncrtc=%d\n", res->noutput, res->ncrtc);
+-
+ 	for (i = 0; i < res->noutput; i++) {
+ 		XRROutputInfo *output;
+ 		XRRModeInfo *mode;
+@@ -322,7 +1512,7 @@ static int for_each_crtc(Display *dpy,
+ 	free(original_crtc);
+ 	XRRFreeScreenResources(res);
+ 
+-	return j;
++	return err;
+ }
+ 
+ struct test_crtc {
+@@ -335,6 +1525,7 @@ struct test_crtc {
+ 	uint64_t msc;
+ };
+ #define SYNC 0x1
++#define FUTURE 0x2
+ 
+ static int __test_crtc(Display *dpy, RRCrtc crtc,
+ 		       int width, int height,
+@@ -344,7 +1535,7 @@ static int __test_crtc(Display *dpy, RRCrtc crtc,
+ 	Pixmap pixmap;
+ 	int err = 0;
+ 
+-	test->msc = check_msc(dpy, test->win, test->queue, test->msc);
++	test->msc = check_msc(dpy, test->win, test->queue, test->msc, NULL);
+ 
+ 	if (test->flags & SYNC)
+ 		xshmfence_reset(test->fence.addr);
+@@ -361,16 +1552,14 @@ static int __test_crtc(Display *dpy, RRCrtc crtc,
+ 			   None, /* wait fence */
+ 			   test->flags & SYNC ? test->fence.xid : None,
+ 			   XCB_PRESENT_OPTION_NONE,
+-			   0, /* target msc */
++			   test->msc, /* target msc */
+ 			   1, /* divisor */
+ 			   0, /* remainder */
+ 			   0, NULL);
+-	XFreePixmap(dpy, pixmap);
+-
+ 	if (test->flags & SYNC) {
+-		pixmap = XCreatePixmap(dpy, test->win, width, height, test->depth);
++		Pixmap tmp = XCreatePixmap(dpy, test->win, width, height, test->depth);
+ 		xcb_present_pixmap(XGetXCBConnection(dpy),
+-				   test->win, pixmap,
++				   test->win, tmp,
+ 				   1, /* sbc */
+ 				   0, /* valid */
+ 				   0, /* update */
+@@ -380,16 +1569,17 @@ static int __test_crtc(Display *dpy, RRCrtc crtc,
+ 				   None, /* wait fence */
+ 				   None, /* sync fence */
+ 				   XCB_PRESENT_OPTION_NONE,
+-				   1, /* target msc */
++				   test->msc + (test->flags & FUTURE ? 5 * 16 : 1), /* target msc */
+ 				   1, /* divisor */
+ 				   0, /* remainder */
+ 				   0, NULL);
+-		XFreePixmap(dpy, pixmap);
++		XFreePixmap(dpy, tmp);
+ 		XFlush(dpy);
+ 		err += !!xshmfence_await(test->fence.addr);
+ 	}
++	XFreePixmap(dpy, pixmap);
+ 
+-	test->msc = check_msc(dpy, test->win, test->queue, test->msc);
++	test->msc = check_msc(dpy, test->win, test->queue, test->msc, NULL);
+ 	return err;
+ }
+ 
+@@ -410,15 +1600,23 @@ static int test_crtc(Display *dpy, void *queue, uint64_t last_msc)
+ 
+ 	printf("Testing each crtc, without waiting for each flip\n");
+ 	test.flags = 0;
++	test.msc = check_msc(dpy, test.win, test.queue, test.msc, NULL);
+ 	err += for_each_crtc(dpy, __test_crtc, &test);
++	test.msc = check_msc(dpy, test.win, test.queue, test.msc, NULL);
+ 
+ 	printf("Testing each crtc, waiting for flips to complete\n");
+ 	test.flags = SYNC;
++	test.msc = check_msc(dpy, test.win, test.queue, test.msc, NULL);
+ 	err += for_each_crtc(dpy, __test_crtc, &test);
++	test.msc = check_msc(dpy, test.win, test.queue, test.msc, NULL);
+ 
+-	test.msc = check_msc(dpy, test.win, test.queue, test.msc);
+-	dri3_fence_free(dpy, &test.fence);
++	printf("Testing each crtc, with future flips\n");
++	test.flags = FUTURE | SYNC;
++	test.msc = check_msc(dpy, test.win, test.queue, test.msc, NULL);
++	err += for_each_crtc(dpy, __test_crtc, &test);
++	test.msc = check_msc(dpy, test.win, test.queue, test.msc, NULL);
+ 
++	dri3_fence_free(dpy, &test.fence);
+ 	XSync(dpy, True);
+ 	err += !!_x_error_occurred;
+ 
+@@ -536,6 +1734,31 @@ static int gem_set_caching(int fd, uint32_t handle, int caching)
+ 	return drmIoctl(fd, LOCAL_IOCTL_I915_GEM_SET_CACHING, &arg) == 0;
+ }
+ 
++static int gem_set_tiling(int fd, uint32_t handle, int tiling, int stride)
++{
++	struct drm_i915_gem_set_tiling arg;
++
++	/*
++	 * Apply the tiling mode and stride to the GEM handle, retrying for
++	 * as long as the ioctl fails with EINTR, or with EAGAIN (after
++	 * yielding the cpu). Returns 1 on success and 0 on any other
++	 * error.
++	 */
++	for (;;) {
++		/* The request is refilled on every attempt. */
++		arg.handle = handle;
++		arg.tiling_mode = tiling;
++		arg.stride = stride;
++		if (drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &arg) == 0)
++			return 1;
++		if (errno == EINTR)
++			continue;
++		if (errno != EAGAIN)
++			return 0;
++		sched_yield();
++	}
++}
++
+ static int gem_export(int fd, uint32_t handle)
+ {
+ 	struct drm_prime_handle args;
+@@ -557,6 +1780,126 @@ static void gem_close(int fd, uint32_t handle)
+ 	(void)drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
+ }
+ 
++static int test_dri3_tiling(Display *dpy)
++{
++	Window win = DefaultRootWindow(dpy);
++	const int tiling[] = { I915_TILING_NONE, I915_TILING_X, I915_TILING_Y };
++	Window root;
++	unsigned int width, height;
++	unsigned border, depth, bpp;
++	unsigned stride, size;
++	void *Q;
++	int x, y;
++	int device;
++	int line = -1; /* source line of the failing step, for the report at fail: */
++	int t;
++	/* Present a GEM-backed DRI3 pixmap once per tiling mode; returns 0 on success or when skipped, 1 on failure. */
++	device = dri3_open(dpy);
++	if (device < 0)
++		return 0; /* no DRI3 device: nothing to test */
++
++	if (!is_intel(device))
++		return 0; /* NOTE(review): device is not closed on this or any later path */
++
++	printf("Opened Intel DRI3 device\n");
++
++	XGetGeometry(dpy, win, &root, &x, &y,
++		     &width, &height, &border, &depth);
++
++	switch (depth) {
++	case 8: bpp = 8; break;
++	case 15: case 16: bpp = 16; break;
++	case 24: case 32: bpp = 32; break;
++	default: return 0; /* other depths are skipped */
++	}
++
++	stride = ALIGN(width * bpp/8, 512);
++	size = PAGE_ALIGN(stride * ALIGN(height, 32));
++	printf("Creating DRI3 %dx%d (source stride=%d, size=%d) for GTT\n",
++	       width, height, stride, size);
++
++	_x_error_occurred = 0;
++	Q = setup_msc(dpy, root);
++
++	for (t = 0; t < sizeof(tiling)/sizeof(tiling[0]); t++) {
++		uint64_t msc;
++		uint32_t src;
++		int src_fd;
++		Pixmap src_pix;
++
++		src = gem_create(device, size);
++		if (!src) {
++			line = __LINE__;
++			goto fail;
++		}
++
++		gem_set_tiling(device, src, tiling[t], stride); /* result is not checked */
++
++		src_fd = gem_export(device, src);
++		if (src_fd < 0) {
++			line = __LINE__;
++			goto fail;
++		}
++
++		src_pix = dri3_create_pixmap(dpy, root,
++					     width, height, depth,
++					     src_fd, bpp, stride, size);
++
++		msc = wait_vblank(dpy, root, Q);
++
++		xcb_present_pixmap(XGetXCBConnection(dpy),
++				   win, src_pix,
++				   0, /* sbc */
++				   0, /* valid */
++				   0, /* update */
++				   0, /* x_off */
++				   0, /* y_off */
++				   None,
++				   None, /* wait fence */
++				   None,
++				   XCB_PRESENT_OPTION_NONE,
++				   msc + 2, /* target msc */
++				   1, /* divisor */
++				   0, /* remainder */
++				   0, NULL);
++
++		xcb_present_pixmap(XGetXCBConnection(dpy),
++				   win, src_pix,
++				   0, /* sbc */
++				   0, /* valid */
++				   0, /* update */
++				   0, /* x_off */
++				   0, /* y_off */
++				   None,
++				   None, /* wait fence */
++				   None,
++				   XCB_PRESENT_OPTION_NONE,
++				   msc + 3, /* target msc: same pixmap again, one msc later */
++				   1, /* divisor */
++				   0, /* remainder */
++				   0, NULL);
++
++		XSync(dpy, True);
++		if (_x_error_occurred) {
++			line = __LINE__;
++			goto fail;
++		}
++		XFreePixmap(dpy, src_pix);
++		_x_error_occurred = 0;
++
++		close(src_fd);
++		gem_close(device, src);
++	}
++
++	teardown_msc(dpy, Q);
++	return 0;
++
++fail: /* NOTE(review): src, src_fd and src_pix of the failing iteration are not released here */
++	printf("%s failed with tiling %d, line %d\n", __func__, tiling[t], line);
++	teardown_msc(dpy, Q);
++	return 1;
++}
++
+ static int test_dri3(Display *dpy)
+ {
+ 	Window win = DefaultRootWindow(dpy);
+@@ -670,8 +2013,32 @@ fail:
+ static int has_present(Display *dpy)
+ {
+ 	xcb_connection_t *c = XGetXCBConnection(dpy);
+-	xcb_present_query_version_reply_t *reply;
+ 	xcb_generic_error_t *error = NULL;
++	void *reply;
++
++	reply = xcb_xfixes_query_version_reply(c,
++					       xcb_xfixes_query_version(c,
++									XCB_XFIXES_MAJOR_VERSION,
++									XCB_XFIXES_MINOR_VERSION),
++					       &error);
++	free(reply);
++	free(error);
++	if (reply == NULL) {
++		fprintf(stderr, "XFixes not supported on %s\n", DisplayString(dpy));
++		return 0;
++	}
++
++	reply = xcb_dri3_query_version_reply(c,
++					     xcb_dri3_query_version(c,
++								    XCB_DRI3_MAJOR_VERSION,
++								    XCB_DRI3_MINOR_VERSION),
++					     &error);
++	free(reply);
++	free(error);
++	if (reply == NULL) {
++		fprintf(stderr, "DRI3 not supported on %s\n", DisplayString(dpy));
++		return 0;
++	}
+ 
+ 	reply = xcb_present_query_version_reply(c,
+ 						xcb_present_query_version(c,
+@@ -681,14 +2048,32 @@ static int has_present(Display *dpy)
+ 
+ 	free(reply);
+ 	free(error);
++	if (reply == NULL) {
++		fprintf(stderr, "Present not supported on %s\n", DisplayString(dpy));
++		return 0;
++	}
++
++	return 1;
++}
++
++static int has_composite(Display *dpy)
++{
++	int event, error;
++	int major, minor;
++	/* Non-zero when the Composite extension is present and reports major > 0 or minor >= 4. */
++	if (!XCompositeQueryExtension(dpy, &event, &error))
++		return 0;
++
++	XCompositeQueryVersion(dpy, &major, &minor); /* return value is not checked */
+ 
+-	return reply != NULL;
++	return major > 0 || minor >= 4;
+ }
+ 
+ int main(void)
+ {
+ 	Display *dpy;
+ 	Window root;
++	int dummy;
+ 	int error = 0;
+ 	uint64_t last_msc;
+ 	void *queue;
+@@ -700,27 +2085,135 @@ int main(void)
+ 	if (!has_present(dpy))
+ 		return 77;
+ 
++	if (DPMSQueryExtension(dpy, &dummy, &dummy))
++		DPMSDisable(dpy);
++
+ 	root = DefaultRootWindow(dpy);
+ 
+ 	signal(SIGALRM, SIG_IGN);
+ 	XSetErrorHandler(_check_error_handler);
+ 
+ 	queue = setup_msc(dpy, root);
+-	last_msc = check_msc(dpy, root, queue, 0);
++	last_msc = check_msc(dpy, root, queue, 0, NULL);
++
++	error += test_future_msc(dpy, queue);
++	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
++
++	error += test_wrap_msc(dpy);
++	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
++
++	error += test_accuracy_msc(dpy, queue);
++	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
++
++	error += test_modulus_msc(dpy, queue);
++	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
++
++	error += test_exhaustion_msc(dpy, queue);
++	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
++
++	for (dummy = 0; dummy <= 3; dummy++) {
++		Window win;
++		uint64_t msc = 0;
++		XSetWindowAttributes attr;
++		Visual *visual = DefaultVisual(dpy, DefaultScreen(dpy));
++		unsigned int width, height;
++		unsigned border, depth;
++		const char *phase;
++		int x, y;
++		void *Q;
++
++		attr.override_redirect = 1;
++
++		XGetGeometry(dpy, root, &win, &x, &y,
++			     &width, &height, &border, &depth);
++
++		_x_error_occurred = 0;
++		switch (dummy) {
++		case 0:
++			win = root;
++			phase = "root";
++			break;
++		case 1:
++			win = XCreateWindow(dpy, root,
++					    0, 0, width, height, 0, depth,
++					    InputOutput, visual,
++					    CWOverrideRedirect, &attr);
++			phase = "fullscreen";
++			break;
++		case 2:
++			win = XCreateWindow(dpy, root,
++					    0, 0, width/2, height/2, 0, depth,
++					    InputOutput, visual,
++					    CWOverrideRedirect, &attr);
++			phase = "window";
++			break;
++		case 3:
++			if (!has_composite(dpy))
++				continue;
++
++			win = XCreateWindow(dpy, root,
++					    0, 0, width, height, 0,
++					    DefaultDepth(dpy, DefaultScreen(dpy)),
++					    InputOutput,
++					    DefaultVisual(dpy, DefaultScreen(dpy)),
++					    CWOverrideRedirect, &attr);
++			XCompositeRedirectWindow(dpy, win, CompositeRedirectManual);
++			phase = "composite";
++			break;
++
++		default:
++			phase = "broken";
++			win = root;
++			abort();
++			break;
++		}
++
++		XMapWindow(dpy, win);
++		XSync(dpy, True);
++		if (_x_error_occurred)
++			continue;
++
++		Q = setup_msc(dpy, win);
++		msc = check_msc(dpy, win, Q, msc, NULL);
+ 
+-	error += test_whole(dpy);
+-	last_msc = check_msc(dpy, root, queue, last_msc);
++		error += test_whole(dpy, win, phase);
++		msc = check_msc(dpy, win, Q, msc, NULL);
++
++		error += test_double(dpy, win, phase, Q);
++		msc = check_msc(dpy, win, Q, msc, NULL);
++
++		error += test_future(dpy, win, phase, Q);
++		msc = check_msc(dpy, win, Q, msc, NULL);
++
++		error += test_accuracy(dpy, win, phase, Q);
++		msc = check_msc(dpy, win, Q, msc, NULL);
++
++		error += test_modulus(dpy, win, phase, Q);
++		msc = check_msc(dpy, win, Q, msc, NULL);
++
++		error += test_exhaustion(dpy, win, phase, Q);
++		msc = check_msc(dpy, win, Q, msc, NULL);
++
++		teardown_msc(dpy, Q);
++		if (win != root)
++			XDestroyWindow(dpy, win);
++	}
+ 
+ 	error += test_crtc(dpy, queue, last_msc);
+-	last_msc = check_msc(dpy, root, queue, last_msc);
++	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
+ 
+ 	error += test_shm(dpy);
+-	last_msc = check_msc(dpy, root, queue, last_msc);
++	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
+ 
+ 	error += test_dri3(dpy);
+-	last_msc = check_msc(dpy, root, queue, last_msc);
++	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
++
++	error += test_dri3_tiling(dpy);
++	last_msc = check_msc(dpy, root, queue, last_msc, NULL);
+ 
+ 	teardown_msc(dpy, queue);
+ 
++	if (DPMSQueryExtension(dpy, &dummy, &dummy))
++		DPMSEnable(dpy);
+ 	return !!error;
+ }
+diff --git a/test/render-glyphs.c b/test/render-glyphs.c
+new file mode 100644
+index 00000000..8822e36a
+--- /dev/null
++++ b/test/render-glyphs.c
+@@ -0,0 +1,441 @@
++#include <stdint.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <stdbool.h>
++#include <stdarg.h>
++#include <string.h>
++
++#include <X11/Xutil.h> /* for XDestroyImage */
++#include <pixman.h> /* for pixman blt functions */
++
++#include "test.h"
++
++static const XRenderColor colors[] = {
++	/* red, green, blue, alpha */
++	{ 0 }, /* every channel zero, alpha included */
++	{ 0, 0, 0, 0xffff }, /* black, full alpha */
++	{ 0xffff, 0, 0, 0xffff }, /* red, full alpha */
++	{ 0, 0xffff, 0, 0xffff }, /* green, full alpha */
++	{ 0, 0, 0xffff, 0xffff }, /* blue, full alpha */
++	{ 0xffff, 0xffff, 0xffff, 0xffff }, /* white, full alpha */
++};
++
++static struct clip {
++	void *func; /* untyped pointer; the only entry below leaves it NULL */
++} clips[] = {
++	{ NULL }, /* single entry with no func set */
++};
++
++static int _x_error_occurred; /* set to 1 by _check_error_handler() */
++
++static int _check_error_handler(Display *display, XErrorEvent *event)
++{
++	/* Latch the failure; callers reset and re-read the flag around XSync(). */
++	(void)display;
++	(void)event;
++	_x_error_occurred = 1;
++	return False; /* ignored */
++}
++
++static void clear(struct test_display *dpy,
++		  struct test_target *tt,
++		  const XRenderColor *c)
++{
++	XRenderFillRectangle(dpy->dpy, PictOpClear, tt->picture, c, /* PictOpClear over the target picture */
++			     0, 0, tt->width, tt->height); /* rectangle spans the full width x height of tt */
++}
++
++static bool check_op(struct test_display *dpy, int op, struct test_target *tt)
++{
++	XRenderColor probe = {0};
++
++	/*
++	 * Probe a Render operator: sync, reset the error flag, issue a
++	 * zero-sized fill with that operator, sync again and report true
++	 * when no X error was flagged in between.
++	 */
++	XSync(dpy->dpy, True);
++	_x_error_occurred = 0;
++	XRenderFillRectangle(dpy->dpy, op, tt->picture, &probe, 0, 0, 0, 0);
++	XSync(dpy->dpy, True);
++	return !_x_error_occurred;
++}
++
++struct glyph_iter {
++	enum {
++		GLYPHS, OP, DST, SRC, MASK, CLIP,
++	} stage; /* set to GLYPHS by glyph_iter_init() */
++
++	int glyph_format; /* this and the five indices below start at -1 in glyph_iter_init() */
++	int op;
++	int dst_color;
++	int src_color;
++	int mask_format;
++	int clip;
++
++	struct { /* one copy per display: ref and out */
++		struct test_display *dpy;
++		struct test_target tt; /* filled by test_target_create_render() in glyph_iter_init() */
++		GlyphSet glyphset;
++		Picture src;
++		XRenderPictFormat *mask_format;
++	} ref, out;
++};
++
++static void glyph_iter_init(struct glyph_iter *gi, struct test *t,
++			    enum target target)
++{
++	/* Start from a zeroed iterator with every index parked at -1. */
++	memset(gi, 0, sizeof(*gi));
++
++	gi->stage = GLYPHS;
++	gi->glyph_format = gi->op = -1;
++	gi->dst_color = gi->src_color = -1;
++	gi->mask_format = gi->clip = -1;
++
++	/* Create the render target on the "out" display first... */
++	gi->out.dpy = &t->out;
++	test_target_create_render(gi->out.dpy, target, &gi->out.tt);
++
++	/* ...then the matching one on the "ref" display. */
++	gi->ref.dpy = &t->ref;
++	test_target_create_render(gi->ref.dpy, target, &gi->ref.tt);
++}
++
++static void render_clear(char *image, int image_size, int bpp)
++{
++	memset(image, 0, image_size); /* bpp is unused: every byte is zeroed regardless of format */
++}
++
++static void render_black(char *image, int image_size, int bpp) /* glyph 1 image data */
++{
++	if (bpp == 4) {
++		uint32_t *p = (uint32_t *)image;
++		image_size /= 4; /* byte count -> number of 32-bit pixels */
++		while (image_size--)
++			*p++ = 0x000000ff;
++	} else
++		memset(image, 0x55, image_size); /* narrower formats get a fixed byte pattern */
++}
++
++static void render_green(char *image, int image_size, int bpp) /* glyph 2 image data */
++{
++	if (bpp == 4) {
++		uint32_t *p = (uint32_t *)image;
++		image_size /= 4; /* byte count -> number of 32-bit pixels */
++		while (image_size--)
++			*p++ = 0xffff0000;
++	} else
++		memset(image, 0xaa, image_size); /* narrower formats get a fixed byte pattern */
++}
++
++static void render_white(char *image, int image_size, int bpp) /* glyph 3: set every bit; bpp is unused */
++{
++	memset(image, 0xff, image_size);
++}
++
++static GlyphSet create_glyphs(Display *dpy, int format_id) /* build a set of N_GLYPHS 8x8 glyphs; 0 if the format is unusable */
++{
++#define N_GLYPHS 4
++	XRenderPictFormat *format;
++	XGlyphInfo glyph = { 8, 8, 0, 0, 8, 0 }; /* width, height, x, y, xOff, yOff */
++	char image[4*8*8]; /* sized for the largest case: 8x8 pixels at 4 bytes each */
++	GlyphSet glyphset;
++	Glyph gid;
++	int image_size;
++	int bpp;
++	int n;
++
++	format = XRenderFindStandardFormat(dpy, format_id);
++	if (format == NULL)
++		return 0;
++
++	switch (format_id) {
++	case PictStandardARGB32:
++	case PictStandardRGB24:
++		image_size = 4 * 8 * 8;
++		bpp = 4;
++		break;
++	case PictStandardA8:
++	case PictStandardA4:
++		image_size = 8 * 8;
++		bpp = 1;
++		break;
++	case PictStandardA1:
++		image_size = 4 * 8; /* 8 rows, each 1-bit row padded to a 32-bit scanline (was 8: too short) */
++		bpp = 0;
++		break;
++	default:
++		return 0;
++	}
++
++	glyphset = XRenderCreateGlyphSet(dpy, format);
++	for (n = 0; n < N_GLYPHS; n++) {
++		gid = n; /* glyph ids 0..3 match the names returned by glyph_name() */
++
++		switch (n) {
++		case 0: render_clear(image, image_size, bpp); break;
++		case 1: render_black(image, image_size, bpp); break;
++		case 2: render_green(image, image_size, bpp); break;
++		case 3: render_white(image, image_size, bpp); break;
++		}
++
++		XRenderAddGlyphs(dpy, glyphset,
++				 &gid, &glyph, 1, image, image_size);
++	}
++
++	return glyphset;
++}
++
++static const char *glyph_name(int n) /* label for glyph id n, in the order create_glyphs() renders them */
++{
++	switch (n) {
++	case 0: return "clear";
++	case 1: return "black";
++	case 2: return "green";
++	case 3: return "white";
++	default: return "unknown";
++	}
++}
++
++static bool glyph_iter_next(struct glyph_iter *gi) /* step to the next combination; false once all glyph formats are done */
++{
++restart: /* re-entered after a counter ran out and gi->stage was moved one level outwards */
++	if (gi->stage == GLYPHS) {
++		if (++gi->glyph_format == PictStandardNUM)
++			return false;
++
++		if (gi->out.glyphset)
++			XRenderFreeGlyphSet(gi->out.dpy->dpy,
++					    gi->out.glyphset);
++		gi->out.glyphset = create_glyphs(gi->out.dpy->dpy,
++					       gi->glyph_format);
++
++		if (gi->ref.glyphset)
++			XRenderFreeGlyphSet(gi->ref.dpy->dpy,
++					    gi->ref.glyphset);
++		gi->ref.glyphset = create_glyphs(gi->ref.dpy->dpy,
++					       gi->glyph_format);
++
++		gi->stage++;
++	}
++
++	if (gi->stage == OP) {
++		do { /* skip operator numbers that either display rejects */
++			if (++gi->op == 255)
++				goto reset_op;
++		} while (!check_op(gi->out.dpy, gi->op, &gi->out.tt) ||
++			 !check_op(gi->ref.dpy, gi->op, &gi->ref.tt));
++
++		gi->stage++;
++	}
++
++	if (gi->stage == DST) {
++		if (++gi->dst_color == ARRAY_SIZE(colors))
++			goto reset_dst;
++
++		gi->stage++;
++	}
++
++	if (gi->stage == SRC) {
++		if (++gi->src_color == ARRAY_SIZE(colors))
++			goto reset_src;
++
++		if (gi->ref.src) /* replace the previous solid-fill source on each display */
++			XRenderFreePicture(gi->ref.dpy->dpy, gi->ref.src);
++		gi->ref.src = XRenderCreateSolidFill(gi->ref.dpy->dpy,
++						     &colors[gi->src_color]);
++
++		if (gi->out.src)
++			XRenderFreePicture(gi->out.dpy->dpy, gi->out.src);
++		gi->out.src = XRenderCreateSolidFill(gi->out.dpy->dpy,
++						     &colors[gi->src_color]);
++
++		gi->stage++;
++	}
++
++	if (gi->stage == MASK) {
++		if (++gi->mask_format > PictStandardNUM) /* '>' not '==': the value PictStandardNUM itself means "no mask format" */
++			goto reset_mask;
++
++		if (gi->mask_format == PictStandardRGB24) /* RGB24 is never used as a mask format */
++			gi->mask_format++;
++
++		if (gi->mask_format < PictStandardNUM) {
++			gi->out.mask_format = XRenderFindStandardFormat(gi->out.dpy->dpy,
++									gi->mask_format);
++			gi->ref.mask_format = XRenderFindStandardFormat(gi->ref.dpy->dpy,
++									gi->mask_format);
++		} else {
++			gi->out.mask_format = NULL;
++			gi->ref.mask_format = NULL;
++		}
++
++		gi->stage++;
++	}
++
++	if (gi->stage == CLIP) {
++		if (++gi->clip == ARRAY_SIZE(clips))
++			goto reset_clip;
++
++		gi->stage++;
++	}
++
++	gi->stage--; /* undo the last increment so the next call resumes at CLIP */
++	return true;
++
++reset_op: /* each label rewinds its own counter and falls through to rewind all inner ones */
++	gi->op = -1;
++reset_dst:
++	gi->dst_color = -1;
++reset_src:
++	gi->src_color = -1;
++reset_mask:
++	gi->mask_format = -1;
++reset_clip:
++	gi->clip = -1;
++	gi->stage--; /* hand control to the next outer level, then retry */
++	goto restart;
++}
++
++static void glyph_iter_fini(struct glyph_iter *gi) /* release everything glyph_iter_init()/_next() created */
++{
++	if (gi->out.src) XRenderFreePicture(gi->out.dpy->dpy, gi->out.src); /* solid fills were leaked before */
++	if (gi->ref.src) XRenderFreePicture(gi->ref.dpy->dpy, gi->ref.src);
++	if (gi->out.glyphset) XRenderFreeGlyphSet(gi->out.dpy->dpy, gi->out.glyphset);
++	if (gi->ref.glyphset) XRenderFreeGlyphSet(gi->ref.dpy->dpy, gi->ref.glyphset);
++
++	test_target_destroy_render(gi->out.dpy, &gi->out.tt);
++	test_target_destroy_render(gi->ref.dpy, &gi->ref.tt);
++}
++
++static const char *stdformat_to_str(int id) /* printable name of a PictStandard* id; "none" for anything else */
++{
++	switch (id) {
++	case PictStandardARGB32: return "ARGB32";
++	case PictStandardRGB24: return "RGB24";
++	case PictStandardA8: return "A8";
++	case PictStandardA4: return "A4";
++	case PictStandardA1: return "A1";
++	default: return "none"; /* also reached for PictStandardNUM, the iterator's "no mask" value */
++	}
++}
++
++static char *glyph_iter_to_string(struct glyph_iter *gi, /* describe the current combination; returns a static buffer */
++				  const char *format, /* optional printf-style suffix, may be NULL */
++				  ...)
++{
++	static char buf[100]; /* overwritten by every call */
++	va_list ap;
++	int len;
++
++	len = snprintf(buf, sizeof(buf), "glyphs=%s, op=%d, dst=%08x, src=%08x, mask=%s",
++		      stdformat_to_str(gi->glyph_format), gi->op,
++		      xrender_color(&colors[gi->dst_color]),
++		      xrender_color(&colors[gi->src_color]),
++		      stdformat_to_str(gi->mask_format));
++
++	if (format && len >= 0 && len < (int)sizeof(buf) - 1) { /* append only while room remains */
++		buf[len++] = ' ';
++		va_start(ap, format);
++		vsnprintf(buf+len, sizeof(buf) - len, format, ap); /* bounded: truncates instead of overflowing */
++		va_end(ap);
++	}
++
++	return buf;
++}
++
++static void single(struct test *t, enum target target) /* composite each glyph alone, then all together, on both displays */
++{
++	struct glyph_iter gi;
++	int n;
++
++	printf("Testing single glyph (%s): ", test_target_name(target));
++	fflush(stdout);
++
++	glyph_iter_init(&gi, t, target);
++	while (glyph_iter_next(&gi)) {
++		XGlyphElt8 elt;
++		char id[N_GLYPHS];
++
++		for (n = 0; n < N_GLYPHS; n++) { /* one glyph per request */
++			id[n] = n;
++
++			elt.chars = &id[n];
++			elt.nchars = 1;
++			elt.xOff = 0;
++			elt.yOff = 0;
++
++			clear(gi.out.dpy, &gi.out.tt, &colors[gi.dst_color]);
++			elt.glyphset = gi.out.glyphset;
++			XRenderCompositeText8 (gi.out.dpy->dpy, gi.op,
++					       gi.out.src,
++					       gi.out.tt.picture,
++					       gi.out.mask_format,
++					       0, 0,
++					       0, 8,
++					       &elt, 1);
++
++			clear(gi.ref.dpy, &gi.ref.tt, &colors[gi.dst_color]);
++			elt.glyphset = gi.ref.glyphset;
++			XRenderCompositeText8 (gi.ref.dpy->dpy, gi.op,
++					       gi.ref.src,
++					       gi.ref.tt.picture,
++					       gi.ref.mask_format,
++					       0, 0,
++					       0, 8,
++					       &elt, 1);
++			test_compare(t,
++				     gi.out.tt.draw, gi.out.tt.format,
++				     gi.ref.tt.draw, gi.ref.tt.format,
++				     0, 0, gi.out.tt.width, gi.out.tt.height,
++				     glyph_iter_to_string(&gi,
++							  "glyph=%s",
++							  glyph_name(n)));
++		}
++		elt.chars = &id[0]; /* now all N_GLYPHS glyphs in a single element */
++		elt.nchars = n;
++		clear(gi.out.dpy, &gi.out.tt, &colors[gi.dst_color]);
++		elt.glyphset = gi.out.glyphset;
++		XRenderCompositeText8 (gi.out.dpy->dpy, gi.op,
++				       gi.out.src,
++				       gi.out.tt.picture,
++				       gi.out.mask_format,
++				       0, 0,
++				       0, 8,
++				       &elt, 1);
++
++		clear(gi.ref.dpy, &gi.ref.tt, &colors[gi.dst_color]);
++		elt.glyphset = gi.ref.glyphset;
++		XRenderCompositeText8 (gi.ref.dpy->dpy, gi.op,
++				       gi.ref.src,
++				       gi.ref.tt.picture,
++				       gi.ref.mask_format,
++				       0, 0,
++				       0, 8,
++				       &elt, 1);
++		test_compare(t,
++			     gi.out.tt.draw, gi.out.tt.format,
++			     gi.ref.tt.draw, gi.ref.tt.format,
++			     0, 0, gi.out.tt.width, gi.out.tt.height,
++			     glyph_iter_to_string(&gi, "all"));
++	}
++	glyph_iter_fini(&gi);
++	printf("pass\n"); /* terminate the "Testing ..." line like the other render tests do */
++}
++
++int main(int argc, char **argv) /* run the glyph test against every target type */
++{
++	struct test test;
++	int t;
++
++	test_init(&test, argc, argv);
++	XSetErrorHandler(_check_error_handler); /* X errors are recorded for check_op() */
++
++	for (t = TARGET_FIRST; t <= TARGET_LAST; t++) {
++		single(&test, t);
++		//overlapping(&test, t);
++		//gap(&test, t);
++		//mixed(&test, t);
++	}
++
++	return 0;
++}
+diff --git a/test/render-trapezoid.c b/test/render-trapezoid.c
+index cd990143..f15a78e3 100644
+--- a/test/render-trapezoid.c
++++ b/test/render-trapezoid.c
+@@ -403,16 +403,141 @@ static void trap_tests(struct test *t,
+ 	free(traps);
+ }
+ 
++enum edge { /* value stored in XRenderPictureAttributes.poly_edge by set_edge() */
++	EDGE_SHARP = PolyEdgeSharp,
++	EDGE_SMOOTH, /* NOTE(review): relies on PolyEdgeSmooth == PolyEdgeSharp + 1 — confirm against render.h */
++};
++
++static const char *edge_name(enum edge edge) /* printable name for progress messages */
++{
++	switch (edge) {
++	default: /* unknown values are reported as "sharp" */
++	case EDGE_SHARP: return "sharp";
++	case EDGE_SMOOTH: return "smooth";
++	}
++}
++
++static void set_edge(Display *dpy, Picture p, enum edge edge) /* select the polygon edge mode of picture p */
++{
++	XRenderPictureAttributes a; /* only poly_edge is initialised; CPPolyEdge is the only mask bit passed */
++
++	a.poly_edge = edge;
++	XRenderChangePicture(dpy, p, CPPolyEdge, &a);
++}
++
++static void edge_test(struct test *t, /* compare full-height trapezoids whose slanted edge sweeps across the target */
++		      enum mask mask,
++		      enum edge edge,
++		      enum target target)
++{
++	struct test_target out, ref;
++	XRenderColor white = { 0xffff, 0xffff, 0xffff, 0xffff };
++	Picture src_ref, src_out;
++	XTrapezoid trap;
++	int left_or_right, p;
++
++	test_target_create_render(&t->out, target, &out);
++	set_edge(t->out.dpy, out.picture, edge);
++	src_out = XRenderCreateSolidFill(t->out.dpy, &white);
++
++	test_target_create_render(&t->ref, target, &ref);
++	set_edge(t->ref.dpy, ref.picture, edge);
++	src_ref = XRenderCreateSolidFill(t->ref.dpy, &white);
++
++	printf("Testing edges (with mask %s and %s edges) (%s): ",
++	       mask_name(mask),
++	       edge_name(edge),
++	       test_target_name(target));
++	fflush(stdout);
++
++	for (left_or_right = 0; left_or_right <= 1; left_or_right++) {
++		for (p = -64; p <= out.width + 64; p++) { /* p starts and ends 64 pixels outside the target */
++			char buf[80];
++
++			if (left_or_right) { /* vertical left edge, right edge slanted from (p, 0) */
++				trap.left.p1.x = 0;
++				trap.left.p1.y = 0;
++				trap.left.p2.x = 0;
++				trap.left.p2.y = out.height << 16;
++
++				trap.right.p1.x = p * 65536; /* p may be negative: << would be undefined */
++				trap.right.p1.y = 0;
++				trap.right.p2.x = out.width << 16;
++				trap.right.p2.y = out.height << 16;
++			} else { /* vertical right edge, left edge slanted towards (p, height) */
++				trap.right.p1.x = out.width << 16;
++				trap.right.p1.y = 0;
++				trap.right.p2.x = out.width << 16;
++				trap.right.p2.y = out.height << 16;
++
++				trap.left.p1.x = 0;
++				trap.left.p1.y = 0;
++				trap.left.p2.x = p * 65536; /* p may be negative: << would be undefined */
++				trap.left.p2.y = out.height << 16;
++			}
++
++			trap.top = 0;
++			trap.bottom = out.height << 16;
++
++			snprintf(buf, sizeof(buf), /* bounded: buf is handed to test_compare() as the failure description */
++				"trap=((%d, %d), (%d, %d)), ((%d, %d), (%d, %d))\n",
++				trap.left.p1.x >> 16, trap.left.p1.y >> 16,
++				trap.left.p2.x >> 16, trap.left.p2.y >> 16,
++				trap.right.p1.x >> 16, trap.right.p1.y >> 16,
++				trap.right.p2.x >> 16, trap.right.p2.y >> 16);
++
++			clear(&t->out, &out);
++			XRenderCompositeTrapezoids(t->out.dpy,
++						   PictOpSrc,
++						   src_out,
++						   out.picture,
++						   mask_format(t->out.dpy, mask),
++						   0, 0,
++						   &trap, 1);
++
++			clear(&t->ref, &ref);
++			XRenderCompositeTrapezoids(t->ref.dpy,
++						   PictOpSrc,
++						   src_ref,
++						   ref.picture,
++						   mask_format(t->ref.dpy, mask),
++						   0, 0,
++						   &trap, 1);
++
++			test_compare(t,
++				     out.draw, out.format,
++				     ref.draw, ref.format,
++				     0, 0, out.width, out.height,
++				     buf);
++		}
++	}
++
++	XRenderFreePicture(t->out.dpy, src_out);
++	test_target_destroy_render(&t->out, &out);
++
++	XRenderFreePicture(t->ref.dpy, src_ref);
++	test_target_destroy_render(&t->ref, &ref);
++
++	printf("pass\n");
++}
++
+ int main(int argc, char **argv)
+ {
+ 	struct test test;
+ 	int i, dx, dy;
+ 	enum target target;
+ 	enum mask mask;
++	enum edge edge;
+ 	enum trapezoid trapezoid;
+ 
+ 	test_init(&test, argc, argv);
+ 
++	for (target = TARGET_FIRST; target <= TARGET_LAST; target++) {
++		for (mask = MASK_NONE; mask <= MASK_A8; mask++)
++			for (edge = EDGE_SHARP; edge <= EDGE_SMOOTH; edge++)
++				edge_test(&test, mask, edge, target);
++	}
++
+ 	for (i = 0; i <= DEFAULT_ITERATIONS; i++) {
+ 		int reps = REPS(i), sets = SETS(i);
+ 
+diff --git a/test/render-triangle.c b/test/render-triangle.c
+new file mode 100644
+index 00000000..165834ce
+--- /dev/null
++++ b/test/render-triangle.c
+@@ -0,0 +1,180 @@
++#include <stdint.h>
++#include <stdio.h>
++#include <stdlib.h>
++
++#include "test.h"
++
++enum edge { /* value stored in XRenderPictureAttributes.poly_edge by set_edge() */
++	EDGE_SHARP = PolyEdgeSharp,
++	EDGE_SMOOTH, /* NOTE(review): relies on PolyEdgeSmooth == PolyEdgeSharp + 1 — confirm against render.h */
++};
++
++static void set_edge(Display *dpy, Picture p, enum edge edge) /* select the polygon edge mode of picture p */
++{
++	XRenderPictureAttributes a; /* only poly_edge is initialised; CPPolyEdge is the only mask bit passed */
++
++	a.poly_edge = edge;
++	XRenderChangePicture(dpy, p, CPPolyEdge, &a);
++}
++
++static XRenderPictFormat *mask_format(Display *dpy, enum mask mask) /* standard format for mask, or NULL for none */
++{
++	switch (mask) {
++	default: /* unknown values behave like MASK_NONE */
++	case MASK_NONE: return NULL;
++	case MASK_A1: return XRenderFindStandardFormat(dpy, PictStandardA1);
++	case MASK_A8: return XRenderFindStandardFormat(dpy, PictStandardA8);
++	}
++}
++
++static const char *mask_name(enum mask mask) /* printable name for progress messages */
++{
++	switch (mask) {
++	default: /* unknown values are reported as "none" */
++	case MASK_NONE: return "none";
++	case MASK_A1: return "a1";
++	case MASK_A8: return "a8";
++	}
++}
++
++static const char *edge_name(enum edge edge) /* printable name for progress messages */
++{
++	switch (edge) {
++	default: /* unknown values are reported as "sharp" */
++	case EDGE_SHARP: return "sharp";
++	case EDGE_SMOOTH: return "smooth";
++	}
++}
++
++static void clear(struct test_display *dpy, struct test_target *tt) /* apply PictOpClear to the whole target picture */
++{
++	XRenderColor render_color = {0};
++	XRenderFillRectangle(dpy->dpy, PictOpClear, tt->picture, &render_color,
++			     0, 0, tt->width, tt->height);
++}
++
++static void step_to_point(int step, int width, int height, XPointFixed *p) /* map a distance along the enlarged perimeter to a 16.16 point */
++{
++	do { /* path: the target rectangle grown by 64 pixels on every side; the loop wraps step around it */
++		p->x = (step - 64) * 65536; /* top edge; multiply because << on a negative int is undefined */
++		p->y = -64 * 65536;
++
++		step -= width + 128; /* horizontal edges are width + 128 long, as the caller's offsets assume */
++		if (step <= 0)
++			return;
++
++		p->x = (width + 64) * 65536; /* right edge, moving down */
++		p->y = (step - 64) * 65536;
++		step -= height + 128; /* vertical edges are height + 128 long */
++
++		if (step <= 0)
++			return;
++
++		p->x = (width + 64 - step) * 65536; /* bottom edge, moving left */
++		p->y = (height + 64) * 65536;
++		step -= width + 128;
++
++		if (step <= 0)
++			return;
++
++		p->x = -64 * 65536; /* left edge, moving up */
++		p->y = (height + 64 - step) * 65536;
++		step -= height + 128;
++	} while (step > 0);
++}
++
++static void edge_test(struct test *t, /* compare triangles whose vertices are produced by step_to_point() */
++		      enum mask mask,
++		      enum edge edge,
++		      enum target target)
++{
++	struct test_target out, ref;
++	XRenderColor white = { 0xffff, 0xffff, 0xffff, 0xffff };
++	Picture src_ref, src_out;
++	XTriangle tri;
++	unsigned step, max;
++
++	test_target_create_render(&t->out, target, &out);
++	set_edge(t->out.dpy, out.picture, edge);
++	src_out = XRenderCreateSolidFill(t->out.dpy, &white);
++
++	test_target_create_render(&t->ref, target, &ref);
++	set_edge(t->ref.dpy, ref.picture, edge);
++	src_ref = XRenderCreateSolidFill(t->ref.dpy, &white);
++
++	printf("Testing edges (with mask %s and %s edges) (%s): ",
++	       mask_name(mask),
++	       edge_name(edge),
++	       test_target_name(target));
++	fflush(stdout);
++
++	max = 2*(out.width + 128 + out.height+128);
++	/* tri.p2 and tri.p3 use the same step plus a fixed offset each */
++	for (step = 0; step <= max; step++) {
++		char buf[80];
++
++		step_to_point(step, out.width, out.height, &tri.p1);
++		step_to_point(step + out.width + 128,
++			      out.width, out.height,
++			      &tri.p2);
++		step_to_point(step + out.height + 128 + 2*(out.width + 128),
++			      out.width, out.height,
++			      &tri.p3);
++
++		snprintf(buf, sizeof(buf), /* bounded: buf is handed to test_compare() as the failure description */
++			"tri=((%d, %d), (%d, %d), (%d, %d))\n",
++			tri.p1.x >> 16, tri.p1.y >> 16,
++			tri.p2.x >> 16, tri.p2.y >> 16,
++			tri.p3.x >> 16, tri.p3.y >> 16);
++
++		clear(&t->out, &out);
++		XRenderCompositeTriangles(t->out.dpy,
++					  PictOpSrc,
++					  src_out,
++					  out.picture,
++					  mask_format(t->out.dpy, mask),
++					  0, 0,
++					  &tri, 1);
++
++		clear(&t->ref, &ref);
++		XRenderCompositeTriangles(t->ref.dpy,
++					  PictOpSrc,
++					  src_ref,
++					  ref.picture,
++					  mask_format(t->ref.dpy, mask),
++					  0, 0,
++					  &tri, 1);
++
++		test_compare(t,
++			     out.draw, out.format,
++			     ref.draw, ref.format,
++			     0, 0, out.width, out.height,
++			     buf);
++	}
++
++	XRenderFreePicture(t->out.dpy, src_out);
++	test_target_destroy_render(&t->out, &out);
++
++	XRenderFreePicture(t->ref.dpy, src_ref);
++	test_target_destroy_render(&t->ref, &ref);
++
++	printf("pass\n");
++}
++
++int main(int argc, char **argv) /* run edge_test() for every target, mask and edge mode */
++{
++	struct test test;
++	enum target target;
++	enum mask mask;
++	enum edge edge;
++
++	test_init(&test, argc, argv);
++
++	for (target = TARGET_FIRST; target <= TARGET_LAST; target++) {
++		for (mask = MASK_NONE; mask <= MASK_A8; mask++) /* NOTE(review): assumes MASK_NONE..MASK_A8 are contiguous — confirm in test.h */
++			for (edge = EDGE_SHARP; edge <= EDGE_SMOOTH; edge++)
++				edge_test(&test, mask, edge, target);
++	}
++
++	return 0;
++}
+diff --git a/test/test.h b/test/test.h
+index a3ef979d..9eec1cf9 100644
+--- a/test/test.h
++++ b/test/test.h
+@@ -107,6 +107,15 @@ static inline uint32_t color(uint8_t red, uint8_t green, uint8_t blue, uint8_t a
+ 	return alpha << 24 | ra >> 8 << 16 | ga >> 8 << 8 | ba >> 8;
+ }
+ 
++static inline uint32_t xrender_color(const XRenderColor *c) /* premultiply by alpha and pack as 8-bit A,R,G,B */
++{
++	uint32_t ra = (uint32_t)c->red * c->alpha; /* cast first: unsigned short promotes to signed int, and 0xffff * 0xffff overflows it */
++	uint32_t ga = (uint32_t)c->green * c->alpha;
++	uint32_t ba = (uint32_t)c->blue * c->alpha;
++
++	return (uint32_t)(c->alpha >> 8) << 24 | ra >> 24 << 16 | ga >> 24 << 8 | ba >> 24; /* top 8 bits of each 32-bit product */
++}
++
+ void test_timer_start(struct test_display *t, struct timespec *tv);
+ double test_timer_stop(struct test_display *t, struct timespec *tv);
+ 
+diff --git a/test/test_image.c b/test/test_image.c
+index d15a8af8..1c076990 100644
+--- a/test/test_image.c
++++ b/test/test_image.c
+@@ -197,13 +197,10 @@ void test_compare(struct test *t,
+ 		  const char *info)
+ {
+ 	XImage out_image, ref_image;
+-	Pixmap tmp;
+-	char *out, *ref;
++	uint32_t *out, *ref;
+ 	char buf[600];
+ 	uint32_t mask;
+ 	int i, j;
+-	XGCValues gcv;
+-	GC gc;
+ 
+ 	if (w * h * 4 > t->out.max_shm_size)
+ 		return test_compare_fallback(t,
+@@ -214,37 +211,24 @@ void test_compare(struct test *t,
+ 	test_init_image(&out_image, &t->out.shm, out_format, w, h);
+ 	test_init_image(&ref_image, &t->ref.shm, ref_format, w, h);
+ 
+-	gcv.graphics_exposures = 0;
+-
+ 	die_unless(out_image.depth == ref_image.depth);
+ 	die_unless(out_image.bits_per_pixel == ref_image.bits_per_pixel);
+ 	die_unless(out_image.bits_per_pixel == 32);
+ 
+-	mask = depth_mask(out_image.depth);
++	XShmGetImage(t->out.dpy, out_draw, &out_image, x, y, AllPlanes);
++	out = (uint32_t *)out_image.data;
+ 
+-	tmp = XCreatePixmap(t->out.dpy, out_draw, w, h, out_image.depth);
+-	gc = XCreateGC(t->out.dpy, tmp, GCGraphicsExposures, &gcv);
+-	XCopyArea(t->out.dpy, out_draw, tmp, gc, x, y, w, h, 0, 0);
+-	XShmGetImage(t->out.dpy, tmp, &out_image, 0, 0, AllPlanes);
+-	XFreeGC(t->out.dpy, gc);
+-	XFreePixmap(t->out.dpy, tmp);
+-	out = out_image.data;
+-
+-	tmp = XCreatePixmap(t->ref.dpy, ref_draw, w, h, ref_image.depth);
+-	gc = XCreateGC(t->ref.dpy, tmp, GCGraphicsExposures, &gcv);
+-	XCopyArea(t->ref.dpy, ref_draw, tmp, gc, x, y, w, h, 0, 0);
+-	XShmGetImage(t->ref.dpy, tmp, &ref_image, 0, 0, AllPlanes);
+-	XFreeGC(t->ref.dpy, gc);
+-	XFreePixmap(t->ref.dpy, tmp);
+-	ref = ref_image.data;
++	XShmGetImage(t->ref.dpy, ref_draw, &ref_image, x, y, AllPlanes);
++	ref = (uint32_t *)ref_image.data;
+ 
+ 	/* Start with an exact comparison. However, one quicky desires
+ 	 * a fuzzy comparator to hide hardware inaccuracies...
+ 	 */
++	mask = depth_mask(out_image.depth);
+ 	for (j = 0; j < h; j++) {
+ 		for (i = 0; i < w; i++) {
+-			uint32_t a = ((uint32_t *)out)[i] & mask;
+-			uint32_t b = ((uint32_t *)ref)[i] & mask;
++			uint32_t a = out[i] & mask;
++			uint32_t b = ref[i] & mask;
+ 			if (a != b && pixel_difference(a, b) > MAX_DELTA) {
+ 				show_pixels(buf,
+ 					    &out_image, &ref_image,
+@@ -255,8 +239,8 @@ void test_compare(struct test *t,
+ 				    x,i, y,j, a, b, pixel_difference(a, b), buf, info);
+ 			}
+ 		}
+-		out += out_image.bytes_per_line;
+-		ref += ref_image.bytes_per_line;
++		out = (uint32_t *)((char *)out + out_image.bytes_per_line);
++		ref = (uint32_t *)((char *)ref + ref_image.bytes_per_line);
+ 	}
+ }
+ 
+diff --git a/test/xvidmode.c b/test/xvidmode.c
+new file mode 100644
+index 00000000..5cde8286
+--- /dev/null
++++ b/test/xvidmode.c
+@@ -0,0 +1,54 @@
++#include <stdlib.h>
++#include <stdio.h>
++#include <string.h>
++#include <X11/Xlib.h>
++#include <X11/extensions/xf86vmode.h>
++
++int main(void) /* list the VidMode modes, switch through each, then restore the original; 1 on setup failure */
++{
++	Display *dpy;
++	XF86VidModeModeLine current;
++	XF86VidModeModeInfo **modes = NULL;
++	int num_modes = 0, i;
++	int saved_mode = -1; /* index of the first mode matching the current mode line */
++	int dotclock;
++
++	dpy = XOpenDisplay(NULL);
++	if (dpy == NULL && (dpy = XOpenDisplay(":0")) == NULL)
++		return 1; /* no X server reachable: dpy must not be dereferenced */
++
++	if (!XF86VidModeGetModeLine(dpy, DefaultScreen(dpy), &dotclock, &current) ||
++	    !XF86VidModeGetAllModeLines(dpy, XDefaultScreen(dpy), &num_modes, &modes))
++		return 1; /* query failed: current/modes would be uninitialised */
++	for (i = 0; i < num_modes; i++) {
++		int this;
++
++		this = (current.hdisplay == modes[i]->hdisplay &&
++			current.vdisplay == modes[i]->vdisplay &&
++			dotclock == modes[i]->dotclock);
++		if (this && saved_mode == -1)
++			saved_mode = i;
++
++		printf("[%d] %dx%d%s\n",
++		       i,
++		       modes[i]->hdisplay,
++		       modes[i]->vdisplay,
++		       this ? "*" : "");
++	}
++
++	for (i = 0; i < num_modes; i++) {
++		printf("Switching to mode %dx%d\n",
++		       modes[i]->hdisplay,
++		       modes[i]->vdisplay);
++		XF86VidModeSwitchToMode(dpy, XDefaultScreen(dpy), modes[i]);
++		XSync(dpy, True);
++	}
++
++	if (saved_mode != -1) {
++		XF86VidModeSwitchToMode(dpy, XDefaultScreen(dpy),
++					modes[saved_mode]);
++		XFlush(dpy);
++	}
++
++	return 0;
++}
+diff --git a/tools/Makefile.am b/tools/Makefile.am
+index b5de2c96..92df266b 100644
+--- a/tools/Makefile.am
++++ b/tools/Makefile.am
+@@ -26,13 +26,30 @@ AM_CFLAGS = \
+ drivermandir = $(DRIVER_MAN_DIR)
+ policydir = $(datarootdir)/polkit-1/actions
+ 
++bin_PROGRAMS =
++noinst_PROGRAMS =
++libexec_PROGRAMS =
++
+ if BUILD_TOOLS
+-bin_PROGRAMS = intel-virtual-output
++bin_PROGRAMS += intel-virtual-output
+ driverman_DATA = intel-virtual-output.$(DRIVER_MAN_SUFFIX)
+ endif
+ 
++if BUILD_TOOL_CURSOR
++noinst_PROGRAMS += cursor
++cursor_CFLAGS = $(TOOL_CURSOR_CFLAGS)
++cursor_LDADD = $(TOOL_CURSOR_LIBS)
++endif
++
++if X11_DRI3
++noinst_PROGRAMS += dri3info
++dri3info_SOURCES = dri3info.c
++dri3info_CFLAGS = $(X11_DRI3_CFLAGS) $(DRI_CFLAGS)
++dri3info_LDADD = $(X11_DRI3_LIBS) $(DRI_LIBS)
++endif
++
+ if BUILD_BACKLIGHT_HELPER
+-libexec_PROGRAMS = xf86-video-intel-backlight-helper
++libexec_PROGRAMS += xf86-video-intel-backlight-helper
+ nodist_policy_DATA = org.x.xf86-video-intel.backlight-helper.policy
+ 
+ backlight_helper = $(libexecdir)/xf86-video-intel-backlight-helper
+diff --git a/tools/backlight_helper.c b/tools/backlight_helper.c
+index 8b2667dc..aadb8fac 100644
+--- a/tools/backlight_helper.c
++++ b/tools/backlight_helper.c
+@@ -1,3 +1,7 @@
++#ifdef HAVE_CONFIG_H
++#include "config.h"
++#endif
++
+ #include <stdio.h>
+ #include <string.h>
+ #include <stdarg.h>
+@@ -9,6 +13,12 @@
+ #include <sys/types.h>
+ #include <sys/stat.h>
+ 
++#if MAJOR_IN_MKDEV
++#include <sys/mkdev.h>
++#elif MAJOR_IN_SYSMACROS
++#include <sys/sysmacros.h>
++#endif
++
+ #define DBG 0
+ 
+ #if defined(__GNUC__) && (__GNUC__ > 3)
+diff --git a/tools/cursor.c b/tools/cursor.c
+new file mode 100644
+index 00000000..6a2438ad
+--- /dev/null
++++ b/tools/cursor.c
+@@ -0,0 +1,127 @@
++/*
++ * Copyright © 2015 Intel Corporation
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ *
++ */
++
++#ifdef HAVE_CONFIG_H
++#include "config.h"
++#endif
++
++#include <X11/Xlib.h>
++#include <X11/extensions/Xfixes.h>
++
++#include <stdint.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <png.h>
++
++int main(int argc, char **argv) /* print the current cursor as ASCII art and save it to cursor.png */
++{
++	Display *dpy;
++	XFixesCursorImage *cur;
++	unsigned long *src; /* XXX deep sigh */
++	unsigned x, y;
++	png_struct *png;
++	png_info *info;
++	png_byte **rows;
++	FILE *file;
++
++	dpy = XOpenDisplay(NULL);
++	if (dpy == NULL)
++		return 1;
++
++	if (!XFixesQueryExtension(dpy, (int *)&x, (int *)&y)) /* x and y only receive the unused event/error bases */
++		return 1;
++
++	cur = XFixesGetCursorImage(dpy);
++	if (cur == NULL)
++		return 1;
++
++	printf("Cursor on display '%s': %dx%d, (hotspot %dx%d)\n",
++	       DisplayString(dpy),
++	       cur->width, cur->height,
++	       cur->xhot, cur->yhot);
++
++	if (1) {
++		int x, y;
++
++		src = cur->pixels;
++		for (y = 0; y < cur->height; y++) {
++			for (x = 0; x < cur->width; x++) {
++				if (x == cur->xhot && y == cur->yhot)
++					printf("+");
++				else
++					printf("%c", *src ? *src >> 24 >= 127 ? 'x' : '.' : ' ');
++				src++;
++			}
++			printf("\n");
++		}
++	}
++
++	file = fopen("cursor.png", "wb");
++	if (file == NULL)
++		return 2;
++
++	png = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
++	info = png ? png_create_info_struct(png) : NULL;
++	rows = malloc(cur->height*sizeof(png_byte*));
++	if (png == NULL || info == NULL || rows == NULL) /* check every allocation before libpng uses it */
++		return 3;
++
++	png_init_io(png, file);
++	png_set_IHDR(png, info,
++		     cur->width, cur->height, 8,
++		     PNG_COLOR_TYPE_RGB_ALPHA,
++		     PNG_INTERLACE_NONE,
++		     PNG_COMPRESSION_TYPE_DEFAULT,
++		     PNG_FILTER_TYPE_DEFAULT);
++	png_write_info(png, info);
++
++	src = cur->pixels;
++	for (y = 0; y < cur->height; y++) {
++		rows[y] = malloc(cur->width * 4);
++		if (rows[y] == NULL) /* the row is written to immediately below */
++			return 3;
++		for (x = 0; x < cur->width; x++) {
++			uint32_t p = *src++;
++			uint8_t r = p >> 0;
++			uint8_t g = p >> 8;
++			uint8_t b = p >> 16;
++			uint8_t a = p >> 24;
++			if (a > 0x00 && a < 0xff) { /* undo alpha premultiplication, rounding to nearest */
++				r = (r * 0xff + a /2) / a;
++				g = (g * 0xff + a /2) / a;
++				b = (b * 0xff + a /2) / a;
++			}
++			rows[y][4*x + 0] = b; /* NOTE(review): 'b' holds bits 16-23 of p — presumably red in ARGB, confirm */
++			rows[y][4*x + 1] = g;
++			rows[y][4*x + 2] = r;
++			rows[y][4*x + 3] = a;
++		}
++	}
++
++	png_write_image(png, rows);
++	png_write_end(png, NULL);
++	fclose(file);
++
++	return 0;
++}
+diff --git a/tools/dri3info.c b/tools/dri3info.c
+new file mode 100644
+index 00000000..0c33fc5a
+--- /dev/null
++++ b/tools/dri3info.c
+@@ -0,0 +1,329 @@
++/*
++ * Copyright (c) 2015 Intel Corporation
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ *
++ * To compile standalone: gcc -o dri3info dri3info.c `pkg-config --cflags --libs xcb-dri3 x11-xcb xrandr xxf86vm libdrm`
++ */
++
++#include <X11/Xlib.h>
++#include <X11/Xlib-xcb.h>
++#include <xcb/xcb.h>
++#include <xcb/dri3.h>
++#include <unistd.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <stdint.h>
++#include <string.h>
++#include <sys/stat.h>
++#include <drm.h>
++#include <xf86drm.h>
++
++#include <X11/extensions/Xrandr.h>
++#include <X11/extensions/xf86vmode.h>
++
++static int dri3_query_version(Display *dpy, int *major, int *minor) /* 0 on success; -1 with both outputs left at -1 if no reply */
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	xcb_dri3_query_version_reply_t *reply;
++	xcb_generic_error_t *error;
++
++	*major = *minor = -1;
++
++	reply = xcb_dri3_query_version_reply(c,
++					     xcb_dri3_query_version(c,
++								    XCB_DRI3_MAJOR_VERSION,
++								    XCB_DRI3_MINOR_VERSION),
++					     &error);
++	free(error); /* the error details are not inspected, only released */
++	if (reply == NULL)
++		return -1;
++
++	*major = reply->major_version;
++	*minor = reply->minor_version;
++	free(reply);
++
++	return 0;
++}
++
++static int dri3_exists(Display *dpy) /* non-zero if the DRI3 extension is present and answers a version query */
++{
++	const xcb_query_extension_reply_t *ext;
++	int major, minor;
++
++	ext = xcb_get_extension_data(XGetXCBConnection(dpy), &xcb_dri3_id);
++	if (ext == NULL || !ext->present)
++		return 0;
++
++	if (dri3_query_version(dpy, &major, &minor) < 0)
++		return 0;
++
++	return major >= 0; /* minor is fetched but not examined */
++}
++
++static int dri3_open(Display *dpy) /* DRI3Open on the root window; returns the device fd or -1 */
++{
++	xcb_connection_t *c = XGetXCBConnection(dpy);
++	xcb_dri3_open_cookie_t cookie;
++	xcb_dri3_open_reply_t *reply;
++	int fd = -1;
++
++	if (!dri3_exists(dpy))
++		return -1;
++
++	cookie = xcb_dri3_open(c, RootWindow(dpy, DefaultScreen(dpy)), None);
++	reply = xcb_dri3_open_reply(c, cookie, NULL);
++	if (!reply)
++		return -1;
++
++	if (reply->nfd == 1) /* read the fd before the reply that carries it is released */
++		fd = xcb_dri3_open_reply_fds(c, reply)[0];
++	free(reply); /* the reply was leaked on every path before */
++	return fd;
++}
++
++static void get_device_path(int fd, char *buf, int len) /* write the /dev/dri node matching fd, or "unknown path" */
++{
++	struct stat remote, local;
++	int i;
++
++	if (fstat(fd, &remote))
++		goto out;
++
++	/* Probe the primary and the render node of each index independently:
++	 * a missing cardN must not hide an existing renderD(128+N).
++	 */
++	for (i = 0; i < 16; i++) {
++		snprintf(buf, len, "/dev/dri/card%d", i);
++		if (stat(buf, &local) == 0 &&
++		    local.st_mode == remote.st_mode &&
++		    local.st_rdev == remote.st_rdev)
++			return;
++
++		snprintf(buf, len, "/dev/dri/renderD%d", i + 128);
++		if (stat(buf, &local) == 0 &&
++		    local.st_mode == remote.st_mode &&
++		    local.st_rdev == remote.st_rdev)
++			return;
++	}
++
++out:
++	/* snprintf, unlike strncpy, always NUL-terminates a short buffer. */
++	snprintf(buf, len, "%s", "unknown path");
++}
++
++static void get_driver_name(int fd, char *name, int len) /* copy the DRM driver name of fd into name[len]; left empty if the ioctl fails */
++{
++	drm_version_t version;
++
++	memset(name, 0, len);
++	memset(&version, 0, sizeof(version));
++	version.name_len = len > 0 ? len - 1 : 0; /* the kernel adds no NUL: keep the last zeroed byte as terminator */
++	version.name = name;
++
++	(void)drmIoctl(fd, DRM_IOCTL_VERSION, &version); /* best effort: the result is deliberately ignored */
++}
++
++static int compute_refresh_rate_from_mode(long n, long d, unsigned flags, /* reduce n/d to a refresh-rate fraction; 1 on success, 0 if d is not positive */
++					   int32_t *numerator,
++					   int32_t *denominator)
++{
++	int i;
++
++	/* The mode flags are only defined privately to the Xserver (in xf86str.h)
++	 * but they are at least bit-compatible between VidMode, RandR and DRM.
++	 */
++# define V_INTERLACE 0x010
++# define V_DBLSCAN   0x020
++
++	if (d <= 0) /* degenerate mode totals: report failure rather than divide by zero */
++		return 0;
++
++	if (flags & V_INTERLACE)
++		n *= 2;
++	else if (flags & V_DBLSCAN)
++		d *= 2;
++
++	/* The OML_sync_control spec requires that if the refresh rate is a
++	 * whole number, that the returned numerator be equal to the refresh
++	 * rate and the denominator be 1.
++	 */
++	if (n % d == 0) {
++		n /= d;
++		d = 1;
++	}
++	else {
++		static const unsigned f[] = { 13, 11, 7, 5, 3, 2, 0 };
++		/* This is a poor man's way to reduce a fraction.  It's far from
++		 * perfect, but it will work well enough for this situation.
++		 */
++		for (i = 0; f[i] != 0; i++) {
++			while (n % f[i] == 0 && d % f[i] == 0) {
++				d /= f[i];
++				n /= f[i];
++			}
++		}
++	}
++
++	*numerator = n;
++	*denominator = d;
++	return 1;
++}
++
++static int RRGetMscRate(Display *dpy, int32_t *numerator, int32_t *denominator) /* refresh rate via RandR; 0 if it cannot be determined */
++{
++	int ret = 0;
++	Window root = RootWindow(dpy, DefaultScreen(dpy));
++	XRRScreenResources *res;
++	int rr_event, rr_error;
++	RROutput primary;
++	RRMode mode = 0; /* 0 doubles as "no mode found yet" */
++	int n;
++
++	if (!XRRQueryExtension(dpy, &rr_event, &rr_error))
++		return ret;
++
++	res = XRRGetScreenResourcesCurrent(dpy, root);
++	if (res == NULL)
++		return ret;
++
++	/* Use the primary output if specified, otherwise
++	 * use the mode on the first enabled crtc.
++	 */
++	primary = XRRGetOutputPrimary(dpy, root);
++	if (primary) {
++		XRROutputInfo *output;
++
++		output = XRRGetOutputInfo(dpy, res, primary);
++		if (output != NULL) {
++			if (output->crtc) {
++				XRRCrtcInfo *crtc;
++
++				crtc = XRRGetCrtcInfo(dpy, res, output->crtc);
++				if (crtc) {
++					mode = crtc->mode;
++					XRRFreeCrtcInfo(crtc);
++				}
++			}
++			XRRFreeOutputInfo(output);
++		}
++	}
++
++	for (n = 0; mode == 0 && n < res->ncrtc; n++) { /* fallback: stop at the first crtc reporting a non-zero mode */
++		XRRCrtcInfo *crtc;
++
++		crtc = XRRGetCrtcInfo(dpy, res, res->crtcs[n]);
++		if (crtc) {
++			mode = crtc->mode;
++			XRRFreeCrtcInfo(crtc);
++		}
++	}
++
++	for (n = 0; n < res->nmode; n++) { /* look the mode id up to obtain its timings */
++		if (res->modes[n].id == mode) {
++			ret = compute_refresh_rate_from_mode(res->modes[n].dotClock,
++							     res->modes[n].hTotal*res->modes[n].vTotal,
++							     res->modes[n].modeFlags,
++							     numerator, denominator);
++			break;
++		}
++	}
++
++	XRRFreeScreenResources(res);
++	return ret;
++}
++
++static int VMGetMscRate(Display *dpy, int32_t *numerator, int32_t *denominator)
++{
++	XF86VidModeModeLine mode_line;
++	int dot_clock;
++	int i;
++
++	/* VidMode fallback, 0 if unavailable; multiply in long, not in int. */
++	if (XF86VidModeQueryVersion(dpy, &i, &i) &&
++	    XF86VidModeGetModeLine(dpy, DefaultScreen(dpy), &dot_clock, &mode_line))
++		return compute_refresh_rate_from_mode(dot_clock * 1000L,
++						      (long)mode_line.vtotal * mode_line.htotal,
++						      mode_line.flags,
++						      numerator, denominator);
++	return 0;
++}
++
++/* Determine the display's refresh rate as numerator/denominator, asking
++ * RandR first and XF86VidMode second.
++ * Returns 1 as soon as one of them succeeds, 0 if both fail.
++ */
++static int get_refresh_rate(Display *dpy,
++			     int32_t *numerator,
++			     int32_t *denominator)
++{
++	/* || short-circuits: VidMode is only consulted once RandR has failed. */
++	return RRGetMscRate(dpy, numerator, denominator) ||
++	       VMGetMscRate(dpy, numerator, denominator);
++}
++
++/* Print a report for one X display: whether DRI3 can be opened, the device
++ * path and driver matching the returned fd, and the refresh rate. */
++static void info(const char *dpyname)
++{
++	Display *dpy = XOpenDisplay(dpyname);
++	int32_t num, den;
++	int fd;
++
++	if (!dpy) {
++		const char *shown = dpyname ? dpyname : getenv("DISPLAY");
++
++		printf("Unable to connect to display '%s'\n", shown ? shown : "unset");
++		return;
++	}
++
++	printf("Display '%s'\n", DisplayString(dpy));
++	fd = dri3_open(dpy);
++	if (fd >= 0) {
++		char path[1024], driver[1024];
++
++		get_device_path(fd, path, sizeof(path));
++		get_driver_name(fd, driver, sizeof(driver));
++		printf("Connected to DRI3, using fd %d which matches %s, driver %s\n",
++		       fd, path, driver);
++		close(fd);
++	} else {
++		printf("\tUnable to connect to DRI3\n");
++	}
++
++	if (get_refresh_rate(dpy, &num, &den))
++		printf("\tPrimary refresh rate: %d/%d (%.1fHz)\n",
++		       num, den, num/(float)den);
++
++	XCloseDisplay(dpy);
++}
++
++/* Report on every display named on the command line, or on the default
++ * display (NULL name) when there are no arguments. Always returns 0. */
++int main(int argc, char **argv)
++{
++	int arg;
++
++	if (argc <= 1)
++		info(NULL);
++	for (arg = 1; arg < argc; arg++)
++		info(argv[arg]);
++	return 0;
++}
+diff --git a/tools/virtual.c b/tools/virtual.c
+index 8e2b4a22..fc8db2b9 100644
+--- a/tools/virtual.c
++++ b/tools/virtual.c
+@@ -31,6 +31,7 @@
+ 
+ #include <X11/Xlibint.h>
+ #include <X11/extensions/record.h>
++#include <X11/extensions/scrnsaver.h>
+ #include <X11/extensions/XShm.h>
+ #if HAVE_X11_EXTENSIONS_SHMPROTO_H
+ #include <X11/extensions/shmproto.h>
+@@ -79,13 +80,15 @@ static int verbose;
+ #define DRAW 0x8
+ #define DAMAGE 0x10
+ #define CURSOR 0x20
+-#define POLL 0x40
++#define SCREEN 0x40
++#define POLL 0x80
+ 
+ struct display {
+ 	Display *dpy;
+ 	struct clone *clone;
+ 	struct context *ctx;
+ 
++	int saver_event, saver_error, saver_active;
+ 	int damage_event, damage_error;
+ 	int xfixes_event, xfixes_error;
+ 	int rr_event, rr_error, rr_active;
+@@ -98,6 +101,7 @@ struct display {
+ 	int width;
+ 	int height;
+ 	int depth;
++	int active;
+ 
+ 	XRenderPictFormat *root_format;
+ 	XRenderPictFormat *rgb16_format;
+@@ -111,7 +115,7 @@ struct display {
+ 	Cursor invisible_cursor;
+ 	Cursor visible_cursor;
+ 
+-	XcursorImage cursor_image;
++	XcursorImage cursor_image; /* first only */
+ 	int cursor_serial;
+ 	int cursor_x;
+ 	int cursor_y;
+@@ -123,6 +127,13 @@ struct display {
+ 	int send;
+ 	int skip_clone;
+ 	int skip_frame;
++
++	struct {
++		int timeout;
++		int interval;
++		int prefer_blank;
++		int allow_exp;
++	} saver;
+ };
+ 
+ struct output {
+@@ -145,6 +156,7 @@ struct output {
+ 	XRenderPictFormat *use_render;
+ 
+ 	int x, y;
++	int width, height;
+ 	XRRModeInfo mode;
+ 	Rotation rotation;
+ };
+@@ -218,6 +230,13 @@ static inline XRRScreenResources *_XRRGetScreenResourcesCurrent(Display *dpy, Wi
+ static int _x_error_occurred;
+ 
+ static int
++_io_error_handler(Display *display)
++{
++	fprintf(stderr, "XIO error on display %s\n", DisplayString(display));
++	abort(); /* does not return, hence no return statement for the int result */
++}
++
++static int
+ _check_error_handler(Display     *display,
+ 		     XErrorEvent *event)
+ {
+@@ -243,6 +262,10 @@ can_use_shm(Display *dpy,
+ 	XExtCodes *codes;
+ 	int major, minor, has_shm, has_pixmap;
+ 
++	*shm_event = 0;
++	*shm_opcode = 0;
++	*shm_pixmap = 0;
++
+ 	if (!XShmQueryExtension(dpy))
+ 		return 0;
+ 
+@@ -320,6 +343,7 @@ can_use_shm(Display *dpy,
+ #include <X11/Xlib-xcb.h>
+ #include <X11/xshmfence.h>
+ #include <xcb/xcb.h>
++#include <xcb/xcbext.h>
+ #include <xcb/dri3.h>
+ #include <xcb/sync.h>
+ static Pixmap dri3_create_pixmap(Display *dpy,
+@@ -357,6 +381,7 @@ static int dri3_query_version(Display *dpy, int *major, int *minor)
+ {
+ 	xcb_connection_t *c = XGetXCBConnection(dpy);
+ 	xcb_dri3_query_version_reply_t *reply;
++	xcb_generic_error_t *error;
+ 
+ 	*major = *minor = -1;
+ 
+@@ -364,7 +389,8 @@ static int dri3_query_version(Display *dpy, int *major, int *minor)
+ 					     xcb_dri3_query_version(c,
+ 								    XCB_DRI3_MAJOR_VERSION,
+ 								    XCB_DRI3_MINOR_VERSION),
+-					     NULL);
++					     &error);
++	free(error);
+ 	if (reply == NULL)
+ 		return -1;
+ 
+@@ -377,8 +403,13 @@ static int dri3_query_version(Display *dpy, int *major, int *minor)
+ 
+ static int dri3_exists(Display *dpy)
+ {
++	const xcb_query_extension_reply_t *ext;
+ 	int major, minor;
+ 
++	ext = xcb_get_extension_data(XGetXCBConnection(dpy), &xcb_dri3_id);
++	if (ext == NULL || !ext->present)
++		return 0;
++
+ 	if (dri3_query_version(dpy, &major, &minor) < 0)
+ 		return 0;
+ 
+@@ -809,6 +840,10 @@ static int clone_update_modes__fixed(struct clone *clone)
+ 	RRMode id;
+ 	int i, j, ret = ENOENT;
+ 
++	DBG(X11, ("%s-%s cloning modes fixed %dx%d\n",
++	     DisplayString(clone->dst.dpy), clone->dst.name,
++	     clone->dst.width, clone->dst.height));
++
+ 	assert(clone->src.rr_output);
+ 
+ 	res = _XRRGetScreenResourcesCurrent(clone->src.dpy, clone->src.window);
+@@ -837,8 +872,8 @@ static int clone_update_modes__fixed(struct clone *clone)
+ 
+ 	/* Create matching mode for the real output on the virtual */
+ 	memset(&mode, 0, sizeof(mode));
+-	mode.width = clone->width;
+-	mode.height = clone->height;
++	mode.width = clone->dst.width;
++	mode.height = clone->dst.height;
+ 	mode.nameLength = sprintf(mode_name, "FAKE-%dx%d", mode.width, mode.height);
+ 	mode.name = mode_name;
+ 
+@@ -942,6 +977,35 @@ out:
+ 	return rr_output;
+ }
+ 
++/* Check whether the display has an output named "VIRTUAL1": returns 0 if
++ * so, -ENOENT if not, -ENOMEM if the screen resources cannot be fetched. */
++static int check_virtual(struct display *display)
++{
++	XRRScreenResources *res;
++	int status = -ENOENT;
++	int n;
++
++	res = _XRRGetScreenResourcesCurrent(display->dpy, display->root);
++	if (!res)
++		return -ENOMEM;
++
++	/* Stop at the first match; outputs that cannot be queried are skipped. */
++	for (n = 0; status == -ENOENT && n < res->noutput; n++) {
++		XRROutputInfo *out = XRRGetOutputInfo(display->dpy, res, res->outputs[n]);
++
++		if (!out)
++			continue;
++		if (!strcmp(out->name, "VIRTUAL1"))
++			status = 0;
++		XRRFreeOutputInfo(out);
++	}
++	XRRFreeScreenResources(res);
++
++	DBG(XRR, ("%s(%s): has VIRTUAL1? %d\n",
++		  __func__, DisplayString(display->dpy), status));
++	return status;
++}
++
+ static int stride_for_depth(int width, int depth)
+ {
+ 	if (depth == 24)
+@@ -1082,20 +1146,20 @@ static int clone_init_xfer(struct clone *clone)
+ 		width = 0;
+ 		height = 0;
+ 	} else if (clone->dri3.xid) {
+-		width = clone->dst.display->width;
+-		height = clone->dst.display->height;
++		width = clone->dst.width;
++		height = clone->dst.height;
+ 	} else {
+ 		width = mode_width(&clone->src.mode, clone->src.rotation);
+ 		height = mode_height(&clone->src.mode, clone->src.rotation);
+ 	}
+ 
++	DBG(DRAW, ("%s-%s create xfer, %dx%d (currently %dx%d)\n",
++	     DisplayString(clone->dst.dpy), clone->dst.name,
++	     width, height, clone->width, clone->height));
++
+ 	if (width == clone->width && height == clone->height)
+ 		return 0;
+ 
+-	DBG(DRAW, ("%s-%s create xfer, %dx%d\n",
+-	     DisplayString(clone->dst.dpy), clone->dst.name,
+-	     width, height));
+-
+ 	if (clone->shm.shmaddr) {
+ 		if (clone->src.use_shm)
+ 			XShmDetach(clone->src.dpy, &clone->src.shm);
+@@ -1225,6 +1289,56 @@ static void clone_update(struct clone *clone)
+ 	clone->rr_update = 0;
+ }
+ 
++/* Record whether the XScreenSaver extension is present and store the
++ * display's current screen saver settings in display->saver. */
++static void screensaver_save(struct display *display)
++{
++	Display *dpy = display->dpy;
++
++	display->saver_active = XScreenSaverQueryExtension(dpy,
++							   &display->saver_event,
++							   &display->saver_error);
++	DBG(SCREEN,
++	    ("%s screen saver active? %d [event=%d, error=%d]\n",
++	     DisplayString(dpy),
++	     display->saver_active, display->saver_event, display->saver_error));
++
++	XGetScreenSaver(dpy,
++			&display->saver.timeout,
++			&display->saver.interval,
++			&display->saver.prefer_blank,
++			&display->saver.allow_exp);
++	DBG(SCREEN,
++	    ("%s saving screen saver defaults: timeout=%d interval=%d prefer_blank=%d allow_exp=%d\n",
++	     DisplayString(dpy),
++	     display->saver.timeout,
++	     display->saver.interval,
++	     display->saver.prefer_blank,
++	     display->saver.allow_exp));
++}
++
++/* Set the screen saver timeout and interval to 0, then call display_mark_flush(). */
++static void screensaver_disable(struct display *display)
++{
++	Display *dpy = display->dpy;
++	DBG(SCREEN, ("%s disabling screen saver\n", DisplayString(dpy)));
++	XSetScreenSaver(dpy, 0, 0, DefaultBlanking, DefaultExposures);
++	display_mark_flush(display);
++}
++
++/* Reapply the settings held in display->saver, then call display_mark_flush(). */
++static void screensaver_restore(struct display *display)
++{
++	Display *dpy = display->dpy;
++
++	DBG(SCREEN, ("%s restoring screen saver\n", DisplayString(dpy)));
++
++	XSetScreenSaver(dpy,
++			display->saver.timeout, display->saver.interval,
++			display->saver.prefer_blank, display->saver.allow_exp);
++	display_mark_flush(display);
++}
++
+ static int context_update(struct context *ctx)
+ {
+ 	Display *dpy = ctx->display->dpy;
+@@ -1325,8 +1439,19 @@ static int context_update(struct context *ctx)
+ 		struct clone *clone;
+ 		int x1, x2, y1, y2;
+ 
+-		if (display->rr_active == 0)
++		if (display->rr_active == 0) {
++			for (clone = display->clone; clone; clone = clone->next) {
++				struct output *output = &clone->src;
++				if (output->mode.id) {
++					clone->dst.mode.id = -1;
++					clone->dst.rr_crtc = -1;
++				} else {
++					clone->dst.mode.id = 0;
++					clone->dst.rr_crtc = 0;
++				}
++			}
+ 			continue;
++		}
+ 
+ 		x1 = y1 = INT_MAX;
+ 		x2 = y2 = INT_MIN;
+@@ -1570,6 +1695,13 @@ ungrab:
+ 		XUngrabServer(display->dpy);
+ 	}
+ 
++	for (n = 1; n < ctx->ndisplay; n++) {
++		struct display *display = &ctx->display[n];
++
++		display->active = 0;
++		screensaver_restore(display);
++	}
++
+ 	ctx->active = NULL;
+ 	for (n = 0; n < ctx->nclone; n++) {
+ 		struct clone *clone = &ctx->clones[n];
+@@ -1580,7 +1712,10 @@ ungrab:
+ 			continue;
+ 
+ 		DBG(XRR, ("%s-%s: added to active list\n",
+-		     DisplayString(clone->dst.display->dpy), clone->dst.name));
++			  DisplayString(clone->dst.display->dpy), clone->dst.name));
++
++		if (clone->dst.display->active++ == 0)
++			screensaver_disable(clone->dst.display);
+ 
+ 		clone->active = ctx->active;
+ 		ctx->active = clone;
+@@ -1599,14 +1734,17 @@ static Cursor display_load_invisible_cursor(struct display *display)
+ 
+ static Cursor display_get_visible_cursor(struct display *display)
+ {
+-	if (display->cursor_serial != display->cursor_image.size) {
+-		DBG(CURSOR, ("%s updating cursor\n", DisplayString(display->dpy)));
++	struct display *first = display->ctx->display;
++
++	if (display->cursor_serial != first->cursor_serial) {
++		DBG(CURSOR, ("%s updating cursor %dx%d, serial %d\n",
++		    DisplayString(display->dpy), first->cursor_image.width, first->cursor_image.height, first->cursor_serial));
+ 
+ 		if (display->visible_cursor)
+ 			XFreeCursor(display->dpy, display->visible_cursor);
+ 
+-		display->visible_cursor = XcursorImageLoadCursor(display->dpy, &display->cursor_image);
+-		display->cursor_serial = display->cursor_image.size;
++		display->visible_cursor = XcursorImageLoadCursor(display->dpy, &first->cursor_image);
++		display->cursor_serial = first->cursor_serial;
+ 	}
+ 
+ 	return display->visible_cursor;
+@@ -1629,7 +1767,7 @@ static void display_load_visible_cursor(struct display *display, XFixesCursorIma
+ 	display->cursor_image.height = cur->height;
+ 	display->cursor_image.xhot = cur->xhot;
+ 	display->cursor_image.yhot = cur->yhot;
+-	display->cursor_image.size++;
++	display->cursor_serial++;
+ 
+ 	n = cur->width*cur->height;
+ 	src = cur->pixels;
+@@ -1637,11 +1775,24 @@ static void display_load_visible_cursor(struct display *display, XFixesCursorIma
+ 	while (n--)
+ 		*dst++ = *src++;
+ 
+-	DBG(CURSOR, ("%s marking cursor changed\n", DisplayString(display->dpy)));
+-	display->cursor_moved++;
+-	if (display->cursor != display->invisible_cursor) {
+-		display->cursor_visible++;
+-		context_enable_timer(display->ctx);
++	if (verbose & CURSOR) {
++		int x, y;
++
++		printf("%s cursor image %dx%d, serial %d:\n",
++		       DisplayString(display->dpy),
++		       cur->width, cur->height,
++		       display->cursor_serial);
++		dst = display->cursor_image.pixels;
++		for (y = 0; y < cur->height; y++) {
++			for (x = 0; x < cur->width; x++) {
++				if (x == cur->xhot && y == cur->yhot)
++					printf("+");
++				else
++					printf("%c", *dst ? *dst >> 24 >= 127 ? 'x' : '.' : ' ');
++				dst++;
++			}
++			printf("\n");
++		}
+ 	}
+ }
+ 
+@@ -1685,6 +1836,8 @@ static void display_flush_cursor(struct display *display)
+ 	if (cursor == None)
+ 		cursor = display->invisible_cursor;
+ 	if (cursor != display->cursor) {
++		DBG(CURSOR, ("%s setting cursor shape %lx\n",
++		    DisplayString(display->dpy), (long)cursor));
+ 		XDefineCursor(display->dpy, display->root, cursor);
+ 		display->cursor = cursor;
+ 	}
+@@ -1762,6 +1915,8 @@ static void get_src(struct clone *c, const XRectangle *clip)
+ 	c->image.obdata = (char *)&c->src.shm;
+ 
+ 	if (c->src.use_render) {
++		DBG(DRAW, ("%s-%s get_src via XRender\n",
++			   DisplayString(c->dst.dpy), c->dst.name));
+ 		XRenderComposite(c->src.dpy, PictOpSrc,
+ 				 c->src.win_picture, 0, c->src.pix_picture,
+ 				 clip->x, clip->y,
+@@ -1782,16 +1937,22 @@ static void get_src(struct clone *c, const XRectangle *clip)
+ 				     &c->image, 0, 0);
+ 		}
+ 	} else if (c->src.pixmap) {
++		DBG(DRAW, ("%s-%s get_src XCopyArea (SHM/DRI3)\n",
++			   DisplayString(c->dst.dpy), c->dst.name));
+ 		XCopyArea(c->src.dpy, c->src.window, c->src.pixmap, c->src.gc,
+ 			  clip->x, clip->y,
+ 			  clip->width, clip->height,
+ 			  0, 0);
+ 		XSync(c->src.dpy, False);
+ 	} else if (c->src.use_shm) {
++		DBG(DRAW, ("%s-%s get_src XShmGetImage\n",
++			   DisplayString(c->dst.dpy), c->dst.name));
+ 		ximage_prepare(&c->image, clip->width, clip->height);
+ 		XShmGetImage(c->src.dpy, c->src.window, &c->image,
+ 			     clip->x, clip->y, AllPlanes);
+ 	} else {
++		DBG(DRAW, ("%s-%s get_src XGetSubImage (slow)\n",
++			   DisplayString(c->dst.dpy), c->dst.name));
+ 		ximage_prepare(&c->image, c->width, c->height);
+ 		XGetSubImage(c->src.dpy, c->src.window,
+ 			     clip->x, clip->y, clip->width, clip->height,
+@@ -1838,7 +1999,7 @@ static void put_dst(struct clone *c, const XRectangle *clip)
+ 				 clip->width, clip->height);
+ 		c->dst.display->send |= c->dst.use_shm;
+ 	} else if (c->dst.pixmap) {
+-		DBG(DRAW, ("%s-%s using SHM pixmap\n",
++		DBG(DRAW, ("%s-%s using SHM or DRI3 pixmap\n",
+ 		     DisplayString(c->dst.dpy), c->dst.name));
+ 		c->dst.serial = NextRequest(c->dst.dpy);
+ 		XCopyArea(c->dst.dpy, c->dst.pixmap, c->dst.window, c->dst.gc,
+@@ -1870,6 +2031,9 @@ static int clone_paint(struct clone *c)
+ {
+ 	XRectangle clip;
+ 
++	if (c->width == 0 || c->height == 0)
++		return 0;
++
+ 	DBG(DRAW, ("%s-%s paint clone, damaged (%d, %d), (%d, %d) [(%d, %d), (%d,  %d)]\n",
+ 	     DisplayString(c->dst.dpy), c->dst.name,
+ 	     c->damaged.x1, c->damaged.y1,
+@@ -1944,6 +2108,10 @@ static int clone_paint(struct clone *c)
+ 		clip.height = c->damaged.y2 - c->damaged.y1;
+ 		get_src(c, &clip);
+ 
++		DBG(DRAW, ("%s-%s target offset %dx%d\n",
++			   DisplayString(c->dst.dpy), c->dst.name,
++			   c->dst.x - c->src.x, c->dst.y - c->src.y));
++
+ 		clip.x += c->dst.x - c->src.x;
+ 		clip.y += c->dst.y - c->src.y;
+ 		put_dst(c, &clip);
+@@ -1969,8 +2137,9 @@ static void clone_damage(struct clone *c, const XRectangle *rec)
+ 	if ((v = (int)rec->y + rec->height) > c->damaged.y2)
+ 		c->damaged.y2 = v;
+ 
+-	DBG(DAMAGE, ("%s-%s damaged: (%d, %d), (%d, %d)\n",
++	DBG(DAMAGE, ("%s-%s damaged: +(%d,%d)x(%d, %d) -> (%d, %d), (%d, %d)\n",
+ 	     DisplayString(c->dst.display->dpy), c->dst.name,
++	     rec->x, rec->y, rec->width, rec->height,
+ 	     c->damaged.x1, c->damaged.y1,
+ 	     c->damaged.x2, c->damaged.y2));
+ }
+@@ -2252,6 +2421,8 @@ static int clone_init_depth(struct clone *clone)
+ 	if (ret)
+ 		return ret;
+ 
++	clone->depth = depth;
++
+ 	DBG(X11, ("%s-%s using depth %d, requires xrender for src? %d, for dst? %d\n",
+ 	     DisplayString(clone->dst.dpy), clone->dst.name,
+ 	     clone->depth,
+@@ -2312,6 +2483,8 @@ static int add_display(struct context *ctx, Display *dpy)
+ 	display->depth = DefaultDepth(dpy, DefaultScreen(dpy));
+ 	display->visual = DefaultVisual(dpy, DefaultScreen(dpy));
+ 
++	XSelectInput(dpy, display->root, ExposureMask);
++
+ 	display->has_shm = can_use_shm(dpy, display->root,
+ 				       &display->shm_event,
+ 				       &display->shm_opcode,
+@@ -2323,6 +2496,8 @@ static int add_display(struct context *ctx, Display *dpy)
+ 	     display->shm_opcode,
+ 	     display->has_shm_pixmap));
+ 
++	screensaver_save(display);
++
+ 	display->rr_active = XRRQueryExtension(dpy, &display->rr_event, &display->rr_error);
+ 	DBG(X11, ("%s: randr_active?=%d, event=%d, error=%d\n",
+ 	     DisplayString(dpy),
+@@ -2592,6 +2767,11 @@ static int last_display_add_clones__randr(struct context *ctx)
+ 			return ret;
+ 		}
+ 
++		clone->dst.x = 0;
++		clone->dst.y = 0;
++		clone->dst.width = display->width;
++		clone->dst.height = display->height;
++
+ 		ret = clone_update_modes__randr(clone);
+ 		if (ret) {
+ 			fprintf(stderr, "Failed to clone output \"%s\" from display \"%s\"\n",
+@@ -2668,8 +2848,8 @@ static int last_display_add_clones__xinerama(struct context *ctx)
+ 		}
+ 
+ 		/* Replace the modes on the local VIRTUAL output with the remote Screen */
+-		clone->width = xi[n].width;
+-		clone->height = xi[n].height;
++		clone->dst.width = xi[n].width;
++		clone->dst.height = xi[n].height;
+ 		clone->dst.x = xi[n].x_org;
+ 		clone->dst.y = xi[n].y_org;
+ 		clone->dst.rr_crtc = -1;
+@@ -2698,64 +2878,67 @@ static int last_display_add_clones__display(struct context *ctx)
+ 	Display *dpy = display->dpy;
+ 	struct clone *clone;
+ 	Screen *scr;
++	int count, s;
+ 	char buf[80];
+ 	int ret;
+ 	RROutput id;
+ 
++	count = ScreenCount(dpy);
++	DBG(X11, ("%s(%s) - %d screens\n", __func__, DisplayString(dpy), count));
++	for (s = 0; s < count; s++) {
++		clone = add_clone(ctx);
++		if (clone == NULL)
++			return -ENOMEM;
+ 
+-	DBG(X11, ("%s(%s)\n", __func__, DisplayString(dpy)));
+-	clone = add_clone(ctx);
+-	if (clone == NULL)
+-		return -ENOMEM;
++		clone->depth = 24;
++		clone->next = display->clone;
++		display->clone = clone;
+ 
+-	clone->depth = 24;
+-	clone->next = display->clone;
+-	display->clone = clone;
++		id = claim_virtual(ctx->display, buf, ctx->nclone);
++		if (id == 0) {
++			fprintf(stderr, "Failed to find available VirtualHead \"%s\" for on display \"%s\"\n",
++				buf, DisplayString(dpy));
++		}
++		ret = clone_output_init(clone, &clone->src, ctx->display, buf, id);
++		if (ret) {
++			fprintf(stderr, "Failed to add display \"%s\"\n",
++				DisplayString(ctx->display->dpy));
++			return ret;
++		}
+ 
+-	id = claim_virtual(ctx->display, buf, ctx->nclone);
+-	if (id == 0) {
+-		fprintf(stderr, "Failed to find available VirtualHead \"%s\" for on display \"%s\"\n",
+-			buf, DisplayString(dpy));
+-	}
+-	ret = clone_output_init(clone, &clone->src, ctx->display, buf, id);
+-	if (ret) {
+-		fprintf(stderr, "Failed to add display \"%s\"\n",
+-			DisplayString(ctx->display->dpy));
+-		return ret;
+-	}
++		sprintf(buf, "SCREEN%d", s);
++		ret = clone_output_init(clone, &clone->dst, display, buf, 0);
++		if (ret) {
++			fprintf(stderr, "Failed to add display \"%s\"\n",
++				DisplayString(dpy));
++			return ret;
++		}
+ 
+-	sprintf(buf, "WHOLE");
+-	ret = clone_output_init(clone, &clone->dst, display, buf, 0);
+-	if (ret) {
+-		fprintf(stderr, "Failed to add display \"%s\"\n",
+-			DisplayString(dpy));
+-		return ret;
+-	}
++		ret = clone_init_depth(clone);
++		if (ret) {
++			fprintf(stderr, "Failed to negotiate image format for display \"%s\"\n",
++				DisplayString(dpy));
++			return ret;
++		}
+ 
+-	ret = clone_init_depth(clone);
+-	if (ret) {
+-		fprintf(stderr, "Failed to negotiate image format for display \"%s\"\n",
+-			DisplayString(dpy));
+-		return ret;
+-	}
++		/* Replace the modes on the local VIRTUAL output with the remote Screen */
++		scr = ScreenOfDisplay(dpy, s);
++		clone->dst.width = scr->width;
++		clone->dst.height = scr->height;
++		clone->dst.x = 0;
++		clone->dst.y = 0;
++		clone->dst.rr_crtc = -1;
++		ret = clone_update_modes__fixed(clone);
++		if (ret) {
++			fprintf(stderr, "Failed to clone display \"%s\"\n",
++				DisplayString(dpy));
++			return ret;
++		}
+ 
+-	/* Replace the modes on the local VIRTUAL output with the remote Screen */
+-	scr = ScreenOfDisplay(dpy, DefaultScreen(dpy));
+-	clone->width = scr->width;
+-	clone->height = scr->height;
+-	clone->dst.x = 0;
+-	clone->dst.y = 0;
+-	clone->dst.rr_crtc = -1;
+-	ret = clone_update_modes__fixed(clone);
+-	if (ret) {
+-		fprintf(stderr, "Failed to clone display \"%s\"\n",
+-			DisplayString(dpy));
+-		return ret;
++		clone->active = ctx->active;
++		ctx->active = clone;
+ 	}
+ 
+-	clone->active = ctx->active;
+-	ctx->active = clone;
+-
+ 	return 0;
+ }
+ 
+@@ -3168,6 +3351,33 @@ static void context_cleanup(struct context *ctx)
+ 	XCloseDisplay(dpy);
+ }
+ 
++/* Fetch the cursor image from the first display and load it there, then bump
++ * cursor_moved on every other display (and cursor_visible plus the timer
++ * unless that display currently shows its invisible cursor). */
++static void update_cursor_image(struct context *ctx)
++{
++	struct display *first = &ctx->display[0];
++	XFixesCursorImage *image;
++	int n;
++
++	DBG(CURSOR, ("%s cursor changed\n", DisplayString(first->dpy)));
++	image = XFixesGetCursorImage(first->dpy);
++	if (!image)
++		return;
++
++	display_load_visible_cursor(first, image);
++	for (n = 1; n < ctx->ndisplay; n++) {
++		struct display *remote = &ctx->display[n];
++		DBG(CURSOR, ("%s marking cursor changed\n", DisplayString(remote->dpy)));
++		remote->cursor_moved++;
++		if (remote->cursor == remote->invisible_cursor)
++			continue;
++		remote->cursor_visible++;
++		context_enable_timer(remote->ctx);
++	}
++	XFree(image);
++}
++
+ static int done;
+ 
+ static void signal_handler(int sig)
+@@ -3182,6 +3392,7 @@ int main(int argc, char **argv)
+ 	uint64_t count;
+ 	int daemonize = 1, bumblebee = 0, siblings = 0, singleton = 1;
+ 	int i, ret, open, fail;
++	int idle;
+ 
+ 	signal(SIGPIPE, SIG_IGN);
+ 
+@@ -3228,6 +3439,7 @@ int main(int argc, char **argv)
+ 		return -ret;
+ 
+ 	XSetErrorHandler(_check_error_handler);
++	XSetIOErrorHandler(_io_error_handler);
+ 
+ 	ret = add_fd(&ctx, display_open(&ctx, src_name));
+ 	if (ret) {
+@@ -3237,6 +3449,13 @@ int main(int argc, char **argv)
+ 		goto out;
+ 	}
+ 
++	ret = check_virtual(ctx.display);
++	if (ret) {
++		fprintf(stderr, "No VIRTUAL outputs on \"%s\".\n",
++			DisplayString(ctx.display->dpy));
++		goto out;
++	}
++
+ 	if (singleton) {
+ 		XSelectInput(ctx.display->dpy, ctx.display->root, PropertyChangeMask);
+ 		if (first_display_has_singleton(&ctx)) {
+@@ -3291,6 +3510,11 @@ int main(int argc, char **argv)
+ 	if (ret)
+ 		goto out;
+ 
++	if (ctx.display->saver_active)
++		XScreenSaverSelectInput(ctx.display->dpy,
++					ctx.display->root,
++					ScreenSaverNotifyMask);
++
+ 	if ((ctx.display->rr_event | ctx.display->rr_error) == 0) {
+ 		fprintf(stderr, "RandR extension not supported by %s\n", DisplayString(ctx.display->dpy));
+ 		ret = EINVAL;
+@@ -3348,25 +3572,60 @@ int main(int argc, char **argv)
+ 	signal(SIGTERM, signal_handler);
+ 
+ 	ctx.command_continuation = 0;
++	update_cursor_image(&ctx);
++
++	idle = 0;
+ 	while (!done) {
+ 		XEvent e;
+ 		int reconfigure = 0;
+ 		int rr_update = 0;
+ 
+-		DBG(POLL, ("polling - enable timer? %d, nfd=%d, ndisplay=%d\n", ctx.timer_active, ctx.nfd, ctx.ndisplay));
+-		ret = poll(ctx.pfd + !ctx.timer_active, ctx.nfd - !ctx.timer_active, -1);
+-		if (ret <= 0)
+-			break;
++		if (idle) {
++			DBG(POLL, ("polling - enable timer? %d, nfd=%d, ndisplay=%d\n", ctx.timer_active, ctx.nfd, ctx.ndisplay));
++			ret = poll(ctx.pfd + !ctx.timer_active, ctx.nfd - !ctx.timer_active, -1);
++			if (ret <= 0)
++				break;
++
++			DBG(POLL, ("poll reports %d fd awake\n", ret));
++		}
++		idle = 1;
+ 
+ 		/* pfd[0] is the timer, pfd[1] is the local display, pfd[2] is the mouse, pfd[3+] are the remotes */
+ 
+-		DBG(POLL, ("poll reports %d fd awake\n", ret));
+ 		if (ctx.pfd[1].revents || XPending(ctx.display[0].dpy)) {
+ 			DBG(POLL,("%s woken up\n", DisplayString(ctx.display[0].dpy)));
++			ctx.pfd[1].revents = 0;
++			idle = 0;
++
+ 			do {
+ 				XNextEvent(ctx.display->dpy, &e);
+ 
+-				if (e.type == ctx.display->damage_event + XDamageNotify ) {
++				DBG(POLL, ("%s received event %d\n", DisplayString(ctx.display[0].dpy), e.type));
++
++				if (e.type == ctx.display->saver_event + ScreenSaverNotify) {
++					const XScreenSaverNotifyEvent *se = (const XScreenSaverNotifyEvent *)&e;
++					DBG(SCREEN,
++					    ("%s screen saver: state=%d, kind=%d, forced=%d\n",
++					     DisplayString(ctx.display->dpy),
++					     se->state, se->kind, se->forced));
++					for (i = 1; i < ctx.ndisplay; i++) {
++						struct display *display = &ctx.display[i];
++
++						if (!display->active)
++							continue;
++
++						DBG(SCREEN,
++						    ("%s %s screen saver\n",
++						     DisplayString(display->dpy),
++						     se->state == ScreenSaverOn ? "activating" : "resetting\n"));
++
++						if (se->state == ScreenSaverOn)
++							XActivateScreenSaver(display->dpy);
++						else
++							XResetScreenSaver(display->dpy);
++						XFlush(display->dpy);
++					}
++				} else if (e.type == ctx.display->damage_event + XDamageNotify) {
+ 					const XDamageNotifyEvent *de = (const XDamageNotifyEvent *)&e;
+ 					struct clone *clone;
+ 
+@@ -3380,19 +3639,7 @@ int main(int argc, char **argv)
+ 					if (ctx.active)
+ 						context_enable_timer(&ctx);
+ 				} else if (e.type == ctx.display->xfixes_event + XFixesCursorNotify) {
+-					XFixesCursorImage *cur;
+-
+-					DBG(CURSOR, ("%s cursor changed\n",
+-					     DisplayString(ctx.display->dpy)));
+-
+-					cur = XFixesGetCursorImage(ctx.display->dpy);
+-					if (cur == NULL)
+-						continue;
+-
+-					for (i = 1; i < ctx.ndisplay; i++)
+-						display_load_visible_cursor(&ctx.display[i], cur);
+-
+-					XFree(cur);
++					update_cursor_image(&ctx);
+ 				} else if (e.type == ctx.display->rr_event + RRScreenChangeNotify) {
+ 					DBG(XRR, ("%s screen changed (reconfigure pending? %d)\n",
+ 					     DisplayString(ctx.display->dpy), reconfigure));
+@@ -3426,13 +3673,41 @@ int main(int argc, char **argv)
+ 			if (ctx.pfd[i+2].revents == 0 && !XPending(ctx.display[i].dpy))
+ 				continue;
+ 
++			ctx.pfd[i+2].revents = 0;
++			idle = 0;
++
+ 			DBG(POLL, ("%s woken up\n", DisplayString(ctx.display[i].dpy)));
+ 			do {
+ 				XNextEvent(ctx.display[i].dpy, &e);
+ 
+ 				DBG(POLL, ("%s received event %d\n", DisplayString(ctx.display[i].dpy), e.type));
+-				if (ctx.display[i].rr_active && e.type == ctx.display[i].rr_event + RRNotify) {
+-					XRRNotifyEvent *re = (XRRNotifyEvent *)&e;
++				if (e.type == Expose) {
++					const XExposeEvent *xe = (XExposeEvent *)&e;
++					struct clone *clone;
++					int damaged = 0;
++
++					DBG(DAMAGE, ("%s exposed: (%d, %d)x(%d, %d)\n",
++					     DisplayString(ctx.display[i].dpy),
++					     xe->x, xe->y, xe->width, xe->height));
++
++					for (clone = ctx.active; clone; clone = clone->active) {
++						XRectangle r;
++
++						if (clone->dst.display != &ctx.display[i])
++							continue;
++
++						r.x = clone->src.x + xe->x;
++						r.y = clone->src.y + xe->y;
++						r.width  = xe->width;
++						r.height = xe->height;
++						clone_damage(clone, &r);
++						damaged++;
++					}
++
++					if (damaged)
++						context_enable_timer(&ctx);
++				} else if (ctx.display[i].rr_active && e.type == ctx.display[i].rr_event + RRNotify) {
++					const XRRNotifyEvent *re = (XRRNotifyEvent *)&e;
+ 
+ 					DBG(XRR, ("%s received RRNotify, type %d\n", DisplayString(ctx.display[i].dpy), re->subtype));
+ 					if (re->subtype == RRNotify_OutputChange) {
+@@ -3480,6 +3755,7 @@ int main(int argc, char **argv)
+ 
+ 			DBG(TIMER, ("%s timer still active? %d\n", DisplayString(ctx.display->dpy), ret != 0));
+ 			ctx.timer_active = ret != 0;
++			idle = 0;
+ 		}
+ 	}
+ 
diff --git a/main/xf86-video-intel/xf86-video-intel-2.99.917-libdrm-kernel-4_0-crash.patch b/main/xf86-video-intel/xf86-video-intel-2.99.917-libdrm-kernel-4_0-crash.patch
deleted file mode 100644
index ea3aa30ed1..0000000000
--- a/main/xf86-video-intel/xf86-video-intel-2.99.917-libdrm-kernel-4_0-crash.patch
+++ /dev/null
@@ -1,65 +0,0 @@
-From 7fe2b2948652443ff43d907855bd7a051d54d309 Mon Sep 17 00:00:00 2001
-From: Chris Wilson <chris@chris-wilson.co.uk>
-Date: Thu, 19 Mar 2015 23:14:17 +0000
-Subject: sna: Protect against ABI breakage in recent versions of libdrm
-
-Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
-
-diff --git a/src/sna/kgem.c b/src/sna/kgem.c
-index 11f0828..6f16cba 100644
---- a/src/sna/kgem.c
-+++ b/src/sna/kgem.c
-@@ -182,6 +182,15 @@ struct local_i915_gem_caching {
- #define LOCAL_IOCTL_I915_GEM_SET_CACHING DRM_IOW(DRM_COMMAND_BASE + LOCAL_I915_GEM_SET_CACHING, struct local_i915_gem_caching)
- #define LOCAL_IOCTL_I915_GEM_GET_CACHING DRM_IOW(DRM_COMMAND_BASE + LOCAL_I915_GEM_GET_CACHING, struct local_i915_gem_caching)
- 
-+struct local_i915_gem_mmap {
-+	uint32_t handle;
-+	uint32_t pad;
-+	uint64_t offset;
-+	uint64_t size;
-+	uint64_t addr_ptr;
-+};
-+#define LOCAL_IOCTL_I915_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MMAP, struct local_i915_gem_mmap)
-+
- struct local_i915_gem_mmap2 {
- 	uint32_t handle;
- 	uint32_t pad;
-@@ -514,15 +523,15 @@ retry_wc:
- 
- static void *__kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo)
- {
--	struct drm_i915_gem_mmap mmap_arg;
-+	struct local_i915_gem_mmap arg;
- 	int err;
- 
- retry:
--	VG_CLEAR(mmap_arg);
--	mmap_arg.handle = bo->handle;
--	mmap_arg.offset = 0;
--	mmap_arg.size = bytes(bo);
--	if ((err = do_ioctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg))) {
-+	VG_CLEAR(arg);
-+	arg.handle = bo->handle;
-+	arg.offset = 0;
-+	arg.size = bytes(bo);
-+	if ((err = do_ioctl(kgem->fd, LOCAL_IOCTL_I915_GEM_MMAP, &arg))) {
- 		assert(err != EINVAL);
- 
- 		if (__kgem_throttle_retire(kgem, 0))
-@@ -536,10 +545,10 @@ retry:
- 		return NULL;
- 	}
- 
--	VG(VALGRIND_MAKE_MEM_DEFINED(mmap_arg.addr_ptr, bytes(bo)));
-+	VG(VALGRIND_MAKE_MEM_DEFINED(arg.addr_ptr, bytes(bo)));
- 
- 	DBG(("%s: caching CPU vma for %d\n", __FUNCTION__, bo->handle));
--	return bo->map__cpu = (void *)(uintptr_t)mmap_arg.addr_ptr;
-+	return bo->map__cpu = (void *)(uintptr_t)arg.addr_ptr;
- }
- 
- static int gem_write(int fd, uint32_t handle,
--- 
-cgit v0.10.2
-
diff --git a/main/xf86-video-intel/xorg-1.18.patch b/main/xf86-video-intel/xorg-1.18.patch
deleted file mode 100644
index ab3fb399a1..0000000000
--- a/main/xf86-video-intel/xorg-1.18.patch
+++ /dev/null
@@ -1,24 +0,0 @@
-From 2c5063938cc809f624e56efd4673041fa8141e81 Mon Sep 17 00:00:00 2001
-From: Martin Peres <martin.peres@linux.intel.com>
-Date: Thu, 9 Jul 2015 11:26:38 +0300
-Subject: uxa: fix the call to PixmapSyncDirtyHelper, broken by xserver's
- 90db5ed
-
-[ickle: switch to HAS_DIRTYTRACKING_ROTATION as suggested by Dave Airlie]
-Signed-off-by: Martin Peres <martin.peres@linux.intel.com>
-
-diff --git a/src/compat-api.h b/src/compat-api.h
-index aa93bee..293e9d7 100644
---- a/src/compat-api.h
-+++ b/src/compat-api.h
-@@ -247,3 +247,7 @@ static inline void FreePixmap(PixmapPtr pixmap)
- #endif
- 
- #endif
-+
-+#if HAS_DIRTYTRACKING_ROTATION
-+#define PixmapSyncDirtyHelper(d, dd) PixmapSyncDirtyHelper(d)
-+#endif
--- 
-cgit v0.10.2
-