xf86-video-msm/0000755000175000017500000000000011615776600013470 5ustar paulliupaulliuxf86-video-msm/autogen.sh0000755000175000017500000000030311615776600015465 0ustar paulliupaulliu#! /bin/sh srcdir=`dirname $0` test -z "$srcdir" && srcdir=. ORIGDIR=`pwd` cd $srcdir autoreconf -v --install || exit 1 cd $ORIGDIR || exit $? $srcdir/configure --enable-maintainer-mode "$@" xf86-video-msm/Makefile.am0000644000175000017500000000221311615776600015522 0ustar paulliupaulliu# Copyright 2005 Adam Jackson. # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # on the rights to use, copy, modify, merge, publish, distribute, sub # license, and/or sell copies of the Software, and to permit persons to whom # the Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice (including the next # paragraph) shall be included in all copies or substantial portions of the # Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL # ADAM JACKSON BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. AUTOMAKE_OPTIONS = foreign SUBDIRS = src xf86-video-msm/configure.ac0000644000175000017500000000664611615776600015772 0ustar paulliupaulliu# Copyright 2005 Adam Jackson. # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # on the rights to use, copy, modify, merge, publish, distribute, sub # license, and/or sell copies of the Software, and to permit persons to whom # the Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice (including the next # paragraph) shall be included in all copies or substantial portions of the # Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL # ADAM JACKSON BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # # Process this file with autoconf to produce a configure script AC_PREREQ(2.57) AC_INIT([xf86-video-msm], 1.1.0, [], xf86-video-msm) AC_CONFIG_SRCDIR([Makefile.am]) AM_CONFIG_HEADER([config.h]) AC_CONFIG_AUX_DIR(.) AM_INIT_AUTOMAKE([dist-bzip2]) AM_MAINTAINER_MODE # Checks for programs. 
AC_DISABLE_STATIC
AC_PROG_LIBTOOL
AC_PROG_CC
AM_PROG_AS

AH_TOP([#include "xorg-server.h"])

AC_ARG_ENABLE(dri, AC_HELP_STRING([--disable-dri],
                                  [Disable DRI support [[default=auto]]]),
              [DRI="$enableval"],
              [DRI=auto])

AC_ARG_WITH(xorg-module-dir,
            AC_HELP_STRING([--with-xorg-module-dir=DIR],
                           [Default xorg module directory [[default=$libdir/xorg/modules]]]),
            [moduledir="$withval"],
            [moduledir="$libdir/xorg/modules"])

# Allow the user to specify where the kernel headers for the MSM are
AC_ARG_WITH(kernel-headers,
            AC_HELP_STRING([--with-kernel-headers=DIR],
                           [Kernel header directory]),
            [CFLAGS="$CFLAGS -I$withval -I${withval}/../arch/arm/include"])

# Checks for extensions
#XORG_DRIVER_CHECK_EXT(RANDR, randrproto)
#XORG_DRIVER_CHECK_EXT(RENDER, renderproto)
#XORG_DRIVER_CHECK_EXT(XV, videoproto)

# Checks for pkg-config packages
PKG_CHECK_MODULES(XORG, [xorg-server xproto $REQUIRED_MODULES])
sdkdir=$(pkg-config --variable=sdkdir xorg-server)

# Checks for libraries.

# Checks for header files.
AC_HEADER_STDC

DRI=yes
DRI2=yes

AC_ARG_ENABLE(dri2, AC_HELP_STRING([--disable-dri2], [Disable DRI2 support]),
              [DRI2=no], [DRI2=yes])

AC_ARG_ENABLE(dri, AC_HELP_STRING([--disable-dri], [Disable DRI support]),
              [DRI=no], [DRI=yes])

AM_CONDITIONAL(USEDRI, test x$DRI = xyes)
if test "$DRI" = yes; then
	PKG_CHECK_MODULES(DRI, [libdrm >= 2.2 xf86driproto glproto])
	AC_DEFINE(USEDRI,1,[Enable DRI driver support])
fi

# We need xserver-xorg at least 1.6.3 for DRI2
AM_CONDITIONAL(USEDRI2, test x$DRI2 = xyes)
if test "$DRI2" = yes; then
	PKG_CHECK_MODULES(DRI2, [libdrm >= 2.2 xf86driproto glproto xorg-server >= 1.6.3])
	AC_DEFINE(USEDRI2,1,[Enable DRI2 driver support])
fi

AC_SUBST([XORG_CFLAGS])
AC_SUBST([moduledir])
AC_SUBST([CFLAGS])
AC_SUBST([CCASFLAGS])

DRIVER_NAME=msm
AC_SUBST([DRIVER_NAME])

#XORG_MANPAGE_SECTIONS
XORG_RELEASE_VERSION

AC_OUTPUT([
	Makefile
	src/Makefile
])
xf86-video-msm/.gitignore
.deps
.libs
Makefile
*.la
*.lo
*.o
*~
aclocal.m4
autom4te.cache
compile
config.h
config.h.in
config.log
config.status
depcomp
stamp-h1
Makefile.in
configure
libtool
ltmain.sh
xf86-video-msm/README
= Introduction

This is a graphics driver for the Qualcomm MSM7xxx and QSD8x50x processors.

= Building

== Dependencies

You will need the following X development dependencies installed on your
development system before building this package:
xorg-server > 1.4, xproto, fontsproto, renderproto

== Configuration

Type ./autogen.sh at the command line to automatically generate the
configuration system. If you need to change some parameters to the configure
script (such as prefix paths), you can re-run configure after the autogen.sh
script has completed.

Make sure that you specify the correct host and build targets for your cross
compiler. Here is an example:

./configure --build=x86_64-linux --host=arm-linux-gnueabi --target=arm-linux-gnueabi --prefix=/usr/local/qcomle/arm/arm-linux-gnueabi/

This will configure the system to use the 'arm-linux-gnueabi-gcc' compiler.

== Building

After the driver has been configured, it can be built by typing 'make' at
the command line.

= Using the driver

After building, the driver is located at src/.libs/msm_drv.so
This can be copied directly to your target. All X video drivers are located
in '/usr/lib/xorg/modules/drivers' on target. Copy the .so there.

To use the driver, you need to configure it in /etc/X11/xorg.conf.
Add the following section to your xorg.conf file:

Section "Device"
	Identifier "MSM"
	Driver "msm"
	Option "fb" "/dev/fb0"
#	Option "NoAccel" "true"
#	Option "SWBlit" "true"
EndSection

To change which framebuffer device you want to use, modify the "fb" option
to point at the device file you want to run on.

To disable acceleration, set the "NoAccel" option to true.

To use software blits only in EXA (no MSMFB_BLIT), set the "SWBlit" option
to "true". Note that software blits may be used regardless of the value of
the option.

Next, you need to add the "MSM" device to the screen:

Section "Screen"
	...
	Device "MSM"
EndSection

You do not need to configure a monitor or a mode size - the mode size is
determined automatically and cannot be changed.

Finally, restart X. In the log you should see the following:

(II) msm: Driver for Qualcomm MSM processors: MSM7201, MSM7X25

That indicates that the msm driver has been loaded.
xf86-video-msm/src/
xf86-video-msm/src/msm-swfill.c
/* msm-swfill.c
 *
 * Copyright (c) 2009, Code Aurora Forum. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of Code Aurora nor
 *       the names of its contributors may be used to endorse or promote
 *       products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

// TODO: Check if this is needed for fills. Rename it?
// Shared software blit code.
#include "msm-swblits.h"

/* Alignment check macro used to determine whether a pointer is aligned
   with a specified granularity. */
#define SW_CHECK_ALIGNMENT(ALIGNMENT_BYTE_SIZE,dst,REQUIRED_ALIGNMENT) \
    (((int) (dst) % (ALIGNMENT_BYTE_SIZE)) == (REQUIRED_ALIGNMENT))

/* Alignment check macro used to determine whether a pointer (along with its
   pitch) is aligned with a specified granularity. */
/* (Having the pitch aligned, as well as the pointer, ensures that the
   pointer will still be aligned each time it is incremented by the pitch.)
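
   As a usage sketch (hypothetical caller, not taken from this file), a fill
   routine could test a destination pointer and its pitch once up front and
   then rely on word stores staying aligned on every row:

       if (SW_CHECK_PITCHED_ALIGNMENT(4, dst, dpitch, dpitch, 0)) {
           int row;
           for (row = 0; row < h; row++, dst += dpitch)
               *(uint32_t *)dst = value;   // word store is aligned on each row
       }

   Here dst, dpitch, h, and value stand in for the caller's own state.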
*/ #define SW_CHECK_PITCHED_ALIGNMENT(ALIGNMENT_BYTE_SIZE,dst,dpitch,spitch,REQUIRED_ALIGNMENT) \ (((int) (dst) % (ALIGNMENT_BYTE_SIZE)) == (REQUIRED_ALIGNMENT) \ && (abs(dpitch) % (ALIGNMENT_BYTE_SIZE)) == 0) static inline void memset16_NoAlignmentAssumptions_UpTo7Count(uint8_t *dst, uint16_t src, int count) { const uint32_t packedSource32 = (uint32_t) src | ((uint32_t) src << 16); const uint64_t packedSource64 = (uint64_t) src | ((uint64_t) src << 16) | ((uint64_t) src << 32) | ((uint64_t) src << 48); // Quickly branch to customized code for each width. switch (count) { // Cases 0-7 are designed to be optimal in that they generate a minimal number of aligned operations with minimal alignment test code. case 0: break; case 1: *(uint16_t *)(dst) = src; break; case 2: if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,0)) { *(uint32_t *)(dst) = packedSource32; } else { *(uint16_t *)(dst) = src; *(uint16_t *)(dst+1*BYTES_PER_UINT16_T) = src; } break; case 3: if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,0)) { *(uint32_t *)(dst) = packedSource32; *(uint16_t *)(dst+BYTES_PER_UINT32_T) = src; } else { *(uint16_t *)(dst) = src; *(uint32_t *)(dst+BYTES_PER_UINT16_T) = packedSource32; } break; case 4: if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,0)) { *(uint64_t *)(dst) = packedSource64; } else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,0)) { *(uint32_t *)(dst) = packedSource32; *(uint32_t *)(dst+BYTES_PER_UINT32_T) = packedSource32; } else { *(uint16_t *)(dst) = src; *(uint32_t *)(dst+BYTES_PER_UINT16_T) = packedSource32; *(uint16_t *)(dst+BYTES_PER_UINT16_T+BYTES_PER_UINT32_T) = src; } break; case 5: if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,0)) { *(uint64_t *)(dst) = packedSource64; *(uint16_t *)(dst+BYTES_PER_UINT64_T) = src; } else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,0)) { *(uint32_t *)(dst) = packedSource32; *(uint32_t *)(dst+BYTES_PER_UINT32_T) = packedSource32; *(uint16_t *)(dst+BYTES_PER_UINT32_T+BYTES_PER_UINT32_T) = src; } else { *(uint16_t *)(dst) = src; *(uint32_t *)(dst+BYTES_PER_UINT16_T) = packedSource32; *(uint32_t *)(dst+BYTES_PER_UINT16_T+BYTES_PER_UINT32_T) = packedSource32; } break; case 6: if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,0)) { *(uint64_t *)(dst) = packedSource64; *(uint32_t *)(dst+BYTES_PER_UINT64_T) = packedSource32; } else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,0)) { *(uint32_t *)(dst) = packedSource32; *(uint64_t *)(dst+BYTES_PER_UINT32_T) = packedSource64; } else { *(uint16_t *)(dst) = src; *(uint32_t *)(dst+BYTES_PER_UINT16_T) = packedSource32; *(uint32_t *)(dst+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = packedSource32; *(uint16_t *)(dst+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src; } break; case 7: if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,0)) { *(uint64_t *)(dst) = packedSource64; *(uint32_t *)(dst+BYTES_PER_UINT64_T) = packedSource32; *(uint16_t *)(dst+BYTES_PER_UINT64_T+BYTES_PER_UINT32_T) = src; } else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,0)) { *(uint32_t *)(dst) = packedSource32; *(uint64_t *)(dst+BYTES_PER_UINT32_T) = packedSource64; *(uint16_t *)(dst+BYTES_PER_UINT32_T+BYTES_PER_UINT64_T) = src; } else { *(uint16_t *)(dst) = src; *(uint32_t *)(dst+BYTES_PER_UINT16_T) = packedSource32; *(uint32_t *)(dst+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = packedSource32; *(uint32_t *)(dst+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = packedSource32; } break; default: break; } } static inline void 
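/* Usage sketch for the helper above (hypothetical caller, not part of the
 * original file): fill the first five pixels of an RGB565 scanline with red,
 * with no alignment guarantee beyond the half-word alignment of uint16_t:
 *
 *     uint16_t row[8];
 *     memset16_NoAlignmentAssumptions_UpTo7Count((uint8_t *)row, 0xF800, 5);
 *
 * On an 8-byte-aligned pointer, case 5 above resolves to one 64-bit store
 * plus one 16-bit store; on an odd half-word boundary it degrades to a
 * 16-bit store followed by two 32-bit stores. */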
memset16_NeonAlignmentAssumptions_UpTo7Count(uint8_t *dst, uint16_t src, int count)
{
    const uint32_t packedSource32 = (uint32_t) src | ((uint32_t) src << 16);
    const uint64_t packedSource64 = (uint64_t) src | ((uint64_t) src << 16) | ((uint64_t) src << 32) | ((uint64_t) src << 48);

    // Quickly branch to customized code for each width.
    // NOTE: We don't need any alignment checks because dest is assumed to already be Neon-aligned
    // (which guarantees double-word, word and half-word alignment as well).
    switch (count)
    {
        // Cases 0-7 are designed to be optimal in that they generate a minimal number of aligned operations with minimal alignment test code.
        case 0: break;
        case 1: *(uint16_t *)(dst) = src;
                break;
        case 2: *(uint32_t *)(dst) = packedSource32;
                break;
        case 3: *(uint32_t *)(dst) = packedSource32;
                *(uint16_t *)(dst+BYTES_PER_UINT32_T) = src;
                break;
        case 4: *(uint64_t *)(dst) = packedSource64;
                break;
        case 5: *(uint64_t *)(dst) = packedSource64;
                *(uint16_t *)(dst+BYTES_PER_UINT64_T) = src;
                break;
        case 6: *(uint64_t *)(dst) = packedSource64;
                *(uint32_t *)(dst+BYTES_PER_UINT64_T) = packedSource32;
                break;
        case 7: *(uint64_t *)dst = packedSource64;
                *(uint32_t *)(dst+BYTES_PER_UINT64_T) = packedSource32;
                *(uint16_t *)(dst+BYTES_PER_UINT64_T+BYTES_PER_UINT32_T) = src;
                break;
    }
}

static inline void
memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
{
    // Get the number of pixels that can be packed into the eight lanes of a 16x8 vector of uint16_t words.
    const int PIXELS_IN_VECTOR_COLUMN = BYTES_PER_UINT16X8_T / BYTES_PER_UINT16_T;

    // Duplicate the 16-bit source value into 8 lanes of a 16x8 vector (8 pixels).
    uint16x8_t packedSource128 = vdupq_n_u16(src);

    // Quickly branch to customized code for each width.
    switch (count / PIXELS_IN_VECTOR_COLUMN)
    {
        // Cases are designed to be near-optimal in terms of number of operations, but they don't attempt to align memory access.
        // (This can result in slowdowns unless the function is called with an aligned destination pointer.)
        case 0: break;
        case 1:
        {
            // If we get here, we can assume there are at least 8 pixels to copy,
            // so copy one vector worth of pixels.
            const int ONE_COLUMN = 1;
            count -= PIXELS_IN_VECTOR_COLUMN * ONE_COLUMN;
            vst1q_u16((uint16_t *)(dst+0*BYTES_PER_UINT16X8_T),packedSource128);
            dst += BYTES_PER_UINT16X8_T * ONE_COLUMN;
        }
        break;
        case 2:
        {
            // If we get here, we can assume there are at least 16 pixels to copy,
            // so copy two vectors worth of pixels.
            const int TWO_COLUMNS = 2;
            count -= PIXELS_IN_VECTOR_COLUMN * TWO_COLUMNS;
            vst1q_u16((uint16_t *)(dst+0*BYTES_PER_UINT16X8_T),packedSource128);
            vst1q_u16((uint16_t *)(dst+1*BYTES_PER_UINT16X8_T),packedSource128);
            dst += BYTES_PER_UINT16X8_T * TWO_COLUMNS;
        }
        break;
        case 3:
        {
            // If we get here, we can assume there are at least 24 pixels to copy,
            // so copy three vectors worth of pixels.
            const int THREE_COLUMNS = 3;
            count -= PIXELS_IN_VECTOR_COLUMN * THREE_COLUMNS;
            vst1q_u16((uint16_t *)(dst+0*BYTES_PER_UINT16X8_T),packedSource128);
            vst1q_u16((uint16_t *)(dst+1*BYTES_PER_UINT16X8_T),packedSource128);
            vst1q_u16((uint16_t *)(dst+2*BYTES_PER_UINT16X8_T),packedSource128);
            dst += BYTES_PER_UINT16X8_T * THREE_COLUMNS;
        }
        break;
        case 4:
        {
            // If we get here, we can assume there are at least 32 pixels to copy,
            // so copy four vectors worth of pixels.
const int FOUR_COLUMNS = 4; count -= PIXELS_IN_VECTOR_COLUMN * FOUR_COLUMNS; vst1q_u16((uint16_t *)(dst+0*BYTES_PER_UINT16X8_T),packedSource128); vst1q_u16((uint16_t *)(dst+1*BYTES_PER_UINT16X8_T),packedSource128); vst1q_u16((uint16_t *)(dst+2*BYTES_PER_UINT16X8_T),packedSource128); vst1q_u16((uint16_t *)(dst+3*BYTES_PER_UINT16X8_T),packedSource128); dst += BYTES_PER_UINT16X8_T * FOUR_COLUMNS; } break; default: { // Copy multiple columns of a vector -- eight vectors at a time. const int EIGHT_COLUMNS = 8; while (count >= PIXELS_IN_VECTOR_COLUMN * EIGHT_COLUMNS) { count -= PIXELS_IN_VECTOR_COLUMN * EIGHT_COLUMNS; vst1q_u16((uint16_t *)(dst+0*BYTES_PER_UINT16X8_T),packedSource128); vst1q_u16((uint16_t *)(dst+1*BYTES_PER_UINT16X8_T),packedSource128); vst1q_u16((uint16_t *)(dst+2*BYTES_PER_UINT16X8_T),packedSource128); vst1q_u16((uint16_t *)(dst+3*BYTES_PER_UINT16X8_T),packedSource128); vst1q_u16((uint16_t *)(dst+4*BYTES_PER_UINT16X8_T),packedSource128); vst1q_u16((uint16_t *)(dst+5*BYTES_PER_UINT16X8_T),packedSource128); vst1q_u16((uint16_t *)(dst+6*BYTES_PER_UINT16X8_T),packedSource128); vst1q_u16((uint16_t *)(dst+7*BYTES_PER_UINT16X8_T),packedSource128); dst += BYTES_PER_UINT16X8_T * EIGHT_COLUMNS; } // If we get here, we can assume there are less than 64 pixels to copy. // Copy multiple columns of a vector -- up to four vectors (32 pixels). const int FOUR_COLUMNS = 4; if (count >= PIXELS_IN_VECTOR_COLUMN * FOUR_COLUMNS) { count -= PIXELS_IN_VECTOR_COLUMN * FOUR_COLUMNS; vst1q_u16((uint16_t *)(dst+0*BYTES_PER_UINT16X8_T),packedSource128); vst1q_u16((uint16_t *)(dst+1*BYTES_PER_UINT16X8_T),packedSource128); vst1q_u16((uint16_t *)(dst+2*BYTES_PER_UINT16X8_T),packedSource128); vst1q_u16((uint16_t *)(dst+3*BYTES_PER_UINT16X8_T),packedSource128); dst += BYTES_PER_UINT16X8_T * FOUR_COLUMNS; } // If we get here, we can assume there are less than 32 pixels to copy. // Copy multiple columns of a vector -- up to two vectors (16 pixels). const int TWO_COLUMNS = 2; if (count >= PIXELS_IN_VECTOR_COLUMN * TWO_COLUMNS) { count -= PIXELS_IN_VECTOR_COLUMN * TWO_COLUMNS; vst1q_u16((uint16_t *)(dst+0*BYTES_PER_UINT16X8_T),packedSource128); vst1q_u16((uint16_t *)(dst+1*BYTES_PER_UINT16X8_T),packedSource128); dst += BYTES_PER_UINT16X8_T * TWO_COLUMNS; } // If we get here, we can assume there are less than 16 pixels to copy. // If there is one vector left (with eight pixels), then copy it. const int ONE_COLUMN = 1; if (count >= PIXELS_IN_VECTOR_COLUMN * ONE_COLUMN) { count -= PIXELS_IN_VECTOR_COLUMN * ONE_COLUMN; vst1q_u16((uint16_t *)(dst+0*BYTES_PER_UINT16X8_T),packedSource128); dst += BYTES_PER_UINT16X8_T * ONE_COLUMN; } } break; } // Quickly fill remaining pixels (up to 7). memset16_NeonAlignmentAssumptions_UpTo7Count(dst, src, count); } static inline void memset16_Test(uint16_t *dst, uint16_t src, int count) { // Get the number of pixels that can be packed into the eight lanes of a 16x8 vector of uint16_t words. const int PIXELS_IN_VECTOR_COLUMN = BYTES_PER_UINT16X8_T / BYTES_PER_UINT16_T; // For narrow widths, do an optimized fill for both sides of the rectangle. if (count < PIXELS_IN_VECTOR_COLUMN) { memset16_NoAlignmentAssumptions_UpTo7Count((void*) dst, src, count); } // Otherwise, if the rectangle is not Neon-aligned, first fill the unaligned portion, // then fill the middle using Neon operations and finish the right using non-Neon operations. else { // Compute the misalignment from the optimal copy alignment (assumed to be the size of a Neon vector). 
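// Worked example (illustrative arithmetic, not from the original comments):
// with 16-byte Neon vectors and 2-byte pixels, a dst address ending in 0x06
// gives pixelMisalignment = (0x06 & 15) / 2 = 3 pixels, so 8 - 3 = 5 pixels
// are filled by the scalar helper below before the vector path resumes on a
// 16-byte boundary.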
// (NOTE: It is also assumed that the pointer is already pixel-aligned.)
        const int pixelMisalignment = (((int) dst) & (BYTES_PER_UINT16X8_T - 1)) / BYTES_PER_UINT16_T;
        if (pixelMisalignment != 0) {
            // Compute the number of pixels to fill that would align the rest of the rectangle.
            // NOTE: Since count is guaranteed to be >= PIXELS_IN_VECTOR_COLUMN at this point,
            //       pixelsToCopyForAlignment is guaranteed to be less than count.
            const int pixelsToCopyForAlignment = PIXELS_IN_VECTOR_COLUMN - pixelMisalignment;
            count -= pixelsToCopyForAlignment;

            // Don't assume any pre-existing alignment when filling up to PIXELS_IN_VECTOR_COLUMN - 1 (7 for 16bpp).
            memset16_NoAlignmentAssumptions_UpTo7Count((void *) dst, src, pixelsToCopyForAlignment);
            dst += pixelsToCopyForAlignment;
        }

        // Copy remaining pixels using Neon and non-Neon instructions.
        // NOTE: This assumes that dst is aligned optimally for Neon instructions.
        memset16_AssumesNeonAlignment((void *) dst, src, count);
    }
}

/* Do multiple row fills with a specific memory set function. */
#define DO_MULTIPLE_FILLS_WITH_MEMSET(MEMSET_FUNCTION,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS) \
    do { \
        BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); \
        \
        /* Draw one row at a time, in the most efficient way. */ \
        while (h != 0) { \
            h -= 1; \
            \
            MEMSET_FUNCTION((void *) (dst), src, w); \
            \
            dst += dpitch; \
        } \
        \
        UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); \
        \
    } while (0)

/* Fill a rectangle of 16bpp pixels. */
/* (Pointers are assumed to be half-word-aligned, which should be guaranteed for 16bpp.) */
static inline void
swFillRect16Bpp_Unaligned(unsigned char *dst, uint16_t src, int w, int h, int dpitch, BOOL blockSignalsForVFP)
{
    // Handle single-pixel width columns as a special case.
    // Since this function only requires half-word-alignment, which is guaranteed at this point,
    // it's safe to call now with no further tests.
    if (w == 1) {
        swFill2ByteWideRectangle_HalfWordAligned(dst, src, h, dpitch);
        return;
    }

    if (w < 64) {
        // For narrow rectangles, block signals only once for the entire rectangle.
        BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
        DO_MULTIPLE_FILLS_WITH_MEMSET(memset16_Test,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
        UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
    }
    else {
        // For wider rectangles, block and unblock signals for every row.
        DO_MULTIPLE_FILLS_WITH_MEMSET(memset16_Test,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
    }
}

/* Fill a rectangle of 32bpp pixels. */
/* (Pointers are assumed to be word-aligned, which should be guaranteed for 32bpp.) */
static inline void
swFillRect32Bpp_Unaligned(unsigned char *dst, uint32_t src, int w, int h, int dpitch, BOOL blockSignalsForVFP)
{
    // Handle single-pixel width columns as a special case.
    // Since this function only requires word-alignment, which is guaranteed at this point,
    // it's safe to call now with no further tests.
    if (w == 1) {
        swFill4ByteWideRectangle_WordAligned(dst, src, h, dpitch);
        return;
    }

    if (w < 32) {
        // For narrow rectangles, block signals only once for the entire rectangle.
        BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
        DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
        UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
    }
    else {
        // For wider rectangles, block and unblock signals for every row.
        DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
    }
}

/* Perform a solid fill.
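
   Call sketch (hypothetical values, not from the driver): paint a 100x50
   rectangle of solid red at (10, 10) on a 16bpp surface at the start of
   fbmem whose rows are pScrn->virtualX pixels wide, blocking signals around
   Neon/VFP use:

       swFill(pMsm, 0, pScrn->virtualX, 10, 10, 100, 50, 0xF800, 16, TRUE);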
*/ void swFill(MSMPtr pMsm, int byteOffset, int destSurfaceWidthPixels, int x, int y, int w, int h, uint32_t src, int bitsPerPixel, BOOL blockSignalsForVFP) { int bytesPerPixel = bitsPerPixel / 8; int dpitch = destSurfaceWidthPixels * bytesPerPixel; uint8_t *dst = (uint8_t *)(pMsm->fbmem + byteOffset + y * dpitch + x * bytesPerPixel); // This is a trivial one-pixel copy that avoids most overhead. // (This makes the 1x1 copy case significantly faster and there is reason to believe this is a common case.) if (h == 1 && w == 1) { switch (bitsPerPixel) { case 16: *(uint16_t *)dst = src; break; case 32: *(uint32_t *)dst = src; break; default: break; } return; } // Call BPP-specific code to draw pixels. switch (bitsPerPixel) { case 16: swFillRect16Bpp_Unaligned(dst, src, w, h, dpitch, blockSignalsForVFP); break; case 32: swFillRect32Bpp_Unaligned(dst, src, w, h, dpitch, blockSignalsForVFP); break; default: return; } } xf86-video-msm/src/msm-dri.c0000644000175000017500000001434211615776600015777 0ustar paulliupaulliu/* msm-dri.c * * Copyright (c) 2009, Code Aurora Forum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Code Aurora nor * the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include "xf86.h" #define _XF86DRI_SERVER_ #include "GL/glxtokens.h" #include "sarea.h" #include "msm.h" #define MSM_USE_DBUFFER 1 #define MSM_USE_STENCIL 1 #define MSM_USE_DEPTH 1 static Bool MSMCreateContext(ScreenPtr pScreen, VisualPtr visual, drm_context_t hwContext, void *pVisualConfigPriv, DRIContextType contextStore) { return TRUE; } static void MSMDestroyContext(ScreenPtr pScreen, drm_context_t hwContext, DRIContextType contextStore) { } /* NOTE: This handles 2D vs 3D swapping */ /* This gets called every time SwapBuffers is called on the other side */ static void MSMDRISwapContext(ScreenPtr pScreen, DRISyncType syncType, DRIContextType oldContextType, void *oldContext, DRIContextType newContextType, void *newContext) { } static void MSMDRIInitBuffers(WindowPtr pWin, RegionPtr prgn, CARD32 index) { } static void MSMDRIMoveBuffers(WindowPtr pParent, DDXPointRec ptOldOrg, RegionPtr prgnSrc, CARD32 indx) { } void MSMDRICloseScreen(ScreenPtr pScreen) { ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum]; MSMPtr pMsm = MSMPTR(pScrn); MSMDRIPrivPtr pMSMDRI = (MSMDRIPrivPtr) pMsm->dri->pDRIInfo->devPrivate; if (pMSMDRI) xfree(pMSMDRI); DRICloseScreen(pScreen); DRIDestroyInfoRec(pMsm->dri->pDRIInfo); pMsm->dri->pDRIInfo = NULL; } static Bool MSMDRIDoCloseScreen(int scrnIndex, ScreenPtr pScreen) { ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum]; MSMPtr pMsm = MSMPTR(pScrn); MSMDRICloseScreen(pScreen); pScreen->CloseScreen = pMsm->dri->DRICloseScreen; return (*pScreen->CloseScreen) (scrnIndex, pScreen); } static void MSMDRIClipNotify(ScreenPtr pScreen, WindowPtr * ppWin, int num) { } Bool MSMDRIScreenInit(ScreenPtr pScreen) { ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum]; MSMPtr pMsm = MSMPTR(pScrn); DRIInfoPtr pDRIInfo; MSMDRIPrivPtr pMSMDRI; pMsm->dri->DRICloseScreen = NULL; pDRIInfo = DRICreateInfoRec(); if (pDRIInfo == NULL) return FALSE; pMsm->dri->pDRIInfo = pDRIInfo; pDRIInfo->drmDriverName = "msm_kgsl"; pDRIInfo->clientDriverName = "yamato"; pDRIInfo->busIdString = xalloc(16); strcpy(pDRIInfo->busIdString, "platform:kgsl"); pDRIInfo->ddxDriverMajorVersion = 1; pDRIInfo->ddxDriverMinorVersion = 0; pDRIInfo->ddxDriverPatchVersion = 0; pDRIInfo->frameBufferPhysicalAddress = (void *)pMsm->fixed_info.smem_start; pDRIInfo->frameBufferSize = pMsm->fixed_info.smem_len; pDRIInfo->frameBufferStride = pMsm->fixed_info.line_length; /* FIXME: How many drawables can we do (should we do)? */ pDRIInfo->ddxDrawableTableEntry = 32; pDRIInfo->maxDrawableTableEntry = 32; /* Just allocate the maximum chunk of memory for the DRI * structures */ pDRIInfo->SAREASize = SAREA_MAX; pMSMDRI = (MSMDRIPrivPtr) xcalloc(1, sizeof(*pMSMDRI)); if (pMSMDRI == NULL) { DRIDestroyInfoRec(pMsm->dri->pDRIInfo); pMsm->dri->pDRIInfo = NULL; return FALSE; } pDRIInfo->devPrivate = pMSMDRI; pDRIInfo->devPrivateSize = sizeof(*pMSMDRI); pDRIInfo->contextSize = sizeof(MSMDRIContextRec); pDRIInfo->CreateContext = MSMCreateContext; pDRIInfo->DestroyContext = MSMDestroyContext; pDRIInfo->SwapContext = MSMDRISwapContext; pDRIInfo->InitBuffers = MSMDRIInitBuffers; pDRIInfo->MoveBuffers = MSMDRIMoveBuffers; pDRIInfo->bufferRequests = DRI_ALL_WINDOWS; pDRIInfo->ClipNotify = MSMDRIClipNotify; /* FIXME: What else do I need to define in this structure? 
*/ pDRIInfo->createDummyCtx = TRUE; pDRIInfo->createDummyCtxPriv = FALSE; if (!DRIScreenInit(pScreen, pDRIInfo, &pMsm->dri->drmFD)) { xf86DrvMsg(pScreen->myNum, X_ERROR, "MSM-DRI DRIScreenInit failed. Disabling DRI.\n"); xfree(pDRIInfo->devPrivate); pDRIInfo->devPrivate = NULL; DRIDestroyInfoRec(pDRIInfo); pDRIInfo = NULL; return FALSE; } xf86DrvMsg(pScreen->myNum, X_INFO, "[dri] MSM-DRI is initialized.\n"); return TRUE; } Bool MSMDRIFinishScreenInit(ScreenPtr pScreen) { ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum]; MSMPtr pMsm = MSMPTR(pScrn); MSMDRIPrivPtr pMSMDRI; /*FIXME: WHat is this? */ pMsm->dri->pDRIInfo->driverSwapMethod = DRI_HIDE_X_CONTEXT; if (!DRIFinishScreenInit(pScreen)) { MSMDRICloseScreen(pScreen); return FALSE; } pMSMDRI = (MSMDRIPrivPtr) pMsm->dri->pDRIInfo->devPrivate; pMSMDRI->bpp = pScrn->bitsPerPixel; pMsm->dri->DRICloseScreen = pScreen->CloseScreen; pScreen->CloseScreen = MSMDRIDoCloseScreen; return TRUE; } xf86-video-msm/src/msm-cursor.c0000644000175000017500000001064311615776600016536 0ustar paulliupaulliu/* msm-cursor.c * * Copyright (c) 2009, Code Aurora Forum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Code Aurora nor * the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "xf86.h" #include "xf86Crtc.h" #include #include #include "msm.h" #define MSM_CURSOR_WIDTH 64 #define MSM_CURSOR_HEIGHT 64 #ifdef MSMFB_CURSOR static void _init_cursor(MSMPtr pMsm, struct fb_cursor *cursor) { memset(cursor, 0, sizeof(*cursor)); /* This is a workaround for a buggy kernel */ cursor->image.width = MSM_CURSOR_WIDTH; cursor->image.height = MSM_CURSOR_HEIGHT; cursor->image.depth = 32; cursor->enable = pMsm->HWCursorState; } void MSMSetCursorPosition(MSMPtr pMsm, int x, int y) { struct fb_cursor cursor; _init_cursor(pMsm, &cursor); if (x < 0) x = 0; if (y < 0) y = 0; cursor.set |= FB_CUR_SETPOS; cursor.image.dx = x; cursor.image.dy = y; if (ioctl(pMsm->fd, MSMFB_CURSOR, &cursor)) ErrorF("%s: Error calling MSMBF_CURSOR\n", __FUNCTION__); } void MSMCursorEnable(MSMPtr pMsm, Bool enable) { struct fb_cursor cursor; _init_cursor(pMsm, &cursor); pMsm->HWCursorState = cursor.enable = (enable == TRUE) ? 1 : 0; if (ioctl(pMsm->fd, MSMFB_CURSOR, &cursor)) ErrorF("%s: Error calling MSMBF_CURSOR\n", __FUNCTION__); } void MSMCursorLoadARGB(MSMPtr pMsm, CARD32 * image) { struct fb_cursor cursor; _init_cursor(pMsm, &cursor); cursor.set |= FB_CUR_SETIMAGE; cursor.image.data = (char *)image; /* BLEND_TRANSP_EN off */ cursor.image.bg_color = 0xFFFFFFFF; /* Per pixel alpha on */ cursor.image.fg_color = 0; if (ioctl(pMsm->fd, MSMFB_CURSOR, &cursor)) ErrorF("%s: Error calling MSMBF_CURSOR\n", __FUNCTION__); } Bool MSMCursorInit(ScreenPtr pScreen) { ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum]; MSMPtr pMsm = MSMPTR(pScrn); struct fb_cursor cursor; _init_cursor(pMsm, &cursor); /* Try to turn off the cursor - if this fails then we don't have * HW cursor support */ cursor.enable = 0; if (ioctl(pMsm->fd, MSMFB_CURSOR, &cursor)) { xf86DrvMsg(pScreen->myNum, X_ERROR, "Unable to enable the HW cursor: %s\n", strerror(errno)); return FALSE; } /* HWCursor is on the air, but not visible (yet) */ pMsm->HWCursorState = 0; return xf86_cursors_init(pScreen, MSM_CURSOR_WIDTH, MSM_CURSOR_HEIGHT, HARDWARE_CURSOR_TRUECOLOR_AT_8BPP | HARDWARE_CURSOR_INVERT_MASK | HARDWARE_CURSOR_AND_SOURCE_WITH_MASK | HARDWARE_CURSOR_SOURCE_MASK_INTERLEAVE_32 | HARDWARE_CURSOR_ARGB); } #else /* if MSMFB_CURSOR isn't defined, then this is an older version of the kernel that doesn't support it - so just provide some dummy stuff here */ void MSMCrtcSetCursorPosition(MSMPtr pMsm, int x, int y) { } void MSMCursorEnable(MSMPtr pMsm, Bool enable) { } void MSMCursorLoadARGB(MSMPtr pMsm, CARD32 * image) { } Bool MSMCursorInit(ScreenPtr pScreen) { return FALSE; } #endif xf86-video-msm/src/msm-video.c0000755000175000017500000004655311615776600016343 0ustar paulliupaulliu/* msm-video.c * * Copyright (c) 2009, Code Aurora Forum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Code Aurora nor * the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include "xf86.h" #include "msm.h" #include "xf86xv.h" #include "fourcc.h" #include "msm_fourcc.h" #include "msm-render.h" #include "msm-drm.h" extern struct cmsghdr *cmptr; #define ALIGN(_v, _d) (((_v) + ((_d) - 1)) & ~((_d) - 1)) #define ISPLANAR(_id) ( (_id) != FOURCC_RGB565 \ && (_id) != FOURCC_UYVY \ && (_id) != FOURCC_YUY2 ) #define MAX_STRETCH_FACTOR 4 /* Maximum MDP stretchblt factor */ #define MAX_SHRINK_FACTOR 4 /* Maximum MDP shrinkblt factor */ #define SHRINK_LIMIT 8 /* Minimum shrink size */ #define BITS_TO_BYTES 3 static XF86VideoEncodingRec DummyEncoding[1] = { {0, "XV_IMAGE", 1280, 720, {1, 1}} }; static XF86VideoFormatRec Formats[] = { {8, PseudoColor}, {15, TrueColor}, {16, TrueColor}, {24, TrueColor} }; static XF86ImageRec Images[] = { XVIMAGE_UYVY, /* MDP_YCRYCB_H2V1 */ XVIMAGE_NV12, /* MDP_Y_CRCB_H2V1 */ XVIMAGE_NV21, /* MDP_Y_CBCR_H2V1 */ XVIMAGE_YUY2, XVIMAGE_I420, XVIMAGE_YV12, XVIMAGE_RGB565, }; static XF86AttributeRec Attributes[] = { {XvSettable | XvGettable, 0, 1, "XV_HWCODEC"}, }; static Atom xvHWCodec; typedef struct { int HWCodecFlag; struct msm_offscreen_area *area; int width; int height; } MSMPortPrivRec, *MSMPortPrivPtr; static int MSMGetFormat(int id) { switch (id) { case FOURCC_UYVY: case FOURCC_YUY2: return MDP_YCRYCB_H2V1; case FOURCC_RGB565: return MDP_RGB_565; case FOURCC_NV12: case FOURCC_I420: return MDP_Y_CRCB_H2V2; case FOURCC_YV12: case FOURCC_NV21: return MDP_Y_CBCR_H2V2; } return -1; } static MSMPtr latest_pMsm = NULL; static void copy(unsigned char *dst, unsigned char *src, int width, int height, int stride) { if (latest_pMsm->FastVideoMemCopy) { BOOL blockSignalsForVFP = !(latest_pMsm->NoSigBlock); // The following code assumes that dst and src pointers are not aliased. if ((width % 2) == 0) { // Use the fast software-blit code if possible (16bpp); // it assumes widths in multiples of 16-bit pixels. swBlit_NoOverlap(dst, src, width / 2, height, stride, stride, 16, blockSignalsForVFP); } else { // Otherwise, use 8bpp code. 
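// (An odd byte width cannot be expressed as whole 16-bit pixels, so this
// branch falls back to the 8bpp path; the even-width branch above instead
// treats, for example, a 640-byte row as 320 two-byte pixels.)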
swBlit_NoOverlap(dst, src, width, height, stride, stride, 8, blockSignalsForVFP); } } else { while (height--) { memcpy(dst, src, width); dst += stride; src += stride; } } } static void MSMCopyPseudoPlanar(unsigned char *dst, unsigned char *src, int srcX, int srcY, int srcW, int srcH, int width, int height) { unsigned int yoffset = (srcY * ALIGN(width, 2)) + srcX; unsigned int uvoffset = (height * ALIGN(width, 2)) + yoffset; copy(dst + yoffset, src + yoffset, srcW, srcH, ALIGN(width, 2)); copy(dst + uvoffset, src + uvoffset, srcW, srcH, ALIGN(width, 2)); } static void MSMCopyPacked(unsigned char *dst, unsigned char *src, int srcX, int srcY, int srcW, int srcH, int width, int height) { /* dword align the destination pitch so that it matches * what we do for planar */ unsigned int offset = (srcY * ALIGN(width, 2) * 2) + (srcX * 2); copy(dst + offset, src + offset, srcW * 2, srcH, ALIGN(width, 2) * 2); } static void MSMCopyAndSwapPacked(unsigned char *dst, unsigned char *src, int srcX, int srcY, int srcW, int srcH, int width, int height) { unsigned int offset = (srcY * ALIGN(width,2) << 1) + (srcX << 1); unsigned char *sptr = src + offset; unsigned char *dptr = dst + offset; int x, y; int block = srcX >> 3; for(y = 0; y < srcH; y++) { for(x = 0; x < block; x++) { dptr[(x << 1) + 1] = sptr[(x << 1) + 0]; dptr[(x << 1) + 0] = sptr[(x << 1) + 1]; dptr[(x << 1) + 3] = sptr[(x << 1) + 2]; dptr[(x << 1) + 2] = sptr[(x << 1) + 3]; dptr[(x << 1) + 5] = sptr[(x << 1) + 4]; dptr[(x << 1) + 4] = sptr[(x << 1) + 5]; dptr[(x << 1) + 7] = sptr[(x << 1) + 6]; dptr[(x << 1) + 6] = sptr[(x << 1) + 7]; dptr[(x << 1) + 9] = sptr[(x << 1) + 8]; dptr[(x << 1) + 8] = sptr[(x << 1) + 9]; dptr[(x << 1) + 11] = sptr[(x << 1) + 10]; dptr[(x << 1) + 10] = sptr[(x << 1) + 11]; dptr[(x << 1) + 13] = sptr[(x << 1) + 12]; dptr[(x << 1) + 12] = sptr[(x << 1) + 13]; dptr[(x << 1) + 15] = sptr[(x << 1) + 14]; dptr[(x << 1) + 14] = sptr[(x << 1) + 15]; } for(x = block << 3; x < srcX; x++) { dptr[(x << 1) + 1] = sptr[(x << 1) + 0]; dptr[(x << 1) + 0] = sptr[(x << 1) + 1]; } dptr += ALIGN(width, 2) << 1; sptr += ALIGN(width, 2) << 1; } } static void copy2(unsigned char *dst, unsigned char *usrc, unsigned char *vsrc, int width, int height, int pitch) { int drawh = height >> 1; int draww = width >> 1; while (--drawh) { int w; for (w = 0; w < draww; w++) { dst[(w * 2)] = usrc[w]; dst[(w * 2) + 1] = vsrc[w]; } usrc += (pitch >> 1); vsrc += (pitch >> 1); dst += pitch; } } static void MSMCopyPlanar(unsigned char *dst, unsigned char *src, int id, int srcX, int srcY, int srcW, int srcH, int width, int height) { int pitch = ALIGN(width, 2); unsigned int yoffset = (srcY * pitch) + srcX; unsigned int uvoffset = (height * pitch) + yoffset; unsigned char *usrc = src + (height * pitch); unsigned char *vsrc = usrc + ((height >> 1) * (pitch >> 1)); copy(dst + yoffset, src + yoffset, srcW, srcH, pitch); copy2(dst + uvoffset, usrc, vsrc, srcW, srcH, pitch); } static int MSMDoBlit(MSMPtr pMsm, MSMPortPrivPtr pPriv, int srcFd, int srcOffset, int id, int drawX, int drawY, int drawW, int drawH, int width, int height, RegionPtr clipBoxes, DrawablePtr pDraw) { PixmapPtr pxDst; ScreenPtr pScreen = pDraw->pScreen; BoxPtr pbox = REGION_RECTS(clipBoxes); int nbox = REGION_NUM_RECTS(clipBoxes); int i; hwBlitFlush(pMsm); if (pDraw->type == DRAWABLE_WINDOW) { pxDst = (*pScreen->GetWindowPixmap)((WindowPtr)pDraw); if (!pMsm->useDRI2) exaMoveInPixmap(pxDst); } else { pxDst = (PixmapPtr)pDraw; } for (i = 0; i < nbox; i++) { int sx,sy,sw,sh; int dx,dy,dw,dh; int 
newMaxSrcSize; MSMBlitSurface srcSurface, dstSurface; MSMBlitRect srcRect, dstRect; MSMBlitRec blit; srcSurface.width = ALIGN(width, 2); srcSurface.height = ALIGN(height,2); srcSurface.format = MSMGetFormat(id); srcSurface.pitch = srcSurface.width * 2; if (srcFd > 0) { srcSurface.flags = MSM_BLIT_PMEM; srcSurface.priv[0] = srcFd; srcSurface.priv[1] = srcOffset; } else { if (pPriv->area->type == MSM_OFFSCREEN_GEM) { srcSurface.flags = MSM_BLIT_GEM; srcSurface.priv[0] = (unsigned long) pPriv->area->priv; } else { srcSurface.flags = MSM_BLIT_FB; srcSurface.priv[0] = ((ExaOffscreenArea *) pPriv->area->priv)->offset; } } dstSurface.width = pxDst->drawable.width; dstSurface.height = pxDst->drawable.height; dstSurface.format = MDP_FB_FORMAT; dstSurface.pitch = msm_pixmap_get_pitch(pxDst); if (msm_pixmap_in_gem(pxDst)) { dstSurface.flags = MSM_BLIT_GEM; dstSurface.priv[0] = (unsigned long) msm_get_pixmap_bo(pxDst); } else { dstSurface.flags = MSM_BLIT_FB; dstSurface.priv[0] = exaGetPixmapOffset(pxDst); } dx = pbox->x1 + (pxDst->drawable.x - pxDst->screen_x); dy = pbox->y1 + (pxDst->drawable.y - pxDst->screen_y); sx = pbox->x1 - drawX; sy = pbox->y1 - drawY; sw = dw = pbox->x2 - pbox->x1; sh = dh = pbox->y2 - pbox->y1; if (drawW != width && (sw > 2)) { sx = (sx * width) / drawW; sw = (sw * width) / drawW; } if (drawH != height) { sy = (sy * height) / drawH; sh = (sh * height) / drawH; } if (sx + sw > width) sw = width - sx; if (sy + sh > height) sh = height - sy; if (ISPLANAR(id)) { if (sx & 1) { sx &= ~1; sw++; } if (sy & 1) { sy &= ~1; sh++; } sw &= ~1; sh &= ~1; } /* FIXME: It occurred to me that these could be done as a series of blits, stretching or shrinking as much as possible until the final size is reached. Lots of copying and work, but it'd look nice. */ /* Account for MDP stretch boundaries. Cannot shrink to less than 15-25% reliably */ /* Check for horizontal shrinks */ if ((sw > 0) && (sh > 0)) { if ((sw > dw) && ((sw/dw) >= MAX_SHRINK_FACTOR)) { newMaxSrcSize = dw; if (dw > SHRINK_LIMIT) { newMaxSrcSize = dw * (MAX_SHRINK_FACTOR - 1); } else { /* No shrink at all if width is very small */ newMaxSrcSize = dw; } /* Cut off left and right edges leaving middle */ sx = sx + ((sw - newMaxSrcSize) >> 1); sw = newMaxSrcSize; } /* Check for vertical shrinks */ if ((sh > dh) && ((sh/dh) >= MAX_SHRINK_FACTOR)) { if (dh > SHRINK_LIMIT) { newMaxSrcSize = dh * (MAX_SHRINK_FACTOR - 1); } else { /* No shrink at all if height is very small */ newMaxSrcSize = dh; } /* Cut off top and bottom edges leaving middle */ sy = sy + ((sh - newMaxSrcSize) >> 1); sh = newMaxSrcSize; } /* Clamp out-of-range horizontal stretches */ if ((dw > sw) && ((dw/sw) >= MAX_STRETCH_FACTOR)) { dw = width * MAX_STRETCH_FACTOR; } /* Clamp out-of-range vertical stretches */ if ((dh > sh) && ((dh/sh) >= MAX_STRETCH_FACTOR)) { dh = height * MAX_STRETCH_FACTOR; } } srcRect.x = sx; srcRect.y = sy; srcRect.w = sw; srcRect.h = sh; dstRect.x = dx; dstRect.y = dy; dstRect.w = dw; dstRect.h = dh; pbox++; if (sw < 2 || sh < 2) continue; blit.src = &srcSurface; blit.dst = &dstSurface; blit.srcRect = &srcRect; blit.dstRect = &dstRect; hwBlit(pMsm, &blit, id != FOURCC_RGB565 ? 
MDP_DITHER : 0);
	}

	hwBlitFlush(pMsm);

	/* Update dirty regions for compositor */
	DamageDamageRegion(pDraw, clipBoxes);
	return Success;
}

static int
MSMPutHWCodecImage(ScrnInfoPtr pScrn,
		   short srcX, short srcY, short drawX, short drawY,
		   short srcW, short srcH, short drawW, short drawH,
		   int id, unsigned char *buf, short width, short height,
		   Bool sync, RegionPtr clipBoxes, pointer data, DrawablePtr pDraw)
{
	MSMPtr pMsm = MSMPTR(pScrn);
	MSMPortPrivPtr pPriv = (MSMPortPrivPtr) data;
	unsigned int *udata = (unsigned int *)buf;

	if (drawW == 0 || drawH == 0)
		return Success;

	return MSMDoBlit(pMsm, pPriv, pMsm->pfd, udata[1], id,
			 drawX, drawY, drawW, drawH, width, height, clipBoxes, pDraw);
}

static int
MSMPutImage(ScrnInfoPtr pScrn,
	    short srcX, short srcY, short drawX, short drawY,
	    short srcW, short srcH, short drawW, short drawH,
	    int id, unsigned char *buf, short width, short height,
	    Bool sync, RegionPtr clipBoxes, pointer data, DrawablePtr pDraw)
{
	MSMPtr pMsm = MSMPTR(pScrn);
	MSMPortPrivPtr pPriv = (MSMPortPrivPtr) data;
	int size;

	/* Nothing to do here */
	if (srcW == 0 || srcH == 0 || drawW == 0 || drawH == 0)
		return Success;

	if (pPriv->HWCodecFlag || id == FOURCC_NV21) {
		pPriv->HWCodecFlag = TRUE;
		return MSMPutHWCodecImage(pScrn, srcX, srcY, drawX, drawY,
					  srcW, srcH, drawW, drawH, id, buf,
					  width, height, sync, clipBoxes, data, pDraw);
	}

	if (!pMsm->accel) {
		ErrorF("Cannot do software codecs without EXA support\n");
		return BadAlloc;
	}

	if (ISPLANAR(id))
		size = (ALIGN(width, 2) * ALIGN(height, 2)) * 2;
	else
		size = (ALIGN(width, 2) * 2) * height;

	if (pPriv->area == NULL || width > pPriv->width || height > pPriv->height) {
		if (pPriv->area != NULL)
			msm_free_offscreen_memory(pScrn->pScreen, pPriv->area);

		pPriv->area = msm_alloc_offscreen_memory(pScrn->pScreen, size);

		if (pPriv->area == NULL)
			return BadAlloc;

		pPriv->width = width;
		pPriv->height = height;
	}

	// FIXME: Rather than change a lot of function prototypes,
	// just save a pointer to the MSM driver for the copy functions.
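	// (Sizing note with illustrative numbers: a 320x240 I420 push stages
	// into ALIGN(320,2) * ALIGN(240,2) * 2 = 153600 bytes - enough for
	// the Y plane plus a full-pitch chroma region at half height.)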
	latest_pMsm = pMsm;

	/* Clip against the actual source pixmap width and height - we can't
	   trust that srcW and srcH are correct */
	if (width < srcW)
		srcW = width;
	if (height < srcH)
		srcH = height;

	switch (id) {
	case FOURCC_UYVY:
	case FOURCC_RGB565:
		MSMCopyPacked(pPriv->area->ptr, buf, srcX, srcY, srcW, srcH, width, height);
		break;
	case FOURCC_YUY2:
		MSMCopyAndSwapPacked(pPriv->area->ptr, buf, srcX, srcY, srcW, srcH, width, height);
		break;
	case FOURCC_NV12:
	case FOURCC_NV21:
		MSMCopyPseudoPlanar(pPriv->area->ptr, buf, srcX, srcY, srcW, srcH, width, height);
		break;
	case FOURCC_I420:
	case FOURCC_YV12:
		MSMCopyPlanar(pPriv->area->ptr, buf, id, srcX, srcY, srcW, srcH, width, height);
		break;
	}

	return MSMDoBlit(pMsm, pPriv, -1, 0, id, drawX, drawY, drawW, drawH,
			 width, height, clipBoxes, pDraw);
}

static int
MSMQueryImageAttributes(ScrnInfoPtr pScrn, int id,
			unsigned short *w, unsigned short *h, int *pitches, int *offsets)
{
	int ypitch;

	if (!ISPLANAR(id)) {
		/* Packed */
		if (pitches)
			pitches[0] = ALIGN(*w, 2) * 2;
		if (offsets)
			offsets[0] = 0;

		return ALIGN(*w, 2) * 2 * *h;
	} else if (id == FOURCC_NV12 || id == FOURCC_NV21) {
		/* Pseudo planar */

		/* Adjust the width to be word aligned */
		*w = ALIGN(*w, 2);
		/* The number of lines needs to be even */
		*h = ALIGN(*h, 2);

		/* Figure out the pitches for the segments */
		ypitch = ALIGN(*w, 2);

		if (pitches) {
			pitches[0] = ypitch;
			pitches[1] = ypitch;
		}

		/* Calculate the offsets of the chunks */
		if (offsets) {
			offsets[0] = 0;
			offsets[1] = (*h * ypitch);
		}

		/* Return the total size of the chunk */
		return (*h * ypitch) + (*h * ypitch);
	} else {
		/* True planar */

		/* Adjust the width to be word aligned */
		*w = ALIGN(*w, 2);
		/* The number of lines needs to be even */
		*h = ALIGN(*h, 2);

		/* Figure out the pitches for the segments */
		ypitch = ALIGN(*w, 2);

		if (pitches) {
			pitches[0] = ypitch;
			pitches[1] = ypitch >> 1;
			pitches[2] = ypitch >> 1;
		}

		/* Calculate the offsets of the chunks - the second chroma
		   plane starts after the Y plane plus one half-size chroma
		   plane, not at the chroma-plane size alone */
		if (offsets) {
			offsets[0] = 0;
			offsets[1] = (*h * ypitch);
			offsets[2] = (*h * ypitch) + ((*h >> 1) * (ypitch >> 1));
		}

		return (*h * ypitch) + (((*h >> 1) * (ypitch >> 1)) * 2);
	}
}

static void
MSMQueryBestSize(ScrnInfoPtr pScrn, Bool motion,
		 short vidW, short vidH, short drawW, short drawH,
		 unsigned int *retW, unsigned int *retH, pointer data)
{
	/* Allow any size window */
	*retW = drawW;
	*retH = drawH;
}

static void
MSMStopVideo(ScrnInfoPtr pScrn, pointer data, Bool exit)
{
	MSMPtr pMsm = MSMPTR(pScrn);
	MSMPortPrivPtr pPriv = (MSMPortPrivPtr) data;

	if (!pPriv->HWCodecFlag && pPriv->area != NULL)
		msm_free_offscreen_memory(pScrn->pScreen, pPriv->area);

	if (exit && pPriv->HWCodecFlag && pMsm->pfd > 0) {
		close(pMsm->pfd);
		pMsm->pfd = -1;
		if (cmptr) {
			free(cmptr);
			cmptr = NULL;
		}
		pPriv->HWCodecFlag = FALSE;
	}

	pPriv->area = NULL;
	pPriv->width = 0;
	pPriv->height = 0;
}

static int
MSMSetPortAttribute(ScrnInfoPtr pScrni, Atom attribute, INT32 value, pointer data)
{
	MSMPortPrivPtr pPriv = (MSMPortPrivPtr) data;

	if (attribute == xvHWCodec)
		pPriv->HWCodecFlag = value ?
1 : 0; return Success; } static int MSMGetPortAttribute(ScrnInfoPtr pScrni, Atom attribute, INT32 * value, pointer data) { MSMPortPrivPtr pPriv = (MSMPortPrivPtr) data; if (attribute == xvHWCodec) *value = pPriv->HWCodecFlag; else return BadMatch; return Success; } static XF86VideoAdaptorPtr MSMInitAdaptor(ScreenPtr pScreen) { ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum]; MSMPtr pMsm = MSMPTR(pScrn); XF86VideoAdaptorRec *adapt; MSMPortPrivPtr pPriv; int i; adapt = xcalloc(1, sizeof(XF86VideoAdaptorRec) + (sizeof(DevUnion) * pMsm->xvports) + (sizeof(MSMPortPrivRec) * pMsm->xvports)); if (adapt == NULL) return NULL; adapt->type = XvWindowMask | XvInputMask | XvImageMask; adapt->flags = VIDEO_OVERLAID_IMAGES | VIDEO_CLIP_TO_VIEWPORT; adapt->name = "MSM"; adapt->nEncodings = 1; adapt->pEncodings = DummyEncoding; adapt->nFormats = ARRAY_SIZE(Formats); adapt->pFormats = Formats; adapt->nPorts = pMsm->xvports; /* This is a list of private pointers - located immediately * after the adapt struture */ adapt->pPortPrivates = (DevUnion *) (&adapt[1]); /* The actual set of private sructures begins afer the * DevUnion component */ pPriv = (MSMPortPrivRec *) (&adapt->pPortPrivates[pMsm->xvports]); for (i = 0; i < pMsm->xvports; i++) { adapt->pPortPrivates[i].ptr = (void *)&pPriv[i]; } adapt->nAttributes = ARRAY_SIZE(Attributes); adapt->pAttributes = Attributes; adapt->nImages = ARRAY_SIZE(Images); adapt->pImages = Images; adapt->StopVideo = MSMStopVideo; adapt->SetPortAttribute = MSMSetPortAttribute; adapt->GetPortAttribute = MSMGetPortAttribute; adapt->QueryBestSize = MSMQueryBestSize; adapt->PutImage = MSMPutImage; adapt->QueryImageAttributes = MSMQueryImageAttributes; xvHWCodec = MakeAtom("XV_HWCODEC", sizeof("XV_HWCODEC") - 1, TRUE); return adapt; } void MSMInitVideo(ScreenPtr pScreen) { ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum]; XF86VideoAdaptorPtr adapt = MSMInitAdaptor(pScreen); XF86VideoAdaptorPtr *list, *newlist; int count; if (adapt == NULL) return; count = xf86XVListGenericAdaptors(pScrn, &list); newlist = xalloc((count + 1) * sizeof(XF86VideoAdaptorPtr *)); if (newlist == NULL) return; if (count > 0) memcpy(newlist, list, count * sizeof(XF86VideoAdaptorPtr *)); newlist[count++] = adapt; xf86XVScreenInit(pScreen, newlist, count); xfree(newlist); } xf86-video-msm/src/msm-dri2.c0000644000175000017500000001713511615776600016064 0ustar paulliupaulliu/* msm-dri2.c * * Copyright (c) 2009, Code Aurora Forum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Code Aurora nor * the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NON-INFRINGEMENT ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include "msm.h" #include "xf86drm.h" #include "dri2.h" #include "msm-drm.h" #if DRI2INFOREC_VERSION >= 2 #define USE_DRI2_2 #endif struct msm_dri2_priv { PixmapPtr pixmap; }; #ifndef USE_DRI2_2 static DRI2BufferPtr MSMDRI2CreateBuffers(DrawablePtr drawable, unsigned int *attachments, int count) { ScreenPtr pScreen = drawable->pScreen; DRI2BufferPtr buffers; struct msm_dri2_priv *privates; struct msm_pixmap_priv *pixpriv; PixmapPtr pixmap; int i; buffers = xcalloc(count, sizeof(*buffers)); if (buffers == NULL) return NULL; privates = xcalloc(count, sizeof(struct msm_dri2_priv)); if (privates == NULL) { xfree(buffers); return NULL; } for(i = 0; i < count; i++) { if (attachments[i] == DRI2BufferFrontLeft || attachments[i] == DRI2BufferFakeFrontLeft) { if (drawable->type == DRAWABLE_PIXMAP) pixmap = (PixmapPtr) drawable; else pixmap = pScreen->GetWindowPixmap((WindowPtr) drawable); pixmap->refcnt++; } else { pixmap = (*pScreen->CreatePixmap)(pScreen, drawable->width, drawable->height, drawable->depth, 0); } if (pixmap == NULL) return NULL; pixpriv = exaGetPixmapDriverPrivate(pixmap); if (pixpriv) msm_drm_bo_flink(pixpriv->bo, &buffers[i].name); buffers[i].attachment = attachments[i]; buffers[i].pitch = pixmap->devKind; buffers[i].cpp = pixmap->drawable.bitsPerPixel / 8; buffers[i].attachment = attachments[i]; buffers[i].driverPrivate = &privates[i]; buffers[i].flags = 0; privates[i].pixmap = pixmap; } return buffers; } #else static DRI2Buffer2Ptr MSMDRI2CreateBuffer(DrawablePtr drawable, unsigned int attachment, unsigned int format) { DRI2Buffer2Ptr buffer; struct msm_dri2_priv *private; ScreenPtr pScreen = drawable->pScreen; struct msm_pixmap_priv *pixpriv; PixmapPtr pixmap; buffer = xcalloc(1, sizeof(*buffer)); if (buffer == NULL) return NULL; private = xcalloc(1, sizeof(struct msm_dri2_priv)); if (private == NULL) { xfree(buffer); return NULL; } if (attachment == DRI2BufferFrontLeft || attachment == DRI2BufferFakeFrontLeft) { if (drawable->type == DRAWABLE_PIXMAP) pixmap = (PixmapPtr) drawable; else pixmap = pScreen->GetWindowPixmap((WindowPtr) drawable); pixmap->refcnt++; } else { /* BackLeft and BackRight */ pixmap = (*pScreen->CreatePixmap)(pScreen, drawable->width, drawable->height, (format != 0) ? 
format : drawable->depth, 0); if (pixmap == NULL) return NULL; /* To begin with, put the back buffers into EBI memory - * eventually this needs to shift to KMEM */ pixpriv = exaGetPixmapDriverPrivate(pixmap); if (pixpriv) msm_drm_bo_set_memtype(pixpriv->bo, MSM_DRM_MEMTYPE_EBI); } pixpriv = exaGetPixmapDriverPrivate(pixmap); if (pixpriv) msm_drm_bo_flink(pixpriv->bo, &buffer->name); buffer->pitch = pixmap->devKind; buffer->cpp = pixmap->drawable.bitsPerPixel / 8; buffer->attachment = attachment; buffer->driverPrivate = private; buffer->format = format; buffer->flags = 0; private->pixmap = pixmap; return buffer; } #endif #ifndef USE_DRI2_2 static void MSMDRI2DestroyBuffers(DrawablePtr drawable, DRI2BufferPtr buffers, int count) { ScreenPtr pScreen = drawable->pScreen; struct msm_dri2_priv *priv; int i; if (!buffers || !count) return; for(i = 0; i < count; i++) { priv = buffers[i].driverPrivate; (*pScreen->DestroyPixmap)(priv->pixmap); } xfree(buffers[0].driverPrivate); xfree(buffers); } #else static void MSMDRI2DestroyBuffer(DrawablePtr drawable, DRI2Buffer2Ptr buffers) { ScreenPtr pScreen = drawable->pScreen; struct msm_dri2_priv *priv; if (buffers == NULL) return; priv = buffers->driverPrivate; (*pScreen->DestroyPixmap)(priv->pixmap); xfree(buffers->driverPrivate); xfree(buffers); } #endif static void MSMDRI2CopyRegion(DrawablePtr pDraw, RegionPtr pRegion, DRI2BufferPtr pDstBuffer, DRI2BufferPtr pSrcBuffer) { ScreenPtr pScreen = pDraw->pScreen; struct msm_dri2_priv *srcpriv, *dstpriv; PixmapPtr srcPix, dstPix; RegionPtr copyRegion; GCPtr gc; srcpriv = pSrcBuffer->driverPrivate; dstpriv = pDstBuffer->driverPrivate; if (srcpriv == NULL || dstpriv == NULL) return; srcPix = srcpriv->pixmap; dstPix = dstpriv->pixmap; if (pSrcBuffer->attachment == DRI2BufferFakeFrontLeft || pDstBuffer->attachment == DRI2BufferFakeFrontLeft) return; if (pSrcBuffer->attachment == DRI2BufferFrontLeft) srcPix = (PixmapPtr) pDraw; if (pDstBuffer->attachment == DRI2BufferFrontLeft) dstPix = (PixmapPtr) pDraw; gc = GetScratchGC(pDraw->depth, pScreen); copyRegion = REGION_CREATE(pScreen, NULL, 0); REGION_COPY(pScreen, copyRegion, pRegion); (*gc->funcs->ChangeClip)(gc, CT_REGION, copyRegion, 0); ValidateGC(&dstPix->drawable, gc); (*gc->ops->CopyArea)(&srcPix->drawable, &dstPix->drawable, gc, 0, 0, pDraw->width, pDraw->height, 0, 0); FreeScratchGC(gc); } Bool MSMDRI2ScreenInit(ScreenPtr pScreen) { DRI2InfoRec info; ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum]; MSMPtr pMsm = MSMPTR(pScrn); if (pMsm->drmFD <= 0) { ErrorF("DRI2: DRM is not initialized\n"); return FALSE; } info.driverName = "yamato"; info.deviceName = pMsm->drmDevName; info.fd = pMsm->drmFD; #ifndef USE_DRI2_2 info.version = 1; info.CreateBuffers = MSMDRI2CreateBuffers; info.DestroyBuffers = MSMDRI2DestroyBuffers; #else info.version = DRI2INFOREC_VERSION; info.CreateBuffer = MSMDRI2CreateBuffer; info.DestroyBuffer = MSMDRI2DestroyBuffer; #endif info.CopyRegion = MSMDRI2CopyRegion; return DRI2ScreenInit(pScreen, &info); } void MSMDRI2ScreenClose(ScreenPtr pScreen) { DRI2CloseScreen(pScreen); } xf86-video-msm/src/msm.h0000755000175000017500000001135611615776600015235 0ustar paulliupaulliu/* msm.h * * Copyright (c) 2009, Code Aurora Forum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Code Aurora nor * the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _MSM_H_ #define _MSM_H_ #include "xf86.h" #include "damage.h" #include "exa.h" #if USEDRI #define _XF86DRI_SERVER_ #include "dri.h" #include "GL/glxint.h" #endif #include #include #include #define ARRAY_SIZE(a) (sizeof((a)) / (sizeof(*(a)))) typedef enum { MSM_TYPE_7201, MSM_TYPE_7X25, MSM_TYPE_8X50, } MSMChipType; #if USEDRI typedef struct msm_dri { DRIInfoPtr pDRIInfo; int drmFD; int depthBits; int numVisualConfigs; __GLXvisualConfig *pVisualConfigs; Bool(*DRICloseScreen) (int, ScreenPtr); } MSMDRIRec, *MSMDRIPtr; typedef struct { int dummy; } MSMDRIContextRec, *MSMDRIContextPtr; typedef struct { int bpp; } MSMDRIPrivRec, *MSMDRIPrivPtr; #endif typedef struct _MSMRec { /* File descriptor for the framebuffer device */ int fd; /* Fixed and var strutures from the framebuffer */ struct fb_fix_screeninfo fixed_info; struct fb_var_screeninfo mode_info; /* Pointer to the mapped framebuffer memory */ void *fbmem; /* Processor identifier */ MSMChipType chipID; /* Default mode for X */ DisplayModeRec default_mode; /* EXA driver structure */ ExaDriverPtr pExa; /* Place holder for the standard close screen function */ CloseScreenProcPtr CloseScreen; /* The blit list structure used by msm_fb */ struct mdp_blit_req_list *blitList; /* The number of xv ports to support */ int xvports; /* A flag that indicates if we should use EXA or not */ Bool accel; Bool useSWBlit; Bool useDRI; Bool useDRI2; #if USEDRI Bool DRIEnabled; MSMDRIPtr dri; #endif Bool HWCursor; int HWCursorState; int defaultVsync; Bool NoSigBlock; Bool FastFill; Bool FastComposite; Bool FastCompositeRepeat; Bool FastVideoMemCopy; Bool FastAppFBMemCopy; int FBCache; int pfd; gid_t socketGID; #if USEDRI2 int drmFD; char drmDevName[64]; #endif int pixmapMemtype; } MSMRec, *MSMPtr; struct msm_pixmap_priv { struct msm_drm_bo *bo; int SavedPitch; }; /* Macro to get the private record from the ScreenInfo structure */ #define MSMPTR(p) ((MSMPtr) ((p)->driverPrivate)) Bool MSMSetupExa(ScreenPtr); Bool MSMDRIScreenInit(ScreenPtr pScreen); Bool MSMDRIFinishScreenInit(ScreenPtr pScreen); Bool MSMDRI2ScreenInit(ScreenPtr pScreen); void MSMDRI2ScreenClose(ScreenPtr pScreen); void MSMInitVideo(ScreenPtr pScreen); void MSMSetCursorPosition(MSMPtr pMsm, int x, int y); void MSMCursorEnable(MSMPtr pMsm, Bool enable); void MSMCursorLoadARGB(MSMPtr pMsm, CARD32 * image); Bool 
MSMCursorInit(ScreenPtr pScreen); void MSMOutputSetup(ScrnInfoPtr pScrn); void MSMCrtcSetup(ScrnInfoPtr pScrn); void MSMBinderInit(MSMPtr pMsm); #define MSM_OFFSCREEN_GEM 0x01 #define MSM_OFFSCREEN_EXA 0x02 struct msm_offscreen_area { void *priv; unsigned char *ptr; int type; }; int msm_pixmap_offset(PixmapPtr); int msm_pixmap_get_pitch(PixmapPtr pix); Bool msm_pixmap_in_gem(PixmapPtr); struct msm_drm_bo *msm_get_pixmap_bo(PixmapPtr); struct msm_offscreen_area *msm_alloc_offscreen_memory(ScreenPtr pScreen, int size); void msm_free_offscreen_memory(ScreenPtr pScreen, struct msm_offscreen_area *area); #endif xf86-video-msm/src/msm-output.c0000644000175000017500000000747211615776600016567 0ustar paulliupaulliu/* msm-output.c * * Copyright (c) 2009, Code Aurora Forum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Code Aurora nor * the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "xf86.h" #include "xf86i2c.h" #include "xf86Crtc.h" #include "msm.h" static void MSMOutputCreateResources(xf86OutputPtr output) { /* No custom properties are supported */ } static Bool MSMOutputSetProperty(xf86OutputPtr output, Atom property, RRPropertyValuePtr value) { /* No custom properties are supported */ return TRUE; } static void MSMOutputDPMS(xf86OutputPtr output, int mode) { /* DPMS is handled at the CRTC */ } static void MSMOutputPrepare(xf86OutputPtr output) { } static void MSMOutputCommit(xf86OutputPtr output) { } static void MSMOutputSave(xf86OutputPtr output) { } static void MSMOutputRestore(xf86OutputPtr output) { } static int MSMOutputModeValid(xf86OutputPtr output, DisplayModePtr pMode) { return MODE_OK; } static Bool MSMOutputModeFixup(xf86OutputPtr output, DisplayModePtr mode, DisplayModePtr adjmode) { return TRUE; } static void MSMOutputModeSet(xf86OutputPtr output, DisplayModePtr mode, DisplayModePtr adjmode) { /* Nothing to do on the output side */ } static xf86OutputStatus MSMOutputDetect(xf86OutputPtr output) { return XF86OutputStatusConnected; } static DisplayModePtr MSMOutputGetModes(xf86OutputPtr output) { ScrnInfoPtr pScrn = output->scrn; MSMPtr pMsm = MSMPTR(pScrn); DisplayModePtr modes; modes = xf86DuplicateMode(&pMsm->default_mode); return modes; } static void MSMOutputDestroy(xf86OutputPtr output) { } static const xf86OutputFuncsRec MSMOutputFuncs = { .create_resources = MSMOutputCreateResources, .dpms = MSMOutputDPMS, .save = MSMOutputSave, .restore = MSMOutputRestore, .mode_valid = MSMOutputModeValid, .mode_fixup = MSMOutputModeFixup, .prepare = MSMOutputPrepare, .mode_set = MSMOutputModeSet, .commit = MSMOutputCommit, .detect = MSMOutputDetect, .get_modes = MSMOutputGetModes, .set_property = MSMOutputSetProperty, .destroy = MSMOutputDestroy }; void MSMOutputSetup(ScrnInfoPtr pScrn) { xf86OutputPtr output; output = xf86OutputCreate(pScrn, &MSMOutputFuncs, "default"); output->interlaceAllowed = FALSE; output->doubleScanAllowed = FALSE; /* FIXME: Set monitor size here? */ output->possible_crtcs = 1; } xf86-video-msm/src/msm-swblits.h0000755000175000017500000001332211615776600016715 0ustar paulliupaulliu/* msm-blits.h * * Copyright (c) 2009, Code Aurora Forum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Code Aurora nor * the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NON-INFRINGEMENT ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _MSM_BLITS_H_ #define _MSM_BLITS_H_ /* Interface to MSM driver. */ #include "xf86.h" #include "msm.h" /* Use this include to get access to integer definitions for specific integral data types optimized for speed or size. */ #include #include /* Neon intrinsics are part of the ARM or GCC compiler used. */ /* Tested with: /pkg/asw/compilers/gnu/codesourcery/arm-2008q3-72/lib/gcc/arm-none-linux-gnueabi/4.3.2/include/arm_neon.h */ #include /* These are NEON-optimized functions linked to by various tests. */ extern void * neon_memcpy (void * dest, const void * source, unsigned int numBytes); extern void * neon_memmove (void * dest, const void * source, unsigned int numBytes); extern void memset16(uint16_t *dst, uint16_t value, int count); extern void memset32(uint32_t *dst, uint32_t value, int count); /* Make definitions to clarify memory-related sizes to enable avoidance of magic numbers. */ #define BITS_PER_BYTE (8) #define BYTES_PER_16BPP_PIXEL (2) #define BYTES_PER_24BPP_PIXEL (3) #define BYTES_PER_32BPP_PIXEL (4) #define BYTES_PER_UINT16X8_T (16) #define BYTES_PER_UINT32X4_T (16) #define BYTES_PER_UINT64_T (8) #define BYTES_PER_UINT32_T (4) #define BYTES_PER_UINT16_T (2) #define BYTES_PER_UINT8_T (1) #define BYTE_ALIGNMENT_BYTE_SIZE (1) #define HALF_WORD_ALIGNMENT_BYTE_SIZE (2) #define WORD_ALIGNMENT_BYTE_SIZE (4) #define DOUBLE_WORD_ALIGNMENT_BYTE_SIZE (8) #define QUAD_WORD_ALIGNMENT_BYTE_SIZE (16) /* Function declarations from msm-swaligned.c. */ void swCopy1ByteWideRectangle_UnAligned(uint8_t *dst, uint8_t *src, int h, int dpitch, int spitch); void swCopy2ByteWideRectangle_HalfWordAligned(uint8_t *dst, uint8_t *src, int h, int dpitch, int spitch); void swCopy4ByteWideRectangle_WordAligned(uint8_t *dst, uint8_t *src, int h, int dpitch, int spitch); void swCopy8ByteWideRectangle_DoubleWordAligned(uint8_t *dst, uint8_t *src, int h, int dpitch, int spitch); void swFill1ByteWideRectangle_UnAligned(uint8_t *dst, uint8_t src, int h, int dpitch); void swFill2ByteWideRectangle_HalfWordAligned(uint8_t *dst, uint16_t src, int h, int dpitch); void swFill4ByteWideRectangle_WordAligned(uint8_t *dst, uint32_t src, int h, int dpitch); // Should only be needed for working around a kernel issue that fails to save/restore Neon registers for userspace signal handlers. #define MASK_SIGNALS (TRUE) #if (MASK_SIGNALS) #include "signal.h" // Needed only for masking signals. #endif // (MASK_SIGNALS) /* USAGE NOTES FOR SIGNAL BLOCKING MACROS: */ /* (1) The block and unblock macros must surround all Neon/FP code */ /* and need to be called from the same scope because the macros share state (oldMaskSet). */ /* (2) Blocking signals stops the mouse cursor from being updated, so don't */ /* bracket sections of code that take a noticeable amount of time to execute. */ /* Block signals from being handled before any operations that use VFP/Neon registers. 
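
   A minimal usage sketch (illustrative only: the wrapper below is
   hypothetical, and it assumes the Bool blockSignalsForVFP flag tested by
   the macros is visible in the calling file):

       static void copyWithNeon(uint8_t *dst, uint8_t *src, unsigned int bytes)
       {
           BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
           neon_memcpy(dst, src, bytes);
           UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
       }

   Both macros must expand in the same scope, because the block macro
   declares the oldMaskSet variable that the unblock macro reads back.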
*/ #if (MASK_SIGNALS) #define BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS() \ /* This is temporary code to disable signals that are corrupting Neon registers because of a kernel issue. */ \ /* The kernel issue causes Neon/VFP registers to not be saved or restored correctly in userspace signal handlers. */ \ sigset_t oldMaskSet; \ if (blockSignalsForVFP) { \ sigset_t allSignalsMaskSet; \ sigfillset(&allSignalsMaskSet); \ sigprocmask(SIG_BLOCK, &allSignalsMaskSet, &oldMaskSet); \ } #else #define BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS() #endif // (MASK_SIGNALS) /* Unblock signals from being handled after any operations that use VFP/Neon registers. */ #if (MASK_SIGNALS) #define UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS() \ if (blockSignalsForVFP) { \ /* Re-enable signals that are corrupting Neon registers. */ \ sigprocmask(SIG_SETMASK, &oldMaskSet, NULL); \ } #else #define UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS() #endif // (MASK_SIGNALS) /* Do no signal blocking. */ #define SIGNAL_BLOCK_NOOP() #endif // _MSM_BLITS_H_ xf86-video-msm/src/neon_memsets.c0000755000175000017500000002604111615776600017125 0ustar paulliupaulliu/* neon_memsets.c * * Copyright (c) 2009, Code Aurora Forum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Code Aurora nor * the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "msm-swblits.h" void memset16(uint16_t dst[], uint16_t value, int count) { if (count <= 0) return; asm volatile( " pld [%[dst], #0] \n" " cmp %[count], #4 \n" " blt 6f \n" " tst %[dst], #0x3 \n" " strneh %[value], [%[dst]], #2 \n" " subne %[count], %[count], #1 \n" " vdup.u16 q8, %[value] \n" " vmov q9, q8 \n" " cmp %[count], #64 \n" " bge 0f \n" " cmp %[count], #32 \n" " bge 2f \n" " cmp %[count], #16 \n" " bge 3f \n" " cmp %[count], #8 \n" " bge 4f \n" " b 5f \n" "0: \n" " mov r12, %[count], lsr #6 \n" "1: \n" " vst1.16 {q8, q9}, [%[dst]]! \n" " vst1.16 {q8, q9}, [%[dst]]! \n" " vst1.16 {q8, q9}, [%[dst]]! \n" " vst1.16 {q8, q9}, [%[dst]]! \n" " subs r12, r12, #1 \n" " bne 1b \n" " ands %[count], %[count], #0x3f \n" " beq 7f \n" "2: \n" " cmp %[count], #32 \n" " blt 3f \n" " vst1.16 {q8, q9}, [%[dst]]! \n" " vst1.16 {q8, q9}, [%[dst]]! 
\n" " subs %[count], %[count], #32 \n" " beq 7f \n" "3: \n" " cmp %[count], #16 \n" " blt 4f \n" " vst1.16 {q8, q9}, [%[dst]]! \n" " subs %[count], %[count], #16 \n" " beq 7f \n" "4: \n" " cmp %[count], #8 \n" " blt 5f \n" " vst1.16 {q8}, [%[dst]]! \n" " subs %[count], %[count], #8 \n" " beq 7f \n" "5: \n" " cmp %[count], #4 \n" " blt 6f \n" " vst1.16 {d16}, [%[dst]]! \n" " subs %[count], %[count], #4 \n" " beq 7f \n" "6: \n" " cmp %[count], #0 \n" " blt 7f \n" " lsls %[count], #31 \n" " strmih %[value], [%[dst]], #2 \n" " strcsh %[value], [%[dst]], #2 \n" " strcsh %[value], [%[dst]], #2 \n" "7: \n" // Clobbered input registers : [dst] "+r" (dst), [count] "+r" (count) // Unclobbered input : [value] "r" (value) // Clobbered registers : "q8", "q9", "r12", "cc", "memory" ); } void memset32(uint32_t dst[], uint32_t value, int count) { asm volatile( " pld [%[dst], #0] \n" " cmp %[count], #4 \n" " blt 5f \n" " vdup.u32 q8, %[value] \n" " vmov q9, q8 \n" " cmp %[count], #32 \n" " bge 0f \n" " cmp %[count], #16 \n" " bge 2f \n" " cmp %[count], #8 \n" " bge 3f \n" " b 4f \n" "0: \n" " mov r12, %[count], lsr #5 \n" "1: \n" " vst1.32 {q8, q9}, [%[dst]]! \n" " vst1.32 {q8, q9}, [%[dst]]! \n" " vst1.32 {q8, q9}, [%[dst]]! \n" " vst1.32 {q8, q9}, [%[dst]]! \n" " pld [%[dst], #0] \n" " subs r12, r12, #1 \n" " bne 1b \n" " ands %[count], %[count], #0x1f \n" " beq 6f \n" "2: \n" " cmp %[count], #16 \n" " blt 3f \n" " vst1.32 {q8, q9}, [%[dst]]! \n" " vst1.32 {q8, q9}, [%[dst]]! \n" " subs %[count], %[count], #16 \n" " beq 6f \n" "3: \n" " cmp %[count], #8 \n" " blt 4f \n" " vst1.32 {q8, q9}, [%[dst]]! \n" " subs %[count], %[count], #8 \n" " beq 6f \n" "4: \n" " cmp %[count], #4 \n" " blt 5f \n" " vst1.32 {q8}, [%[dst]]! \n" " subs %[count], %[count], #4 \n" " beq 6f \n" "5: \n" " cmp %[count], #0 \n" " beq 6f \n" " lsls %[count], #31 \n" " strmi %[value], [%[dst]], #4 \n" " strcs %[value], [%[dst]], #4 \n" " strcs %[value], [%[dst]], #4 \n" "6: @end \n" // Clobbered input registers : [dst] "+r" (dst), [count] "+r" (count) // Unclobbered input : [value] "r" (value) // Clobbered registers : "q8", "q9", "r12", "cc", "memory" ); } xf86-video-msm/src/msm-pixmap.c0000644000175000017500000001056411615776600016521 0ustar paulliupaulliu/* msm-pixmap.c * * Copyright (c) 2009, Code Aurora Forum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Code Aurora nor * the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NON-INFRINGEMENT ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "msm.h"
#include "msm-drm.h"

#define MSMPTR_FROM_PIXMAP(_x) \
    MSMPTR(xf86Screens[(_x)->drawable.pScreen->myNum])

int msm_pixmap_offset(PixmapPtr pixmap)
{
    struct msm_pixmap_priv *priv;

    priv = exaGetPixmapDriverPrivate(pixmap);

    if (priv && priv->bo)
        return 0;

    return exaGetPixmapOffset(pixmap);
}

int msm_pixmap_get_pitch(PixmapPtr pix)
{
    struct msm_pixmap_priv *priv = exaGetPixmapDriverPrivate(pix);

    if (priv && priv->bo) {
        /* GEM-backed pixmaps are padded out to a 32-pixel-aligned width */
        return ((pix->drawable.width + 31) & ~31) *
            (pix->drawable.bitsPerPixel >> 3);
    }

    return exaGetPixmapPitch(pix);
}

Bool msm_pixmap_in_gem(PixmapPtr pix)
{
    MSMPtr pMsm = MSMPTR_FROM_PIXMAP(pix);
    struct msm_pixmap_priv *priv = exaGetPixmapDriverPrivate(pix);

#if USEDRI2
    if (pMsm->useDRI2 && priv && priv->bo)
        return TRUE;
#endif

    return FALSE;
}

struct msm_drm_bo *
msm_get_pixmap_bo(PixmapPtr pix)
{
    MSMPtr pMsm = MSMPTR_FROM_PIXMAP(pix);
    struct msm_pixmap_priv *priv = exaGetPixmapDriverPrivate(pix);

#if USEDRI2
    if (pMsm->useDRI2 && priv && priv->bo) {
        /* Make sure the buffer is allocated whenever this function is
           called - if this function is ever used outside of EXA, that
           could cause problems */
        msm_drm_bo_alloc(priv->bo);
        return priv->bo;
    }
#endif

    return NULL;
}

/* For now, all offscreen memory lives in EBI, since the only consumer
 * is the MDP in the Xvideo code; when another consumer appears, make
 * sure to allow the type to be specified */

struct msm_offscreen_area *
msm_alloc_offscreen_memory(ScreenPtr pScreen, int size)
{
    ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
    MSMPtr pMsm = MSMPTR(pScrn);
    struct msm_offscreen_area *ret =
        xcalloc(1, sizeof(struct msm_offscreen_area));

    if (ret == NULL)
        return NULL;

#if USEDRI2
    if (pMsm->useDRI2) {
        struct msm_drm_bo *bo =
            msm_drm_bo_create_memtype(pMsm->drmFD, size, MSM_DRM_MEMTYPE_EBI);

        if (bo == NULL)
            goto err;

        if (msm_drm_bo_map(bo)) {
            msm_drm_bo_free(bo);
            goto err;
        }

        ret->priv = (void *) bo;
        ret->ptr = bo->virt;
        ret->type = MSM_OFFSCREEN_GEM;
        return ret;
    }
#endif

    if (pMsm->accel) {
        ExaOffscreenArea *area =
            exaOffscreenAlloc(pScreen, size, 4, TRUE, NULL, NULL);

        if (area == NULL)
            goto err;

        ret->priv = (void *) area;
        ret->ptr = pMsm->fbmem + area->offset;
        ret->type = MSM_OFFSCREEN_EXA;
        return ret;
    }

err:
    xfree(ret);
    return NULL;
}

void msm_free_offscreen_memory(ScreenPtr pScreen,
                               struct msm_offscreen_area *area)
{
    if (area == NULL)
        return;

    if (area->type == MSM_OFFSCREEN_GEM)
        msm_drm_bo_free((struct msm_drm_bo *) area->priv);
    else if (area->type == MSM_OFFSCREEN_EXA)
        exaOffscreenFree(pScreen, (ExaOffscreenArea *) area->priv);

    xfree(area);
}
xf86-video-msm/src/msm-swalignedcopy.c0000755000175000017500000003454311615776600020077 0ustar paulliupaulliu/* msm-swalignedcopy.c * * Copyright (c) 2009, Code Aurora Forum. All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Code Aurora nor * the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "xf86.h" #include "msm.h" #include "msm-swblits.h" /* Copy a rectangle that is 1 byte wide, using unrolled loops to make the function as fast as possible. */ /* This function may use unaligned pointers at no penalty. */ void swCopy1ByteWideRectangle_UnAligned(uint8_t *dst, uint8_t *src, int h, int dpitch, int spitch) { const int EIGHT_ROWS = 8; while (h>=EIGHT_ROWS) { h -= EIGHT_ROWS; uint8_t src1 = *(src+0*spitch); uint8_t src2 = *(src+1*spitch); uint8_t src3 = *(src+2*spitch); uint8_t src4 = *(src+3*spitch); uint8_t src5 = *(src+4*spitch); uint8_t src6 = *(src+5*spitch); uint8_t src7 = *(src+6*spitch); uint8_t src8 = *(src+7*spitch); src += EIGHT_ROWS * spitch; *(uint8_t *)(dst+0*dpitch) = src1; *(uint8_t *)(dst+1*dpitch) = src2; *(uint8_t *)(dst+2*dpitch) = src3; *(uint8_t *)(dst+3*dpitch) = src4; *(uint8_t *)(dst+4*dpitch) = src5; *(uint8_t *)(dst+5*dpitch) = src6; *(uint8_t *)(dst+6*dpitch) = src7; *(uint8_t *)(dst+7*dpitch) = src8; dst += EIGHT_ROWS * dpitch; } const int FOUR_ROWS = 4; while (h>=FOUR_ROWS) { h -= FOUR_ROWS; uint8_t src1 = *(src+0*spitch); uint8_t src2 = *(src+1*spitch); uint8_t src3 = *(src+2*spitch); uint8_t src4 = *(src+3*spitch); src += FOUR_ROWS * spitch; *(uint8_t *)(dst+0*dpitch) = src1; *(uint8_t *)(dst+1*dpitch) = src2; *(uint8_t *)(dst+2*dpitch) = src3; *(uint8_t *)(dst+3*dpitch) = src4; dst += FOUR_ROWS * dpitch; } const int TWO_ROWS = 2; while (h>=TWO_ROWS) { h -= TWO_ROWS; uint8_t src1 = *(src+0*spitch); uint8_t src2 = *(src+1*spitch); src += TWO_ROWS * spitch; *(uint8_t *)(dst+0*dpitch) = src1; *(uint8_t *)(dst+1*dpitch) = src2; dst += TWO_ROWS * dpitch; } const int ONE_ROW = 1; while (h > 0) { h -= ONE_ROW; uint8_t src1 = *(src+0*spitch); src += spitch; *(uint8_t *)(dst+0*dpitch) = src1; dst += dpitch; } } /* Copy a rectangle that is 2 bytes wide, using unrolled loops to make the function as fast as possible. */ /* This function requires the pointers to be half-word-aligned (even addresses). 
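
   Keeping the pitches half-word-aligned as well guarantees that every row,
   not just the first, stays aligned, since each row address is the base
   pointer plus a multiple of the pitch. Each row is then moved as a single
   uint16_t load/store, with the same 8/4/2/1-row unrolling as the 1-byte
   case so several independent loads are issued before the first store.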
*/ void swCopy2ByteWideRectangle_HalfWordAligned(uint8_t *dst, uint8_t *src, int h, int dpitch, int spitch) { const int EIGHT_ROWS = 8; while (h>=EIGHT_ROWS) { h -= EIGHT_ROWS; uint16_t src1 = *(uint16_t *)(src+0*spitch); uint16_t src2 = *(uint16_t *)(src+1*spitch); uint16_t src3 = *(uint16_t *)(src+2*spitch); uint16_t src4 = *(uint16_t *)(src+3*spitch); uint16_t src5 = *(uint16_t *)(src+4*spitch); uint16_t src6 = *(uint16_t *)(src+5*spitch); uint16_t src7 = *(uint16_t *)(src+6*spitch); uint16_t src8 = *(uint16_t *)(src+7*spitch); src += EIGHT_ROWS * spitch; *(uint16_t *)(dst+0*dpitch) = src1; *(uint16_t *)(dst+1*dpitch) = src2; *(uint16_t *)(dst+2*dpitch) = src3; *(uint16_t *)(dst+3*dpitch) = src4; *(uint16_t *)(dst+4*dpitch) = src5; *(uint16_t *)(dst+5*dpitch) = src6; *(uint16_t *)(dst+6*dpitch) = src7; *(uint16_t *)(dst+7*dpitch) = src8; dst += EIGHT_ROWS * dpitch; } const int FOUR_ROWS = 4; while (h>=FOUR_ROWS) { h -= FOUR_ROWS; uint16_t src1 = *(uint16_t *)(src+0*spitch); uint16_t src2 = *(uint16_t *)(src+1*spitch); uint16_t src3 = *(uint16_t *)(src+2*spitch); uint16_t src4 = *(uint16_t *)(src+3*spitch); src += FOUR_ROWS * spitch; *(uint16_t *)(dst+0*dpitch) = src1; *(uint16_t *)(dst+1*dpitch) = src2; *(uint16_t *)(dst+2*dpitch) = src3; *(uint16_t *)(dst+3*dpitch) = src4; dst += FOUR_ROWS * dpitch; } const int TWO_ROWS = 2; while (h>=TWO_ROWS) { h -= TWO_ROWS; uint16_t src1 = *(uint16_t *)(src+0*spitch); uint16_t src2 = *(uint16_t *)(src+1*spitch); src += TWO_ROWS * spitch; *(uint16_t *)(dst+0*dpitch) = src1; *(uint16_t *)(dst+1*dpitch) = src2; dst += TWO_ROWS * dpitch; } const int ONE_ROW = 1; while (h > 0) { h -= ONE_ROW; uint16_t src1 = *(uint16_t *)(src+0*spitch); src += spitch; *(uint16_t *)(dst+0*dpitch) = src1; dst += dpitch; } } /* Copy a rectangle that is 4 bytes wide, using unrolled loops to make the function as fast as possible. */ /* This function requires the pointers to be word-aligned (divisible by 4). 
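
   The same pitch condition applies here at word (4-byte) granularity:
   dst, src, dpitch, and spitch should all be multiples of 4 so that each
   row can be moved as one aligned uint32_t access.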
*/ void swCopy4ByteWideRectangle_WordAligned(uint8_t *dst, uint8_t *src, int h, int dpitch, int spitch) { const int EIGHT_ROWS = 8; while (h>=EIGHT_ROWS) { h -= EIGHT_ROWS; uint32_t src1 = *(uint32_t *)(src+0*spitch); uint32_t src2 = *(uint32_t *)(src+1*spitch); uint32_t src3 = *(uint32_t *)(src+2*spitch); uint32_t src4 = *(uint32_t *)(src+3*spitch); uint32_t src5 = *(uint32_t *)(src+4*spitch); uint32_t src6 = *(uint32_t *)(src+5*spitch); uint32_t src7 = *(uint32_t *)(src+6*spitch); uint32_t src8 = *(uint32_t *)(src+7*spitch); src += EIGHT_ROWS * spitch; *(uint32_t *)(dst+0*dpitch) = src1; *(uint32_t *)(dst+1*dpitch) = src2; *(uint32_t *)(dst+2*dpitch) = src3; *(uint32_t *)(dst+3*dpitch) = src4; *(uint32_t *)(dst+4*dpitch) = src5; *(uint32_t *)(dst+5*dpitch) = src6; *(uint32_t *)(dst+6*dpitch) = src7; *(uint32_t *)(dst+7*dpitch) = src8; dst += EIGHT_ROWS * dpitch; } const int FOUR_ROWS = 4; while (h>=FOUR_ROWS) { h -= FOUR_ROWS; uint32_t src1 = *(uint32_t *)(src+0*spitch); uint32_t src2 = *(uint32_t *)(src+1*spitch); uint32_t src3 = *(uint32_t *)(src+2*spitch); uint32_t src4 = *(uint32_t *)(src+3*spitch); src += FOUR_ROWS * spitch; *(uint32_t *)(dst+0*dpitch) = src1; *(uint32_t *)(dst+1*dpitch) = src2; *(uint32_t *)(dst+2*dpitch) = src3; *(uint32_t *)(dst+3*dpitch) = src4; dst += FOUR_ROWS * dpitch; } const int TWO_ROWS = 2; while (h>=TWO_ROWS) { h -= TWO_ROWS; uint32_t src1 = *(uint32_t *)(src+0*spitch); uint32_t src2 = *(uint32_t *)(src+1*spitch); src += TWO_ROWS * spitch; *(uint32_t *)(dst+0*dpitch) = src1; *(uint32_t *)(dst+1*dpitch) = src2; dst += TWO_ROWS * dpitch; } const int ONE_ROW = 1; while (h > 0) { h -= ONE_ROW; uint32_t src1 = *(uint32_t *)(src+0*spitch); src += spitch; *(uint32_t *)(dst+0*dpitch) = src1; dst += dpitch; } } /* Copy a rectangle that is 8 bytes wide, using unrolled loops to make the function as fast as possible. */ /* This function requires the pointers to be double-word-aligned (divisible by 8). 
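
   A caller would normally pick the widest variant that the rectangle width
   and the alignment allow. A sketch of such a dispatcher follows (the
   helper name and its return-FALSE-to-fall-back contract are illustrative,
   not existing driver API):

       static Bool swCopyNarrowRect(uint8_t *dst, uint8_t *src, int w, int h,
                                    int dpitch, int spitch)
       {
           uintptr_t a = (uintptr_t)dst | (uintptr_t)src |
                         (uintptr_t)dpitch | (uintptr_t)spitch;

           if (w == 1)
               swCopy1ByteWideRectangle_UnAligned(dst, src, h, dpitch, spitch);
           else if (w == 2 && !(a & 1))
               swCopy2ByteWideRectangle_HalfWordAligned(dst, src, h, dpitch, spitch);
           else if (w == 4 && !(a & 3))
               swCopy4ByteWideRectangle_WordAligned(dst, src, h, dpitch, spitch);
           else if (w == 8 && !(a & 7))
               swCopy8ByteWideRectangle_DoubleWordAligned(dst, src, h, dpitch, spitch);
           else
               return FALSE;
           return TRUE;
       }

   On FALSE the caller falls back to a generic blit; folding the pitches
   into the alignment mask is what lets the per-row accesses stay aligned
   all the way down the rectangle.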
*/ void swCopy8ByteWideRectangle_DoubleWordAligned(uint8_t *dst, uint8_t *src, int h, int dpitch, int spitch) { const int EIGHT_ROWS = 8; while (h>=EIGHT_ROWS) { h -= EIGHT_ROWS; uint64_t src1 = *(uint64_t *)(src+0*spitch); uint64_t src2 = *(uint64_t *)(src+1*spitch); uint64_t src3 = *(uint64_t *)(src+2*spitch); uint64_t src4 = *(uint64_t *)(src+3*spitch); uint64_t src5 = *(uint64_t *)(src+4*spitch); uint64_t src6 = *(uint64_t *)(src+5*spitch); uint64_t src7 = *(uint64_t *)(src+6*spitch); uint64_t src8 = *(uint64_t *)(src+7*spitch); src += EIGHT_ROWS * spitch; *(uint64_t *)(dst+0*dpitch) = src1; *(uint64_t *)(dst+1*dpitch) = src2; *(uint64_t *)(dst+2*dpitch) = src3; *(uint64_t *)(dst+3*dpitch) = src4; *(uint64_t *)(dst+4*dpitch) = src5; *(uint64_t *)(dst+5*dpitch) = src6; *(uint64_t *)(dst+6*dpitch) = src7; *(uint64_t *)(dst+7*dpitch) = src8; dst += EIGHT_ROWS * dpitch; } const int FOUR_ROWS = 4; while (h>=FOUR_ROWS) { h -= FOUR_ROWS; uint64_t src1 = *(uint64_t *)(src+0*spitch); uint64_t src2 = *(uint64_t *)(src+1*spitch); uint64_t src3 = *(uint64_t *)(src+2*spitch); uint64_t src4 = *(uint64_t *)(src+3*spitch); src += FOUR_ROWS * spitch; *(uint64_t *)(dst+0*dpitch) = src1; *(uint64_t *)(dst+1*dpitch) = src2; *(uint64_t *)(dst+2*dpitch) = src3; *(uint64_t *)(dst+3*dpitch) = src4; dst += FOUR_ROWS * dpitch; } const int TWO_ROWS = 2; while (h>=TWO_ROWS) { h -= TWO_ROWS; uint64_t src1 = *(uint64_t *)(src+0*spitch); uint64_t src2 = *(uint64_t *)(src+1*spitch); src += TWO_ROWS * spitch; *(uint64_t *)(dst+0*dpitch) = src1; *(uint64_t *)(dst+1*dpitch) = src2; dst += TWO_ROWS * dpitch; } const int ONE_ROW = 1; while (h > 0) { h -= ONE_ROW; uint64_t src1 = *(uint64_t *)(src+0*spitch); src += spitch; *(uint64_t *)(dst+0*dpitch) = src1; dst += dpitch; } } /* Fill a rectangle that is 1 byte wide, using unrolled loops to make the function as fast as possible. */ /* This function may use unaligned pointers at no penalty. */ void swFill1ByteWideRectangle_UnAligned(uint8_t *dst, uint8_t src, int h, int dpitch) { const int EIGHT_ROWS = 8; while (h>=EIGHT_ROWS) { h -= EIGHT_ROWS; *(uint8_t *)(dst+0*dpitch) = src; *(uint8_t *)(dst+1*dpitch) = src; *(uint8_t *)(dst+2*dpitch) = src; *(uint8_t *)(dst+3*dpitch) = src; *(uint8_t *)(dst+4*dpitch) = src; *(uint8_t *)(dst+5*dpitch) = src; *(uint8_t *)(dst+6*dpitch) = src; *(uint8_t *)(dst+7*dpitch) = src; dst += EIGHT_ROWS * dpitch; } const int FOUR_ROWS = 4; while (h>=FOUR_ROWS) { h -= FOUR_ROWS; *(uint8_t *)(dst+0*dpitch) = src; *(uint8_t *)(dst+1*dpitch) = src; *(uint8_t *)(dst+2*dpitch) = src; *(uint8_t *)(dst+3*dpitch) = src; dst += FOUR_ROWS * dpitch; } const int TWO_ROWS = 2; while (h>=TWO_ROWS) { h -= TWO_ROWS; *(uint8_t *)(dst+0*dpitch) = src; *(uint8_t *)(dst+1*dpitch) = src; dst += TWO_ROWS * dpitch; } const int ONE_ROW = 1; while (h > 0) { h -= ONE_ROW; *(uint8_t *)(dst+0*dpitch) = src; dst += dpitch; } } /* Fill a rectangle that is 2 bytes wide, using unrolled loops to make the function as fast as possible. */ /* This function requires the pointers to be half-word-aligned (even addresses). 
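
   The fill variants take the expanded pixel value itself rather than a
   pointer, so the caller replicates the color to the access width first.
   For example, to fill a rectangle two 16bpp pixels wide (illustrative
   only):

       uint16_t pixel = ...;   a hypothetical 16bpp color value
       uint32_t two = pixel | ((uint32_t)pixel << 16);
       swFill4ByteWideRectangle_WordAligned(dst, two, h, dpitch);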
*/ void swFill2ByteWideRectangle_HalfWordAligned(uint8_t *dst, uint16_t src, int h, int dpitch) { const int EIGHT_ROWS = 8; while (h>=EIGHT_ROWS) { h -= EIGHT_ROWS; *(uint16_t *)(dst+0*dpitch) = src; *(uint16_t *)(dst+1*dpitch) = src; *(uint16_t *)(dst+2*dpitch) = src; *(uint16_t *)(dst+3*dpitch) = src; *(uint16_t *)(dst+4*dpitch) = src; *(uint16_t *)(dst+5*dpitch) = src; *(uint16_t *)(dst+6*dpitch) = src; *(uint16_t *)(dst+7*dpitch) = src; dst += EIGHT_ROWS * dpitch; } const int FOUR_ROWS = 4; while (h>=FOUR_ROWS) { h -= FOUR_ROWS; *(uint16_t *)(dst+0*dpitch) = src; *(uint16_t *)(dst+1*dpitch) = src; *(uint16_t *)(dst+2*dpitch) = src; *(uint16_t *)(dst+3*dpitch) = src; dst += FOUR_ROWS * dpitch; } const int TWO_ROWS = 2; while (h>=TWO_ROWS) { h -= TWO_ROWS; *(uint16_t *)(dst+0*dpitch) = src; *(uint16_t *)(dst+1*dpitch) = src; dst += TWO_ROWS * dpitch; } const int ONE_ROW = 1; while (h > 0) { h -= ONE_ROW; *(uint16_t *)(dst+0*dpitch) = src; dst += dpitch; } } /* Fill a rectangle that is 4 bytes wide, using unrolled loops to make the function as fast as possible. */ /* This function requires the pointers to be word-aligned (divisible by 4). */ void swFill4ByteWideRectangle_WordAligned(uint8_t *dst, uint32_t src, int h, int dpitch) { const int EIGHT_ROWS = 8; while (h>=EIGHT_ROWS) { h -= EIGHT_ROWS; *(uint32_t *)(dst+0*dpitch) = src; *(uint32_t *)(dst+1*dpitch) = src; *(uint32_t *)(dst+2*dpitch) = src; *(uint32_t *)(dst+3*dpitch) = src; *(uint32_t *)(dst+4*dpitch) = src; *(uint32_t *)(dst+5*dpitch) = src; *(uint32_t *)(dst+6*dpitch) = src; *(uint32_t *)(dst+7*dpitch) = src; dst += EIGHT_ROWS * dpitch; } const int FOUR_ROWS = 4; while (h>=FOUR_ROWS) { h -= FOUR_ROWS; *(uint32_t *)(dst+0*dpitch) = src; *(uint32_t *)(dst+1*dpitch) = src; *(uint32_t *)(dst+2*dpitch) = src; *(uint32_t *)(dst+3*dpitch) = src; dst += FOUR_ROWS * dpitch; } const int TWO_ROWS = 2; while (h>=TWO_ROWS) { h -= TWO_ROWS; *(uint32_t *)(dst+0*dpitch) = src; *(uint32_t *)(dst+1*dpitch) = src; dst += TWO_ROWS * dpitch; } const int ONE_ROW = 1; while (h > 0) { h -= ONE_ROW; *(uint32_t *)(dst+0*dpitch) = src; dst += dpitch; } } xf86-video-msm/src/msm-display.c0000644000175000017500000001152011615776600016661 0ustar paulliupaulliu/* msm-output.c * * Copyright (c) 2009, Code Aurora Forum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Code Aurora nor * the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NON-INFRINGEMENT ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include "xf86.h" #include "xf86Crtc.h" #include "msm.h" static void MSMCrtcGammaSet(xf86CrtcPtr crtc, CARD16 *red, CARD16 *green, CARD16 *blue, int size) { /* This is a new function that gets called by the DI code */ } static void MSMCrtcDPMS(xf86CrtcPtr crtc, int mode) { /* TODO: Implement DPMS */ } static Bool MSMCrtcLock(xf86CrtcPtr crtc) { return TRUE; } static void MSMCrtcUnlock(xf86CrtcPtr crtc) { } static void MSMCrtcPrepare(xf86CrtcPtr crtc) { /* Blank the display before we change modes? */ } static Bool MSMCrtcModeFixup(xf86CrtcPtr crtc, DisplayModePtr mode, DisplayModePtr adjmode) { ScrnInfoPtr pScrn = crtc->scrn; MSMPtr pMsm = MSMPTR(pScrn); if (mode->HDisplay > pMsm->mode_info.xres_virtual || mode->VDisplay > pMsm->mode_info.yres_virtual) return FALSE; return TRUE; } static void MSMCrtcModeSet(xf86CrtcPtr crtc, DisplayModePtr mode, DisplayModePtr adjmode, int x, int y) { int ret; ScrnInfoPtr pScrn = crtc->scrn; MSMPtr pMsm = MSMPTR(pScrn); struct fb_var_screeninfo var; memcpy(&var, &pMsm->mode_info, sizeof(var)); var.xres = adjmode->HDisplay; var.right_margin = adjmode->HSyncStart - adjmode->HDisplay; var.hsync_len = adjmode->HSyncEnd - adjmode->HSyncStart; var.left_margin = adjmode->HTotal - adjmode->HSyncEnd; var.yres = adjmode->VDisplay; var.lower_margin = adjmode->VSyncStart - adjmode->VDisplay; var.vsync_len = adjmode->VSyncEnd - adjmode->VSyncStart; var.upper_margin = adjmode->VTotal - adjmode->VSyncEnd; var.pixclock = pMsm->defaultVsync * adjmode->HTotal * adjmode->VTotal; ret = ioctl(pMsm->fd, FBIOPUT_VSCREENINFO, &var); if (ret) xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Unable to change the mode: %m"); else { /* Refresh the changed settings from the driver */ ioctl(pMsm->fd, FBIOGET_VSCREENINFO, &pMsm->mode_info); } } static void MSMCrtcCommit(xf86CrtcPtr crtc) { } static void MSMCrtcSetCursorPosition(xf86CrtcPtr crtc, int x, int y) { ScrnInfoPtr pScrn = crtc->scrn; MSMPtr pMsm = MSMPTR(pScrn); MSMSetCursorPosition(pMsm, x, y); } static void MSMCrtcShowCursor(xf86CrtcPtr crtc) { ScrnInfoPtr pScrn = crtc->scrn; MSMPtr pMsm = MSMPTR(pScrn); MSMCursorEnable(pMsm, TRUE); } static void MSMCrtcHideCursor(xf86CrtcPtr crtc) { ScrnInfoPtr pScrn = crtc->scrn; MSMPtr pMsm = MSMPTR(pScrn); MSMCursorEnable(pMsm, FALSE); } static void MSMCrtcLoadCursorARGB(xf86CrtcPtr crtc, CARD32 * image) { ScrnInfoPtr pScrn = crtc->scrn; MSMPtr pMsm = MSMPTR(pScrn); MSMCursorLoadARGB(pMsm, image); } static const xf86CrtcFuncsRec MSMCrtcFuncs = { .dpms = MSMCrtcDPMS, .lock = MSMCrtcLock, .unlock = MSMCrtcUnlock, .mode_fixup = MSMCrtcModeFixup, .prepare = MSMCrtcPrepare, .mode_set = MSMCrtcModeSet, .commit = MSMCrtcCommit, .set_cursor_position = MSMCrtcSetCursorPosition, .show_cursor = MSMCrtcShowCursor, .hide_cursor = MSMCrtcHideCursor, .load_cursor_argb = MSMCrtcLoadCursorARGB, .gamma_set = MSMCrtcGammaSet, .destroy = NULL, /* XXX */ }; void MSMCrtcSetup(ScrnInfoPtr pScrn) { xf86CrtcPtr crtc = xf86CrtcCreate(pScrn, 
&MSMCrtcFuncs); crtc->driver_private = NULL; } xf86-video-msm/src/Makefile.am0000755000175000017500000000174711615776600016327 0ustar paulliupaulliumsm_drv_la_LIBADD = MSM_DRI_SRCS= if USEDRI MSM_DRI_SRCS += msm-dri.c msm_drv_la_LIBADD += $(DRI_LIBS) endif if USEDRI2 MSM_DRI_SRCS += msm-drm.c msm-dri2.c msm_drv_la_LIBADD += $(DRI2_LIBS) endif NEON_CFLAGS=-march=armv7-a -mfpu=neon -mfloat-abi=softfp NEON_CCASFLAGS=$(NEON_CFLAGS) -mthumb-interwork NEON_ASFLAGS=-k -mcpu=cortex-a8 $(NEON_CCASFLAGS) AM_CFLAGS = @XORG_CFLAGS@ @DRI_CFLAGS@ @DRI2_CFLAGS@ $(NEON_CFLAGS) -Wall -Werror AM_ASFLAGS = $(NEON_ASFLAGS) AM_CCASFLAGS = $(NEON_CCASFLAGS) msm_drv_la_LTLIBRARIES = msm_drv.la msm_drv_la_LDFLAGS = -module -avoid-version msm_drv_ladir = @moduledir@/drivers msm_drv_la_SOURCES = \ msm-driver.c \ msm-swrender.c \ msm-swalignedcopy.c \ msm-exa.c \ msm-video.c \ msm-output.c \ msm-display.c \ msm-cursor.c \ msm-binder.c \ msm-swfill.c \ msm-hwrender.c \ msm-pixmap.c \ neon_memsets.c \ neon_memcpy.S \ neon_memmove.S \ $(MSM_DRI_SRCS) EXTRA_DIST = \ msm.h \ msm_mdp.h \ msm-swblits.h msm-drm.h xf86-video-msm/src/neon_memcpy.S0000644000175000017500000003437711615776600016732 0ustar paulliupaulliu/*************************************************************************** Copyright (c) 2009, Code Aurora Forum. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Code Aurora nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ***************************************************************************/ /*************************************************************************** Neon memcpy: Attempts to do a memcpy with Neon registers if possible, Inputs: dest: The destination buffer src: The source buffer n: The size of the buffer to transfer Outputs: ***************************************************************************/ /* * General note: * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP * However, it looks like the 2006 CodeSourcery Assembler has issues generating * the correct object code for VPOP, resulting in horrific stack crashes. * As a result, I've temporarily move PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB, * and VPOP->VLDMIA. We can revert this back once we update our toolchain. 
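 * For reference, the substituted spellings are architecturally equivalent:
 * PUSH {regs} is STMDB sp!, {regs}, POP {regs} is LDMIA sp!, {regs}, and
 * likewise VPUSH/VPOP are VSTMDB sp!/VLDMIA sp! on the q registers, so the
 * workaround changes the mnemonics only, not the behavior.
 *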
* * Also, VSHL swaps the source register and the shift-amount register * around in 2006-q3. I've coded this incorrectly so it turns out correct * in the object code, but we'll need to undo that later... */ .code 32 .align 4 .globl neon_memcpy .func neon_memcpy: /* * First, make sure we're not copying < 4 bytes. If so, we'll * just handle it here. */ #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) stmdb sp!, {r0} #else push {r0} #endif cmp r2, #4 bgt neon_gt_4 /* Copy 0-4 bytes, if needed, and return.*/ cmp r2, #0 neon_smallcopy_loop: beq neon_smallcopy_done ldrb r12, [r1], #1 subs r2, r2, #1 strb r12, [r0], #1 b neon_smallcopy_loop neon_smallcopy_done: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) ldmia sp!, {r0} #else pop {r0} #endif bx lr /* Copy 4 or more bytes*/ neon_gt_4: /* Preload what we can...*/ pld [r0,#0] pld [r1,#0] #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) stmdb sp!, {r4-r5} #else push {r4-r5} #endif neon_check_align: /* Check normal word alignment for target. */ ands r12, r0, #0x3 beq source_alignment_check /* * Target is not aligned. Step through until we get that * word-aligned. This works better than a loop, according * to our pipeline modeler. */ cmp r12, #2 ldrb r3, [r1], #1 ldrleb r4, [r1], #1 ldrltb r5, [r1], #1 rsb r12, r12, #4 sub r2, r2, r12 strb r3, [r0], #1 strleb r4, [r0], #1 strltb r5, [r0], #1 source_alignment_check: ands r12, r1, #0x3 bne neon_memcpy_nonaligned /* Source is not word aligned.*/ neon_try_16_align: cmp r2, #64 blt neon_align_route /* This is where we try 16-byte alignment. */ ands r12, r0, #0xf beq neon_align_route rsb r12, r12, #16 neon_16_start: sub r2, r2, r12 lsrs r3, r12, #2 neon_align_16_4: ldr r4, [r1], #4 subs r3, r3, #1 str r4, [r0], #4 bne neon_align_16_4 neon_align_route: /* In this case, both source and target are word-aligned. */ cmp r2, #32768 bge neon_copy_128p_a cmp r2, #256 bge neon_copy_128_a cmp r2, #64 bge neon_copy_32_a b neon_copy_finish_a nop neon_copy_128p_a: /* We'll copy blocks 128-bytes at a time, but try to call pld to * load in the next page, if possible. */ #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vstmdb sp!, {q4-q7} #else vpush {q4-q7} #endif mov r12, r2, lsr #7 neon_copy_128p_loop_a: vld1.32 {q0, q1}, [r1]! vld1.32 {q2, q3}, [r1]! vld1.32 {q4, q5}, [r1]! vld1.32 {q6, q7}, [r1]! pld [r1, #0] pld [r1, #1024] vst1.32 {q0, q1}, [r0]! vst1.32 {q2, q3}, [r0]! vst1.32 {q4, q5}, [r0]! vst1.32 {q6, q7}, [r0]! subs r12, r12, #1 bne neon_copy_128p_loop_a #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vldmia sp!, {q4-q7} #else vpop {q4-q7} #endif ands r2, r2, #0x7f beq neon_end cmp r2, #32 blt neon_copy_finish_a b neon_copy_32_a /* Copy blocks of 128-bytes (word-aligned) at a time*/ neon_copy_128_a: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vstmdb sp!, {q4-q7} #else vpush {q4-q7} #endif /* * Move to a 1-s based countdown to determine when to loop. That * allows the subs to set the Z flag without having to explicitly * call cmp to a value. */ mov r12, r2, lsr #7 neon_copy_128_loop_a: vld1.32 {q0, q1}, [r1]! vld1.32 {q2, q3}, [r1]! vld1.32 {q4, q5}, [r1]! vld1.32 {q6, q7}, [r1]! pld [r1, #0] pld [r1, #128] vst1.32 {q0, q1}, [r0]! vst1.32 {q2, q3}, [r0]! vst1.32 {q4, q5}, [r0]! vst1.32 {q6, q7}, [r0]! 
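    /* One 128-byte block per iteration: r12 was preloaded with count/128,
       and the PLDs above (source) and below (destination) keep both
       streams a couple of cache lines ahead of the copy. */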
subs r12, r12, #1 pld [r0, #0] pld [r0, #128] bne neon_copy_128_loop_a #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vldmia sp!, {q4-q7} #else vpop {q4-q7} #endif ands r2, r2, #0x7f beq neon_end cmp r2, #32 blt neon_copy_finish_a /* Copy blocks of 32-bytes (word aligned) at a time*/ neon_copy_32_a: mov r12, r2, lsr #5 neon_copy_32_loop_a: vld1.32 {q0,q1}, [r1]! subs r12, r12, #1 pld [r1,#0] vst1.32 {q0,q1}, [r0]! bne neon_copy_32_loop_a ands r2, r2, #0x1f beq neon_end neon_copy_finish_a: neon_copy_16_a: movs r12, r2, lsr #4 beq neon_copy_8_a neon_copy_16_a_loop: vld1.32 {q0}, [r1]! subs r12, r12, #1 vst1.32 {q0}, [r0]! bne neon_copy_16_a_loop ands r2, r2, #0xf beq neon_end neon_copy_8_a: cmp r2, #8 blt neon_copy_4_a ldm r1!, {r4-r5} subs r2, r2, #8 stm r0!, {r4-r5} /* Copy 4-bytes of word-aligned data at a time*/ neon_copy_4_a: cmp r2, #4 blt neon_copy_finish ldr r4, [r1], #4 subs r2, r2, #4 str r4, [r0], #4 b neon_copy_finish /* * Handle unaligned data. The basic concept here is that we'll * try to pull out enough data from the source to get that word- * aligned, then do our writes word-aligned, storing the difference * in a register, and shifting the data as needed. */ neon_memcpy_nonaligned: /* * If this is <8 bytes, it makes more sense to just copy it * quickly instead of incurring all kinds of overhead. */ cmp r2, #8 /* Let's try this...*/ ble neon_copy_finish /* * This is where we'll pull out either 1, 2, or 3 bytes of data * from the source as needed to align it, then store off those * bytes in r4. When we read in the (now) aligned data from the * source, we'll shift the bytes and AND in the r4 data, then write * to the target aligned. * * The conditional ldr calls work slightly faster than the * previous method, confirmed by our pipeline modeler. */ #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) stmdb sp!, {r6-r9} #else push {r6-r9} #endif cmp r12, #2 ldrb r4, [r1], #1 ldrleb r5, [r1], #1 ldrltb r6, [r1], #1 rsb r8, r12, #4 sub r2, r2, r8 lsl r8, r8, #3 orrle r4, r4, r5, lsl #8 orrlt r4, r4, r6, lsl #16 rsb r9, r8, #32 cmp r2, #64 blt neon_unaligned_route ands r12, r0, #0xf beq neon_unaligned_route rsb r12, r12, #16 neon_16_start_u: sub r2, r2, r12 lsrs r6, r12, #2 neon_align_16_4_u: ldr r5, [r1], #4 subs r6, r6, #1 orr r4, r4, r5, lsl r8 str r4, [r0], #4 mov r4, r5, lsr r9 bne neon_align_16_4_u neon_unaligned_route: /* Decide which loop block to branch to.*/ cmp r2, #256 bge neon_copy_64_u cmp r2, #64 bge neon_copy_32_u b neon_copy_finish_u /* Copy data in 64-byte blocks.*/ neon_copy_64_u: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vstmdb sp!, {q4} vstmdb sp!, {q5-q8} #else vpush {q4} vpush {q5-q8} #endif /* We'll need this for the q register shift later.*/ vdup.u32 q8, r8 /* * As above, we determine how many times we can go through the * 64-byte copy loop, then countdown. */ mov r12, r2, lsr #6 and r2, r2, #0x3f neon_copy_64_u_loop: /* Load 64-bytes into q4-q7.*/ vld1.32 {q4, q5}, [r1]! vld1.32 {q6, q7}, [r1]! /* * Shift q0-q3 right so everything but the data we need due to the * alignment falls off the right-hand side. The branching * is needed, since vshr requires the shift to be an immediate * value. 
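 * The three-way branch below keys off r8, the left-shift amount in bits
 * (8, 16, or 24): LSLS r5, r8, #28 parks bit 4 of r8 in the carry flag and
 * bit 3 in the negative flag, so r8 == 8 gives C == 0 (bcc, the #56 case),
 * r8 == 16 gives C == 1 and N == 0 (bpl, the #48 case), and r8 == 24 falls
 * through to the #40 case. Each vshr amount pairs with the later vshl by
 * r8 so that the two shifts always total 64 bits.
 *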
*/ lsls r5, r8, #28 bcc neon_copy_64_u_b8 bpl neon_copy_64_u_b16 vshr.u64 q0, q4, #40 vshr.u64 q1, q5, #40 vshr.u64 q2, q6, #40 vshr.u64 q3, q7, #40 b neon_copy_64_unify neon_copy_64_u_b8: vshr.u64 q0, q4, #56 vshr.u64 q1, q5, #56 vshr.u64 q2, q6, #56 vshr.u64 q3, q7, #56 b neon_copy_64_unify neon_copy_64_u_b16: vshr.u64 q0, q4, #48 vshr.u64 q1, q5, #48 vshr.u64 q2, q6, #48 vshr.u64 q3, q7, #48 neon_copy_64_unify: /* * Shift q4-q7 left by r8 bits to take the alignment into * account. */ #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vshl.u64 q4, q8, q4 vshl.u64 q5, q8, q5 vshl.u64 q6, q8, q6 vshl.u64 q7, q8, q7 #else vshl.u64 q4, q4, q8 vshl.u64 q5, q5, q8 vshl.u64 q6, q6, q8 vshl.u64 q7, q7, q8 #endif /* * The data in s14 will be needed for the next loop iteration. Move * that to r5. */ vmov r5, s14 /* We'll vorr the shifted data with the data that needs to move back.*/ vorr d9, d9, d0 /* Copy the data from the previous loop into s14.*/ vmov s14, r4 vorr d10, d10, d1 vorr d11, d11, d2 vorr d12, d12, d3 vorr d13, d13, d4 vorr d14, d14, d5 vorr d15, d15, d6 vorr d8, d8, d7 subs r12, r12, #1 pld [r1, #0] pld [r1, #128] /* Save off the r5 data into r4 for the next iteration.*/ mov r4, r5 vst1.32 {q4, q5}, [r0]! vst1.32 {q6, q7}, [r0]! pld [r0, #0] pld [r0, #128] bne neon_copy_64_u_loop #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vldmia sp!, {q5-q8} vldmia sp!, {q4} #else vpop {q5-q8} vpop {q4} #endif cmp r2, #32 bge neon_copy_32_u b neon_copy_finish_u neon_copy_32_u: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vstmdb sp!, {q4} #else vpush {q4} #endif vdup.u32 q4, r8 mov r12, r2, lsr #5 and r2, r2, #0x1f neon_copy_32_u_loop: vld1.32 {q0, q1}, [r1]! lsls r5, r8, #28 bcc neon_copy_32_u_b8 bpl neon_copy_32_u_b16 vshr.u64 q2, q0, #40 vshr.u64 q3, q1, #40 b neon_copy_32_unify neon_copy_32_u_b8: vshr.u64 q2, q0, #56 vshr.u64 q3, q1, #56 b neon_copy_32_unify neon_copy_32_u_b16: vshr.u64 q2, q0, #48 vshr.u64 q3, q1, #48 neon_copy_32_unify: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vshl.u64 q0, q4, q0 vshl.u64 q1, q4, q1 #else vshl.u64 q0, q0, q4 vshl.u64 q1, q1, q4 #endif vmov r5, s14 vorr d1, d1, d4 vmov s14, r4 vorr d2, d2, d5 vorr d3, d3, d6 vorr d0, d0, d7 subs r12, r12, #1 pld [r1, #0] mov r4, r5 vst1.32 {q0, q1}, [r0]! bne neon_copy_32_u_loop #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vldmia sp!, {q4} #else vpop {q4} #endif neon_copy_finish_u: neon_copy_16_u: movs r12, r2, lsr #4 beq neon_copy_8_u vdup.u32 q2, r8 and r2, r2, #0xf neon_copy_16_u_loop: vld1.32 {q0}, [r1]! lsls r5, r8, #28 bcc neon_copy_16_u_b8 bpl neon_copy_16_u_b16 vshr.u64 q1, q0, #40 b neon_copy_16_unify neon_copy_16_u_b8: vshr.u64 q1, q0, #56 b neon_copy_16_unify neon_copy_16_u_b16: vshr.u64 q1, q0, #48 neon_copy_16_unify: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vshl.u64 q0, q2, q0 #else vshl.u64 q0, q0, q2 #endif vmov r5, s6 vorr d1, d1, d2 vmov s6, r4 vorr d0, d0, d3 subs r12, r12, #1 mov r4, r5 vst1.32 {q0}, [r0]! 
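    /* The partially assembled word is carried across iterations in r4,
       staged through s6/r5, so the next 16-byte block can merge in the
       bytes that spilled past this store. */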
bne neon_copy_16_u_loop neon_copy_8_u: cmp r2, #8 blt neon_copy_4_u ldm r1!, {r6-r7} subs r2, r2, #8 orr r4, r4, r6, lsl r8 mov r5, r6, lsr r9 orr r5, r5, r7, lsl r8 stm r0!, {r4-r5} mov r4, r7, lsr r9 neon_copy_4_u: cmp r2, #4 blt neon_copy_last_bits_u ldr r5, [r1], #4 subs r2, r2, #4 orr r4, r4, r5, lsl r8 str r4, [r0], #4 mov r4, r5, lsr r9 neon_copy_last_bits_u: /* * Remember, r8 contains the size of the data in r4 in bits, * so to get to bytes we'll need to shift 3 places */ lsr r8, r8, #0x3 /* Write out the bytes stored in r4.*/ neon_copy_last_bits_u_loop: strb r4, [r0], #1 subs r8, r8, #1 lsrne r4, r4, #8 bne neon_copy_last_bits_u_loop #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) ldmia sp!, {r6-r9} #else pop {r6-r9} #endif neon_copy_finish: cmp r2, #0 beq neon_end /* * This just copies the data from source to target one byte * at a time. For some small values, this makes more sense. * Note that since this code copies data a byte at a time, * both the aligned and unaligned paths can use it. */ neon_copy_finish_loop: ldrb r4, [r1], #1 subs r2, r2, #1 strb r4, [r0], #1 bne neon_copy_finish_loop neon_end: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) ldmia sp!, {r4-r5} ldmia sp!, {r0} #else pop {r4-r5} pop {r0} #endif bx lr .endfunc .end xf86-video-msm/src/msm-binder.c0000644000175000017500000001356311615776600016470 0ustar paulliupaulliu/* msm-binder.c * * Copyright (c) 2009, Code Aurora Forum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Code Aurora nor * the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

/* NOTE: the bracketed system header names were lost when this archive was
 * flattened; the list below is a reconstruction covering the socket, stat,
 * fd-passing, and pthread calls made in this file. */
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <time.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/stat.h>

#include "msm.h"

static pthread_t binderid;
static pthread_attr_t attr;

struct cmsghdr *cmptr;
static uid_t *tptr;

#define QLEN 10
#define CONTROLLEN CMSG_LEN(sizeof(int))

//#define offsetof(TYPE, MEMBER) ((int)&((TYPE *)0)->MEMBER)

#define MAXLINE 4096
#define STALE 30

static int
MSMBinderSocket(const char *name, gid_t gid)
{
    int fd = socket(AF_UNIX, SOCK_STREAM, 0);
    struct sockaddr_un un;
    int len, ret;

    if (fd < 0) {
	ErrorF("%s: ERROR socket(): %m\n", __FUNCTION__);
	return -1;
    }

    unlink(name);

    memset(&un, 0, sizeof(un));
    un.sun_family = AF_UNIX;
    strcpy(un.sun_path, name);

    len = offsetof(struct sockaddr_un, sun_path) + strlen(name);

    ret = bind(fd, (struct sockaddr *)&un, len);

    if (ret < 0) {
	ErrorF("%s: ERROR bind() %m\n", __FUNCTION__);
	close(fd);
	return -1;
    }

    /* Set the permissions on the file to the desired group -
       the effective owner remains the same */

    if (chown(name, geteuid(), gid))
	ErrorF("%s: ERROR chown(): %m\n", __FUNCTION__);

    /* Set the mode 0660 - read write by user and owner */
    if (chmod(name, 0660))
	ErrorF("%s: chmod(): %m\n", __FUNCTION__);

    ret = listen(fd, QLEN);

    if (ret < 0) {
	ErrorF("%s: ERROR listen() %m\n", __FUNCTION__);
	close(fd);
	return -1;
    }

    return fd;
}

static int
MSMBinderAccept(int fd, uid_t * uidptr)
{
    socklen_t len;
    int cfd;
    time_t staletime;
    struct sockaddr_un un;
    struct stat s;

    len = sizeof(un);
    cfd = accept(fd, (struct sockaddr *)&un, &len);

    if (cfd < 0) {
	/* Don't log a message on EINTR */
	if (errno != EINTR)
	    ErrorF("%s: ERROR accept() %m\n", __FUNCTION__);
	return cfd;
    }

    if (stat(un.sun_path, &s) < 0) {
	close(cfd);
	ErrorF("%s: ERROR stat() %m\n", __FUNCTION__);
	return -1;
    }

    if ((s.st_mode & (S_IRWXG | S_IRWXO)) || (s.st_mode & S_IRWXU) != S_IRWXU) {
	close(cfd);
	return -1;
    }

    staletime = time(NULL) - STALE;

    if (s.st_atime < staletime || s.st_ctime < staletime || s.st_mtime < staletime) {
	close(cfd);
	return -1;
    }

    if (uidptr != NULL)
	*uidptr = s.st_uid;

    unlink(un.sun_path);
    return cfd;
}

static int
MSMBinderGetFD(int fd)
{
    int newfd = -1, nr, status;
    char *ptr;
    int *iptr;
    char buf[MAXLINE];
    struct iovec iov[1];
    struct msghdr msg;

    status = -1;

    for (;;) {
	iov[0].iov_base = buf;
	iov[0].iov_len = sizeof(buf);
	msg.msg_iov = iov;
	msg.msg_iovlen = 1;
	msg.msg_name = NULL;
	msg.msg_namelen = 0;

	if (cmptr == NULL && (cmptr = malloc(CONTROLLEN)) == NULL)
	    return (-1);

	msg.msg_control = cmptr;
	msg.msg_controllen = CONTROLLEN;

	if ((nr = recvmsg(fd, &msg, 0)) < 0) {
	    ErrorF("%s: recvmsg() %m\n", __FUNCTION__);
	} else if (nr == 0) {
	    return (-1);
	}

	for (ptr = buf; ptr < &buf[nr];) {
	    if (*ptr++ == 0) {
		if (ptr != &buf[nr - 1])
		    ErrorF("%s: message format error\n", __FUNCTION__);

		status = *ptr & 0xFF;
		if (status == 0) {
		    if (msg.msg_controllen != CONTROLLEN)
			ErrorF("%s: status = 0 but no fd\n", __FUNCTION__);

		    iptr = (int *) CMSG_DATA(cmptr);
		    newfd = *iptr;
		} else {
		    newfd = -status;
		}

		nr -= 2;
	    }
	}

	if (status >= 0) {
	    return (newfd);
	}
    }
}
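/* The loop above is the receiving half of a classic SCM_RIGHTS fd-passing
 * protocol: the peer sends a NUL byte followed by a status byte, and attaches
 * the file descriptor as ancillary data when status is 0.  The sketch below
 * shows a minimal sender matching that wire format.  It is NOT part of this
 * driver -- the client process that connects to /var/tmp/hwsocket is assumed
 * -- but it documents what MSMBinderGetFD() expects to parse. */
#if 0
static int
MSMBinderExampleSendFD(int sock, int fd_to_pass)
{
    char buf[2] = { 0, 0 };	/* NUL marker, then status == 0 (success) */
    struct iovec iov = { buf, sizeof(buf) };
    union {
	struct cmsghdr hdr;
	char control[CMSG_SPACE(sizeof(int))];
    } u;
    struct msghdr msg;
    struct cmsghdr *cm;

    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    msg.msg_control = u.control;
    msg.msg_controllen = CMSG_LEN(sizeof(int));

    cm = CMSG_FIRSTHDR(&msg);
    cm->cmsg_len = CMSG_LEN(sizeof(int));
    cm->cmsg_level = SOL_SOCKET;
    cm->cmsg_type = SCM_RIGHTS;
    memcpy(CMSG_DATA(cm), &fd_to_pass, sizeof(int));

    return sendmsg(sock, &msg, 0) == sizeof(buf) ? 0 : -1;
}
#endif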
void *
MSMBinderProc(void *arg)
{
    MSMPtr pMsm = (MSMPtr) arg;
    int fd = MSMBinderSocket("/var/tmp/hwsocket", pMsm->socketGID);

    if (fd == -1) {
	ErrorF("%s: Unable to start the binder thread.\n", __FUNCTION__);
	return NULL;
    }

    while (1) {
	int child;
	int pfd;

	child = MSMBinderAccept(fd, tptr);

	if (child == -1)
	    continue;

	pfd = MSMBinderGetFD(child);

	/* FIXME: Probably need a mutex here */
	pMsm->pfd = pfd;

	close(child);
    }
}

void
MSMBinderInit(MSMPtr pMsm)
{
    pthread_attr_init(&attr);
    pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM);

    if (pthread_create(&binderid, &attr, MSMBinderProc, pMsm))
	ErrorF("%s: Unable to create the binder thread\n", __FUNCTION__);
}
xf86-video-msm/src/msm-drm.c0000644000175000017500000001571611615776600016005 0ustar paulliupaulliu
/* msm-drm.c
 *
 * Copyright (c) 2009, Code Aurora Forum. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * Neither the name of Code Aurora nor
 * the names of its contributors may be used to endorse or promote
 * products derived from this software without specific prior written
 * permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

/* NOTE: as in msm-binder.c, the bracketed header names were lost in
 * extraction; this reconstructed set covers the ioctl(), mmap(), and string
 * calls used below. */
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

#include "msm.h"
#include "msm-drm.h"

int
msm_drm_bo_set_memtype(struct msm_drm_bo *bo, int type)
{
    static int ebionly = 0;
    int ret;
    struct drm_kgsl_gem_memtype mtype;

    if (bo == NULL || bo->handle == 0)
	return -1;

    /* Only fail the ioctl() once - the other times just quietly
     * force the mode to EBI - see below */

    if (ebionly) {
	bo->memtype = DRM_KGSL_GEM_TYPE_EBI;
	return 0;
    }

    switch(type) {
    case MSM_DRM_MEMTYPE_KMEM:
	bo->memtype = DRM_KGSL_GEM_TYPE_KMEM;
	break;
    case MSM_DRM_MEMTYPE_EBI:
	bo->memtype = DRM_KGSL_GEM_TYPE_EBI;
	break;
    case MSM_DRM_MEMTYPE_SMI:
	bo->memtype = DRM_KGSL_GEM_TYPE_SMI;
	break;
    case MSM_DRM_MEMTYPE_KMEM_NOCACHE:
	bo->memtype = DRM_KGSL_GEM_TYPE_KMEM_NOCACHE;
	break;
    default:
	return -1;
    }

    memset(&mtype, 0, sizeof(mtype));
    mtype.handle = bo->handle;
    mtype.type = bo->memtype;

    ret = ioctl(bo->fd, DRM_IOCTL_KGSL_GEM_SETMEMTYPE, &mtype);

    if (ret) {
	/* If the ioctl() isn't supported, then the legacy behavior
	 * is to put everything in EBI */

	if (errno == EINVAL) {
	    ErrorF("DRM: DRM_IOCTL_KGSL_GEM_SETMEMTYPE is not supported.\n");
	    ErrorF("     All offscreen memory will be in EBI\n");

	    bo->memtype = DRM_KGSL_GEM_TYPE_EBI;

	    /* Set a flag so we don't come in here and fail for every
	     * allocation */
	    ebionly = 1;
	    return 0;
	}
    }

    return ret;
}

int
msm_drm_bo_get_memtype(struct msm_drm_bo *bo)
{
    struct drm_kgsl_gem_memtype mtype;
    int ret;

    if (bo == NULL || bo->handle == 0)
	return -1;

    if (bo->memtype < 0) {
	memset(&mtype, 0, sizeof(mtype));
	mtype.handle = bo->handle;

	/* The archived source called the SETMEMTYPE ioctl here and never read
	 * the result back; querying with GETMEMTYPE and caching the answer is
	 * presumably what was intended. */
	ret = ioctl(bo->fd, DRM_IOCTL_KGSL_GEM_GETMEMTYPE, &mtype);

	if (ret)
	    return ret;

	bo->memtype = mtype.type;
    }

    switch(bo->memtype) {
    case DRM_KGSL_GEM_TYPE_KMEM:
	return MSM_DRM_MEMTYPE_KMEM;
    case DRM_KGSL_GEM_TYPE_KMEM_NOCACHE:
	return MSM_DRM_MEMTYPE_KMEM_NOCACHE;
    case DRM_KGSL_GEM_TYPE_EBI:
	return MSM_DRM_MEMTYPE_EBI;
    case DRM_KGSL_GEM_TYPE_SMI:
	return MSM_DRM_MEMTYPE_SMI;
    }

    return -1;
}

struct msm_drm_bo *
msm_drm_bo_create(int fd, int size)
{
    struct drm_kgsl_gem_create create;
    struct msm_drm_bo *bo;
    int ret;

    /* Round the size up to a whole number of pages */
    size = (size + (getpagesize() - 1)) & ~(getpagesize() - 1);

    if (size == 0)
	return NULL;

    memset(&create, 0, sizeof(create));
    create.size = size;

    ret = ioctl(fd, DRM_IOCTL_KGSL_GEM_CREATE, &create);

    if (ret)
	return NULL;

    bo = xcalloc(1, sizeof(struct msm_drm_bo));

    if (bo == NULL)
	return NULL;

    bo->size = size;
    bo->handle = create.handle;
    bo->fd = fd;

    /* All memory defaults to EBI */
    bo->memtype = DRM_KGSL_GEM_TYPE_EBI;

    return bo;
}

struct msm_drm_bo *
msm_drm_bo_create_memtype(int fd, int size, int type)
{
    struct msm_drm_bo *bo = msm_drm_bo_create(fd, size);

    if (bo == NULL)
	return NULL;

    if (msm_drm_bo_set_memtype(bo, type)) {
	msm_drm_bo_free(bo);
	return NULL;
    }

    return bo;
}

int
msm_drm_bo_flink(struct msm_drm_bo *bo, unsigned int *name)
{
    struct drm_gem_flink flink;
    int ret;

    memset(&flink, 0, sizeof(flink));

    if (bo == NULL)
	return -1;

    flink.handle = bo->handle;

    ret = ioctl(bo->fd, DRM_IOCTL_GEM_FLINK, &flink);
    if (ret)
	return -1;

    bo->name = flink.name;

    if (name)
	*name = flink.name;

    return 0;
}

int
msm_drm_bo_alloc(struct msm_drm_bo *bo)
{
    struct drm_kgsl_gem_alloc alloc;
    int ret;

    if (bo == NULL)
	return -1;

    /* If the offset is set, then assume it has been allocated */
    if (bo->offset != 0)
	return 0;

    memset(&alloc, 0, sizeof(alloc));
    alloc.handle = bo->handle;

    ret = ioctl(bo->fd, DRM_IOCTL_KGSL_GEM_ALLOC, &alloc);

    if (ret) {
	/* if the ioctl isn't supported, then use the legacy PREP ioctl */
	if (errno == EINVAL) {
	    struct drm_kgsl_gem_prep prep;

	    ErrorF("DRM: DRM_IOCTL_KGSL_GEM_ALLOC is not supported.\n");

	    memset(&prep, 0, sizeof(prep));
	    prep.handle = bo->handle;

	    ret = ioctl(bo->fd, DRM_IOCTL_KGSL_GEM_PREP, &prep);

	    if (ret)
		return -1;

	    bo->offset = prep.offset;
	    return 0;
	}

	return ret;
    }

    bo->offset = alloc.offset;
    return 0;
}

int
msm_drm_bo_map(struct msm_drm_bo *bo)
{
    int ret;

    if (bo == NULL)
	return -1;

    /* Already mapped */
    if (bo->virt)
	return 0;

    if (!bo->offset) {
	ret = msm_drm_bo_alloc(bo);

	if (ret) {
	    ErrorF("DRM: Unable to allocate: %m\n");
	    return ret;
	}
    }

    bo->virt = mmap(0, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED,
		    bo->fd, bo->offset);

    if (bo->virt == MAP_FAILED) {
	bo->virt = 0;
	ErrorF("DRM: Unable to map: %m\n");
	return -1;
    }

    return 0;
}

void
msm_drm_bo_unmap(struct msm_drm_bo *bo)
{
    if (bo == NULL)
	return;

    /* For the moment, always leave buffers mapped */
#if 0
    if (bo->virt)
	munmap((void *) bo->virt, bo->size);

    bo->virt = 0;
#endif
}

void
msm_drm_bo_free(struct msm_drm_bo *bo)
{
    struct drm_gem_close close;
    int ret;

    if (bo == NULL || bo->handle == 0)
	return;

    if (bo->virt)
	munmap((void *) bo->virt, bo->size);

    memset(&close, 0, sizeof(close));
    close.handle = bo->handle;

    ret = ioctl(bo->fd, DRM_IOCTL_GEM_CLOSE, &close);
    xfree(bo);
}
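/* Typical lifecycle of the buffer objects defined above: create a GEM
 * object, optionally retarget it to a specific memory pool, map it for CPU
 * access (which allocates backing store on demand), and flink it when the
 * buffer must be shared by name with another process.  A sketch, with error
 * handling abbreviated: */
#if 0
static int
msm_drm_bo_example(int drm_fd, int size)
{
    unsigned int name;
    struct msm_drm_bo *bo =
	msm_drm_bo_create_memtype(drm_fd, size, MSM_DRM_MEMTYPE_EBI);

    if (bo == NULL)
	return -1;

    if (msm_drm_bo_map(bo)) {		/* calls msm_drm_bo_alloc() as needed */
	msm_drm_bo_free(bo);
	return -1;
    }

    memset((void *) bo->virt, 0, bo->size);	/* CPU access via bo->virt */

    if (msm_drm_bo_flink(bo, &name) == 0) {
	/* 'name' may now be handed to another process (e.g. for DRI2) */
    }

    msm_drm_bo_unmap(bo);		/* currently a no-op; see above */
    msm_drm_bo_free(bo);
    return 0;
}
#endif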
xf86-video-msm/src/msm-swrender.c0000755000175000017500000044152111615776600017050 0ustar paulliupaulliu
/* msm-swrender.c
 *
 * Copyright (c) 2009, Code Aurora Forum. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * Neither the name of Code Aurora nor
 * the names of its contributors may be used to endorse or promote
 * products derived from this software without specific prior written
 * permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "msm-render.h"		// Shared software blit code.
#include "msm.h"
#include "msm-drm.h"
#include "msm-swblits.h"

// Should only be needed for working around a kernel issue that fails to save/restore Neon registers for userspace signal handlers.
#define MASK_SIGNALS (TRUE)
#if (MASK_SIGNALS)
#include "signal.h"		// Needed only for masking signals.
#endif // (MASK_SIGNALS)

/* Return TRUE if the two rectangles described in the blit request overlap. */
static inline BOOL
isOverlap(MSMBlitRec *blit, int bpp)
{
    if (blit->src->priv[0] != blit->dst->priv[0])
	return FALSE;

    int src_x1 = blit->srcRect->x;
    int src_x2 = src_x1 + blit->srcRect->w;
    int src_y1 = blit->srcRect->y;
    int src_y2 = src_y1 + blit->srcRect->h;
    int dst_x1 = blit->dstRect->x;
    int dst_x2 = dst_x1 + blit->dstRect->w;
    int dst_y1 = blit->dstRect->y;
    int dst_y2 = dst_y1 + blit->dstRect->h;

    return (src_x2 >= dst_x1 && src_x1 < dst_x2)	// 'X' coordinates overlap
	&& (src_y2 >= dst_y1 && src_y1 < dst_y2);	// and 'Y' coordinates overlap.
}

/* A copy is compatible with the MDP if the source and destination rectangles do not overlap. */
BOOL
isCopyMDPCompatible(MSMBlitRec *blit, int bpp)
{
#if MDP_BLIT_REQ_VERSION < 2
    if (blit->src->flags == MSM_BLIT_GEM || blit->dst->flags == MSM_BLIT_GEM)
	return FALSE;
#endif

    /* Can't use the hardware if the src or the dest surface is
     * in kmem */

    if (blit->src->flags == MSM_BLIT_GEM &&
	msm_drm_bo_get_memtype((struct msm_drm_bo *) blit->src->priv[0])
	== MSM_DRM_MEMTYPE_KMEM)
	return FALSE;

    if (blit->dst->flags == MSM_BLIT_GEM &&
	msm_drm_bo_get_memtype((struct msm_drm_bo *) blit->dst->priv[0])
	== MSM_DRM_MEMTYPE_KMEM)
	return FALSE;

    return (!isOverlap(blit, bpp));
}

/* Alignment check macro functions used to determine if two pointers are aligned with a specified granularity. */
#define SW_CHECK_ALIGNMENT(ALIGNMENT_BYTE_SIZE,dst,src,REQUIRED_ALIGNMENT) \
    (((int) (dst) % (ALIGNMENT_BYTE_SIZE)) == (REQUIRED_ALIGNMENT) \
     && ((int) (src) % (ALIGNMENT_BYTE_SIZE)) == (REQUIRED_ALIGNMENT))
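/* Illustration of how the copy routines below use these checks: pick the
 * widest access for which both pointers (and, for the multi-row copies, both
 * pitches) are mutually aligned, and fall back to narrower moves otherwise.
 * (Sketch only -- the real dispatch is the switch statements that follow.) */
#if 0
static inline void
swExampleCopy4Pixels16Bpp(unsigned char *dst, unsigned char *src)
{
    if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE, dst, src, 0)) {
	*(uint64_t *) dst = *(uint64_t *) src;			/* one 8-byte move */
    } else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE, dst, src, 0)) {
	((uint32_t *) dst)[0] = ((uint32_t *) src)[0];		/* two 4-byte moves */
	((uint32_t *) dst)[1] = ((uint32_t *) src)[1];
    } else {
	((uint16_t *) dst)[0] = ((uint16_t *) src)[0];		/* four 2-byte moves */
	((uint16_t *) dst)[1] = ((uint16_t *) src)[1];
	((uint16_t *) dst)[2] = ((uint16_t *) src)[2];
	((uint16_t *) dst)[3] = ((uint16_t *) src)[3];
    }
}
#endif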
/* Alignment check macro functions used to determine if two pointers (along with pitches) are aligned with a specified granularity. */
/* (Having the pitches aligned, as well as the pointers, ensures that all pointers when incremented by the pitches will still be aligned.) */
#define SW_CHECK_PITCHED_ALIGNMENT(ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,REQUIRED_ALIGNMENT) \
    (((int) (dst) % (ALIGNMENT_BYTE_SIZE)) == (REQUIRED_ALIGNMENT) \
     && ((int) (src) % (ALIGNMENT_BYTE_SIZE)) == (REQUIRED_ALIGNMENT) \
     && (abs(dpitch) % (ALIGNMENT_BYTE_SIZE)) == 0 \
     && (abs(spitch) % (ALIGNMENT_BYTE_SIZE)) == 0)

/* Copy a row of 16bpp pixels, for fixed-size widths. */
/* (Pointers are assumed to be half-word-aligned, which should be guaranteed for 16bpp.) */
static inline BOOL swCopy16BppSmallFixedWidths1Row_Unaligned(unsigned char *dst, unsigned char *src, int w, int xdir)
{
// Try to copy the following pixels using 16-bit alignment, or higher alignments if available.
// Also, unroll loops as much as possible to prevent stores from interfering with subsequent loads.
switch(w) { // NOTE: Several callers of this code assume that all calls with w<=8 will succeed and return TRUE.
case 0: return TRUE; break;
case 1: { uint16_t src1 = *(uint16_t *) (src+0*BYTES_PER_UINT16_T); *(uint16_t *) (dst+0*BYTES_PER_UINT16_T) = src1; return TRUE; } break;
case 2: if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T); *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1; return TRUE; } else { uint16_t src1 = *(uint16_t *) (src+0*BYTES_PER_UINT16_T); uint16_t src2 = *(uint16_t *) (src+1*BYTES_PER_UINT16_T); *(uint16_t *) (dst+0*BYTES_PER_UINT16_T) = src1; *(uint16_t *) (dst+1*BYTES_PER_UINT16_T) = src2; return TRUE; } break;
case 3: if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { uint32_t src1 = *(uint32_t *) (src+0); uint16_t src2 = *(uint16_t *) (src+1*BYTES_PER_UINT32_T); *(uint32_t *) (dst+0) = src1; *(uint16_t *) (dst+1*BYTES_PER_UINT32_T) = src2; return TRUE; } else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,2)) { uint16_t src1 = *(uint16_t *) (src+0); uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT16_T); *(uint16_t *) (dst+0) = src1; *(uint32_t *) (dst+1*BYTES_PER_UINT16_T) = src2; return TRUE; } else { uint16_t src1 = *(uint16_t *) (src+0*BYTES_PER_UINT16_T); uint16_t src2 = *(uint16_t *) (src+1*BYTES_PER_UINT16_T); uint16_t src3 = *(uint16_t *) (src+2*BYTES_PER_UINT16_T); *(uint16_t *) (dst+0*BYTES_PER_UINT16_T) = src1; *(uint16_t *) (dst+1*BYTES_PER_UINT16_T) = src2; *(uint16_t *) (dst+2*BYTES_PER_UINT16_T) = src3; return TRUE; } break;
case 4: if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T); *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1; return TRUE; } else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T); uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T); *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1; *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2; return TRUE; } else { uint16_t src1 = *(uint16_t *) (src+0*BYTES_PER_UINT16_T); uint16_t src2 = *(uint16_t *) (src+1*BYTES_PER_UINT16_T); uint16_t src3 = *(uint16_t *) (src+2*BYTES_PER_UINT16_T); uint16_t src4 = *(uint16_t *) (src+3*BYTES_PER_UINT16_T); *(uint16_t *) (dst+0*BYTES_PER_UINT16_T) = src1; *(uint16_t *) (dst+1*BYTES_PER_UINT16_T) = src2; *(uint16_t *) (dst+2*BYTES_PER_UINT16_T) = src3; *(uint16_t *) (dst+3*BYTES_PER_UINT16_T) = src4; return TRUE; } break;
case 5: if (xdir >= 0) { swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir); swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 1, xdir); return TRUE; } else {
swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 1, xdir); swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir); return TRUE; } break; case 6: if (xdir >= 0) { if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,2 * BYTES_PER_UINT16_T)) { swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 2, xdir); swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 2 * BYTES_PER_UINT16_T, src + 2 * BYTES_PER_UINT16_T, 4, xdir); return TRUE; } else { swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir); swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 2, xdir); return TRUE; } } else { if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,2 * BYTES_PER_UINT16_T)) { swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 2 * BYTES_PER_UINT16_T, src + 2 * BYTES_PER_UINT16_T, 4, xdir); swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 2, xdir); return TRUE; } else { swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 2, xdir); swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir); return TRUE; } } break; case 7: if (xdir >= 0) { swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir); swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir); return TRUE; } else { swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir); swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir); return TRUE; } break; case 8: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); return TRUE; } else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T); uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T); *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1; *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2; return TRUE; } else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T); uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T); uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T); uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T); *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1; *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2; *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3; *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4; return TRUE; } else { uint16_t src1 = *(uint16_t *) (src+0*BYTES_PER_UINT16_T); uint16_t src2 = *(uint16_t *) (src+1*BYTES_PER_UINT16_T); uint16_t src3 = *(uint16_t *) (src+2*BYTES_PER_UINT16_T); uint16_t src4 = *(uint16_t *) (src+3*BYTES_PER_UINT16_T); uint16_t src5 = *(uint16_t *) (src+4*BYTES_PER_UINT16_T); uint16_t src6 = *(uint16_t *) (src+5*BYTES_PER_UINT16_T); uint16_t src7 = *(uint16_t *) (src+6*BYTES_PER_UINT16_T); uint16_t src8 = *(uint16_t *) (src+7*BYTES_PER_UINT16_T); *(uint16_t *) (dst+0*BYTES_PER_UINT16_T) = src1; *(uint16_t *) (dst+1*BYTES_PER_UINT16_T) = src2; *(uint16_t *) (dst+2*BYTES_PER_UINT16_T) = src3; *(uint16_t *) (dst+3*BYTES_PER_UINT16_T) = src4; *(uint16_t *) (dst+4*BYTES_PER_UINT16_T) = src5; *(uint16_t *) (dst+5*BYTES_PER_UINT16_T) = src6; *(uint16_t *) (dst+6*BYTES_PER_UINT16_T) = src7; *(uint16_t *) (dst+7*BYTES_PER_UINT16_T) = src8; return TRUE; } break; case 16: if 
(SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T)); vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2); return TRUE; } else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T); uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T); uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T); uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T); *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1; *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2; *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3; *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4; return TRUE; } else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T); uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T); uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T); uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T); uint32_t src5 = *(uint32_t *) (src+4*BYTES_PER_UINT32_T); uint32_t src6 = *(uint32_t *) (src+5*BYTES_PER_UINT32_T); uint32_t src7 = *(uint32_t *) (src+6*BYTES_PER_UINT32_T); uint32_t src8 = *(uint32_t *) (src+7*BYTES_PER_UINT32_T); *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1; *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2; *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3; *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4; *(uint32_t *) (dst+4*BYTES_PER_UINT32_T) = src5; *(uint32_t *) (dst+5*BYTES_PER_UINT32_T) = src6; *(uint32_t *) (dst+6*BYTES_PER_UINT32_T) = src7; *(uint32_t *) (dst+7*BYTES_PER_UINT32_T) = src8; return TRUE; } else { // Don't bother unrolling loops here, since that won't help for more than around 8 operations. // Instead, just call multiple fixed functions. 
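// (xdir < 0 means the rectangles overlap and the blit runs right-to-left, so
// the higher-addressed half is copied before the lower-addressed half to
// avoid clobbering source pixels that have not been read yet.)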
if (xdir >= 0) { swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir); swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir); } else { swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir); swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir); } return TRUE; } break; case 32: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T)); uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T)); uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T)); vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2); vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3); vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4); return TRUE; } else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T); uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T); uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T); uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T); uint64_t src5 = *(uint64_t *) (src+4*BYTES_PER_UINT64_T); uint64_t src6 = *(uint64_t *) (src+5*BYTES_PER_UINT64_T); uint64_t src7 = *(uint64_t *) (src+6*BYTES_PER_UINT64_T); uint64_t src8 = *(uint64_t *) (src+7*BYTES_PER_UINT64_T); *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1; *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2; *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3; *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4; *(uint64_t *) (dst+4*BYTES_PER_UINT64_T) = src5; *(uint64_t *) (dst+5*BYTES_PER_UINT64_T) = src6; *(uint64_t *) (dst+6*BYTES_PER_UINT64_T) = src7; *(uint64_t *) (dst+7*BYTES_PER_UINT64_T) = src8; return TRUE; } break; case 64: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T)); uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T)); uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T)); uint32x4_t src5 = vld1q_u32((uint32_t *)(src+4*BYTES_PER_UINT32X4_T)); uint32x4_t src6 = vld1q_u32((uint32_t *)(src+5*BYTES_PER_UINT32X4_T)); uint32x4_t src7 = vld1q_u32((uint32_t *)(src+6*BYTES_PER_UINT32X4_T)); uint32x4_t src8 = vld1q_u32((uint32_t *)(src+7*BYTES_PER_UINT32X4_T)); vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2); vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3); vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4); vst1q_u32((uint32_t *)(dst+4*BYTES_PER_UINT32X4_T),src5); vst1q_u32((uint32_t *)(dst+5*BYTES_PER_UINT32X4_T),src6); vst1q_u32((uint32_t *)(dst+6*BYTES_PER_UINT32X4_T),src7); vst1q_u32((uint32_t *)(dst+7*BYTES_PER_UINT32X4_T),src8); return TRUE; } break; } return FALSE; } /* Copy two rows of 16bpp pixels, for fixed-size widths. */ /* (Pointers are assumed to be half-word-aligned, which should be guaranteed for 16bpp.) */ static inline BOOL swCopy16BppSmallFixedWidths2Rows_Unaligned(unsigned char *dst, unsigned char *src, int w, int xdir, int dpitch, int spitch) { // Try to copy the following pixels using 16-bit alignment, or higher alignments if available. 
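// (This two-row variant interleaves the loads and stores for a pair of
// scanlines, hiding more load-use latency than calling the one-row copy
// twice; the pitch alignment checks keep the chosen access width legal on
// both rows.)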
// Also, unroll loops as much as possible to prevent stores interfering with subsequent loads.
switch(w) { // NOTE: Several callers of this code assume that all calls with w<=8 will succeed and return TRUE.
case 0: return TRUE; break;
case 1: { uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T); uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T); *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a; *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b; return TRUE; } break;
case 2: if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; return TRUE; } else { uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T); uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T); uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T); uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T); *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a; *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a; *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b; *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b; return TRUE; } break;
case 3: if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint32_t src1a = *(uint32_t *) (src+0*spitch+0); uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); uint32_t src1b = *(uint32_t *) (src+1*spitch+0); uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); *(uint32_t *) (dst+0*dpitch+0) = src1a; *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; *(uint32_t *) (dst+1*dpitch+0) = src1b; *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { uint16_t src1a = *(uint16_t *) (src+0*spitch); uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT16_T); uint16_t src1b = *(uint16_t *) (src+1*spitch); uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT16_T); *(uint16_t *) (dst+0*dpitch+0) = src1a; *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a; *(uint16_t *) (dst+1*dpitch+0) = src1b; *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b; return TRUE; } else { uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T); uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T); uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T); uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T); uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T); uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T); *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a; *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a; *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a; *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b; *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b; *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b; return TRUE; } break;
case 4: if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); *(uint64_t *)
(dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; return TRUE; } else { uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T); uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T); uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T); uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T); uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T); uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T); uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T); uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T); *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a; *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a; *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a; *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a; *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b; *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b; *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b; *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b; return TRUE; } break; case 5: if (xdir >= 0) { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 1, xdir, dpitch, spitch); return TRUE; } else { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 1, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); return TRUE; } break; case 6: if (xdir >= 0) { if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2 * BYTES_PER_UINT16_T)) { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 2, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2 * BYTES_PER_UINT16_T, src + 2 * BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); } else { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 2, xdir, dpitch, spitch); } } else { if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2 * BYTES_PER_UINT16_T)) { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2 * BYTES_PER_UINT16_T, src + 2 * BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 2, xdir, dpitch, spitch); } else { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 2, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); } } return TRUE; break; case 7: if (xdir >= 0) { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * 
BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); } else { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); } return TRUE; break; case 8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { uint16_t src1a = *(uint16_t *) (src+0*spitch+0); uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); uint16_t src1b = *(uint16_t *) (src+1*spitch+0); uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); *(uint16_t *) (dst+0*dpitch+0) = src1a; *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a; *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = 
src3a; *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a; *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a; *(uint16_t *) (dst+1*dpitch+0) = src1b; *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b; *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b; *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b; *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b; return TRUE; } else { uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T); uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T); uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T); uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T); uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T); uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T); uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T); uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T); uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T); uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T); uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T); uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T); uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T); uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T); uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T); uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T); *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a; *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a; *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a; *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a; *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T) = src5a; *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T) = src6a; *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T) = src7a; *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T) = src8a; *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b; *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b; *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b; *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b; *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T) = src5b; *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T) = src6b; *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T) = src7b; *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T) = src8b; return TRUE; } break; case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); uint64_t src3a = 
*(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T); uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T); uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T); uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T); *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a; *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a; *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b; *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b; return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT32_T)) { uint32_t src1a = *(uint32_t *) (src+0*spitch+0); uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); uint32_t src1b = *(uint32_t *) (src+1*spitch+0); uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); *(uint32_t *) (dst+0*dpitch+0) = src1a; *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2a; *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3a; *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4a; *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5a; *(uint32_t *) (dst+1*dpitch+0) = src1b; *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2b; *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3b; *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4b; *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5b; return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T); uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T); uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T); uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T); uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T); uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T); uint32_t 
src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T); uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T); *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T) = src5a; *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T) = src6a; *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T) = src7a; *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T) = src8a; *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T) = src5b; *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T) = src6b; *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T) = src7b; *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T) = src8b; return TRUE; } else { // Don't bother unrolling loops, since that won't help for more than around 8 operations. // Instead, just call multiple fixed functions. if (xdir >= 0) { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); } else { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); } return TRUE; } break; case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T)); uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T)); uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T)); uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T)); vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a); vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a); vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b); vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b); return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) { if (xdir >= 0) { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 
(8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); } else { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); } return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T); uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T); uint64_t src5a = *(uint64_t *) (src+0*spitch+4*BYTES_PER_UINT64_T); uint64_t src6a = *(uint64_t *) (src+0*spitch+5*BYTES_PER_UINT64_T); uint64_t src7a = *(uint64_t *) (src+0*spitch+6*BYTES_PER_UINT64_T); uint64_t src8a = *(uint64_t *) (src+0*spitch+7*BYTES_PER_UINT64_T); uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T); uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T); uint64_t src5b = *(uint64_t *) (src+1*spitch+4*BYTES_PER_UINT64_T); uint64_t src6b = *(uint64_t *) (src+1*spitch+5*BYTES_PER_UINT64_T); uint64_t src7b = *(uint64_t *) (src+1*spitch+6*BYTES_PER_UINT64_T); uint64_t src8b = *(uint64_t *) (src+1*spitch+7*BYTES_PER_UINT64_T); *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a; *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a; *(uint64_t *) (dst+0*dpitch+4*BYTES_PER_UINT64_T) = src5a; *(uint64_t *) (dst+0*dpitch+5*BYTES_PER_UINT64_T) = src6a; *(uint64_t *) (dst+0*dpitch+6*BYTES_PER_UINT64_T) = src7a; *(uint64_t *) (dst+0*dpitch+7*BYTES_PER_UINT64_T) = src8a; *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b; *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b; *(uint64_t *) (dst+1*dpitch+4*BYTES_PER_UINT64_T) = src5b; *(uint64_t *) (dst+1*dpitch+5*BYTES_PER_UINT64_T) = src6b; *(uint64_t *) (dst+1*dpitch+6*BYTES_PER_UINT64_T) = src7b; *(uint64_t *) (dst+1*dpitch+7*BYTES_PER_UINT64_T) = src8b; return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) { if (xdir >= 0) { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); } else { 
swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); } return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { if (xdir >= 0) { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); } else { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); } return TRUE; } else { if (xdir >= 0) { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); } else { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); } return TRUE; } break; case 64: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T)); uint32x4_t src4a = vld1q_u32((uint32_t 
*)(src+0*spitch+3*BYTES_PER_UINT32X4_T)); uint32x4_t src5a = vld1q_u32((uint32_t *)(src+0*spitch+4*BYTES_PER_UINT32X4_T)); uint32x4_t src6a = vld1q_u32((uint32_t *)(src+0*spitch+5*BYTES_PER_UINT32X4_T)); uint32x4_t src7a = vld1q_u32((uint32_t *)(src+0*spitch+6*BYTES_PER_UINT32X4_T)); uint32x4_t src8a = vld1q_u32((uint32_t *)(src+0*spitch+7*BYTES_PER_UINT32X4_T)); uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T)); uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T)); uint32x4_t src5b = vld1q_u32((uint32_t *)(src+1*spitch+4*BYTES_PER_UINT32X4_T)); uint32x4_t src6b = vld1q_u32((uint32_t *)(src+1*spitch+5*BYTES_PER_UINT32X4_T)); uint32x4_t src7b = vld1q_u32((uint32_t *)(src+1*spitch+6*BYTES_PER_UINT32X4_T)); uint32x4_t src8b = vld1q_u32((uint32_t *)(src+1*spitch+7*BYTES_PER_UINT32X4_T)); vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a); vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a); vst1q_u32((uint32_t *)(dst+0*dpitch+4*BYTES_PER_UINT32X4_T),src5a); vst1q_u32((uint32_t *)(dst+0*dpitch+5*BYTES_PER_UINT32X4_T),src6a); vst1q_u32((uint32_t *)(dst+0*dpitch+6*BYTES_PER_UINT32X4_T),src7a); vst1q_u32((uint32_t *)(dst+0*dpitch+7*BYTES_PER_UINT32X4_T),src8a); vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b); vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b); vst1q_u32((uint32_t *)(dst+1*dpitch+4*BYTES_PER_UINT32X4_T),src5b); vst1q_u32((uint32_t *)(dst+1*dpitch+5*BYTES_PER_UINT32X4_T),src6b); vst1q_u32((uint32_t *)(dst+1*dpitch+6*BYTES_PER_UINT32X4_T),src7b); vst1q_u32((uint32_t *)(dst+1*dpitch+7*BYTES_PER_UINT32X4_T),src8b); return TRUE; }//HERE else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) { if (xdir >= 0) { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T, src + (0*16+4)*BYTES_PER_UINT16_T, 2*16, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T, src + (2*16+4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T, src + (3*16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); } else { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T, src + (3*16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T, src + (2*16+4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T, src + (0*16+4)*BYTES_PER_UINT16_T, 2*16, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, 
spitch); } return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) { if (xdir >= 0) { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); } else { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); } return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { if (xdir >= 0) { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + 
(5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); } else { swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 1, xdir, dpitch, spitch); } return TRUE; } break; } return FALSE; } /* Copy four rows of 16bpp pixels, for fixed-size widths. */ /* (Pointers are assumed to be half-word-aligned, which should be guaranteed for 16bpp.) */ static inline BOOL swCopy16BppSmallFixedWidths4Rows_Unaligned(unsigned char *dst, unsigned char *src, int w, int xdir, int dpitch, int spitch) { // Try to copy the following pixels using 16-bit alignment, or higher alignments if available. // Also, unroll loops as much as possible to prevent stores interfering with subsequent loads. switch(w) { // NOTE: Several callers of this code assume that all calls with w<=8 will succeed and return TRUE. 
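// --------------------------------------------------------------------------
// Added note: SW_CHECK_PITCHED_ALIGNMENT is defined elsewhere in this driver,
// so its exact definition is an assumption here. From the way the width cases
// use it, it presumably succeeds only when the destination pointer, the source
// pointer, and both pitches are all multiples of the requested (power-of-two)
// alignment once the stated byte offset is added -- that is, every row of the
// copy keeps the same alignment, not just the first one. A hypothetical sketch
// of that contract, for illustration only:
//
//   #define SW_CHECK_PITCHED_ALIGNMENT(align, dst, src, dpitch, spitch, off) \
//       (((((uintptr_t)(dst) + (off)) | ((uintptr_t)(src) + (off)) |         \
//          (uintptr_t)(dpitch) | (uintptr_t)(spitch)) & ((align) - 1)) == 0)
//
// For example, the 64-pixel branch above that passes 4*BYTES_PER_UINT16_T
// copies the first 4 pixels with narrower operations, after which the
// remaining pixels sit on a 16-byte (quad-word) boundary in both buffers on
// every row and can use the NEON vld1q_u32()/vst1q_u32() path.
// --------------------------------------------------------------------------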
case 0: return TRUE; break; case 1: { uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T); uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T); uint16_t src1c = *(uint16_t *) (src+2*spitch+0*BYTES_PER_UINT16_T); uint16_t src1d = *(uint16_t *) (src+3*spitch+0*BYTES_PER_UINT16_T); *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a; *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b; *(uint16_t *) (dst+2*dpitch+0*BYTES_PER_UINT16_T) = src1c; *(uint16_t *) (dst+3*dpitch+0*BYTES_PER_UINT16_T) = src1d; return TRUE; } break; case 2: if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T); uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T); *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T) = src1c; *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T) = src1d; return TRUE; } else { uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T); uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T); uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T); uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T); uint16_t src1c = *(uint16_t *) (src+2*spitch+0*BYTES_PER_UINT16_T); uint16_t src2c = *(uint16_t *) (src+2*spitch+1*BYTES_PER_UINT16_T); uint16_t src1d = *(uint16_t *) (src+3*spitch+0*BYTES_PER_UINT16_T); uint16_t src2d = *(uint16_t *) (src+3*spitch+1*BYTES_PER_UINT16_T); *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a; *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a; *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b; *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b; *(uint16_t *) (dst+2*dpitch+0*BYTES_PER_UINT16_T) = src1c; *(uint16_t *) (dst+2*dpitch+1*BYTES_PER_UINT16_T) = src2c; *(uint16_t *) (dst+3*dpitch+0*BYTES_PER_UINT16_T) = src1d; *(uint16_t *) (dst+3*dpitch+1*BYTES_PER_UINT16_T) = src2d; return TRUE; } break; case 3: if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint32_t src1a = *(uint32_t *) (src+0*spitch+0); uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); uint32_t src1b = *(uint32_t *) (src+1*spitch+0); uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); uint32_t src1c = *(uint32_t *) (src+2*spitch+0); uint16_t src2c = *(uint16_t *) (src+2*spitch+1*BYTES_PER_UINT32_T); uint32_t src1d = *(uint32_t *) (src+3*spitch+0); uint16_t src2d = *(uint16_t *) (src+3*spitch+1*BYTES_PER_UINT32_T); *(uint32_t *) (dst+0*dpitch+0) = src1a; *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; *(uint32_t *) (dst+1*dpitch+0) = src1b; *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; *(uint32_t *) (dst+2*dpitch+0) = src1c; *(uint16_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T) = src2c; *(uint32_t *) (dst+3*dpitch+0) = src1d; *(uint16_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T) = src2d; return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { uint16_t src1a = *(uint16_t *) (src+0*spitch); uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT16_T); uint16_t src1b = *(uint16_t *) (src+1*spitch); uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT16_T); uint16_t src1c = 
*(uint16_t *) (src+2*spitch); uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT16_T); uint16_t src1d = *(uint16_t *) (src+3*spitch); uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT16_T); *(uint16_t *) (dst+0*dpitch+0) = src1a; *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a; *(uint16_t *) (dst+1*dpitch+0) = src1b; *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b; *(uint16_t *) (dst+2*dpitch+0) = src1c; *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT16_T) = src2c; *(uint16_t *) (dst+3*dpitch+0) = src1d; *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT16_T) = src2d; return TRUE; } else { uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T); uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T); uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T); uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T); uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T); uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T); uint16_t src1c = *(uint16_t *) (src+2*spitch+0*BYTES_PER_UINT16_T); uint16_t src2c = *(uint16_t *) (src+2*spitch+1*BYTES_PER_UINT16_T); uint16_t src3c = *(uint16_t *) (src+2*spitch+2*BYTES_PER_UINT16_T); uint16_t src1d = *(uint16_t *) (src+3*spitch+0*BYTES_PER_UINT16_T); uint16_t src2d = *(uint16_t *) (src+3*spitch+1*BYTES_PER_UINT16_T); uint16_t src3d = *(uint16_t *) (src+3*spitch+2*BYTES_PER_UINT16_T); *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a; *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a; *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a; *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b; *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b; *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b; *(uint16_t *) (dst+2*dpitch+0*BYTES_PER_UINT16_T) = src1c; *(uint16_t *) (dst+2*dpitch+1*BYTES_PER_UINT16_T) = src2c; *(uint16_t *) (dst+2*dpitch+2*BYTES_PER_UINT16_T) = src3c; *(uint16_t *) (dst+3*dpitch+0*BYTES_PER_UINT16_T) = src1d; *(uint16_t *) (dst+3*dpitch+1*BYTES_PER_UINT16_T) = src2d; *(uint16_t *) (dst+3*dpitch+2*BYTES_PER_UINT16_T) = src3d; return TRUE; } break; case 4: if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T); uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T); *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T) = src1c; *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T) = src1d; return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T); uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T); uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T); uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T); *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; 
*(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T) = src1c; *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T) = src2c; *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T) = src1d; *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T) = src2d; return TRUE; } else { uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T); uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T); uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T); uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T); uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T); uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T); uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T); uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T); uint16_t src1c = *(uint16_t *) (src+2*spitch+0*BYTES_PER_UINT16_T); uint16_t src2c = *(uint16_t *) (src+2*spitch+1*BYTES_PER_UINT16_T); uint16_t src3c = *(uint16_t *) (src+2*spitch+2*BYTES_PER_UINT16_T); uint16_t src4c = *(uint16_t *) (src+2*spitch+3*BYTES_PER_UINT16_T); uint16_t src1d = *(uint16_t *) (src+3*spitch+0*BYTES_PER_UINT16_T); uint16_t src2d = *(uint16_t *) (src+3*spitch+1*BYTES_PER_UINT16_T); uint16_t src3d = *(uint16_t *) (src+3*spitch+2*BYTES_PER_UINT16_T); uint16_t src4d = *(uint16_t *) (src+3*spitch+3*BYTES_PER_UINT16_T); *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a; *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a; *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a; *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a; *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b; *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b; *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b; *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b; *(uint16_t *) (dst+2*dpitch+0*BYTES_PER_UINT16_T) = src1c; *(uint16_t *) (dst+2*dpitch+1*BYTES_PER_UINT16_T) = src2c; *(uint16_t *) (dst+2*dpitch+2*BYTES_PER_UINT16_T) = src3c; *(uint16_t *) (dst+2*dpitch+3*BYTES_PER_UINT16_T) = src4c; *(uint16_t *) (dst+3*dpitch+0*BYTES_PER_UINT16_T) = src1d; *(uint16_t *) (dst+3*dpitch+1*BYTES_PER_UINT16_T) = src2d; *(uint16_t *) (dst+3*dpitch+2*BYTES_PER_UINT16_T) = src3d; *(uint16_t *) (dst+3*dpitch+3*BYTES_PER_UINT16_T) = src4d; return TRUE; } break; case 5: if (xdir >= 0) { swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 1, xdir, dpitch, spitch); } else { swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 1, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); } return TRUE; break; case 6: if (xdir >= 0) { if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2 * BYTES_PER_UINT16_T)) { swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 2, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 2 * BYTES_PER_UINT16_T, src + 2 * BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); } else { swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 2, xdir, dpitch, spitch); } } else { if 
(SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2 * BYTES_PER_UINT16_T)) { swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 2 * BYTES_PER_UINT16_T, src + 2 * BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 2, xdir, dpitch, spitch); } else { swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 2, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); } } return TRUE; break; case 7: if (xdir >= 0) { swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); } else { swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); } return TRUE; break; // TODO: Add more alignment checks for 8 pixel-wide cases for performance reasons? // For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,DOUBLE_WORD_ALIGNMENT_BYTE_SIZE/2)) and related half-aligned cases... case 8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T)); uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T)); vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c); vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d); return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T); uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T); uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T); uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T); *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T) = src1c; *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T) = src2c; *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T) = src1d; *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T) = src2d; return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); 
uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T); uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T); uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T); uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T); uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T); uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T); uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T); uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T); *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T) = src1c; *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T) = src2c; *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T) = src3c; *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T) = src4c; *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T) = src1d; *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T) = src2d; *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T) = src3d; *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T) = src4d; return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { uint16_t src1a = *(uint16_t *) (src+0*spitch+0); uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); uint16_t src1b = *(uint16_t *) (src+1*spitch+0); uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); uint16_t src1c = *(uint16_t *) (src+2*spitch+0); uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); uint16_t src5c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); uint16_t src1d = *(uint16_t *) (src+3*spitch+0); uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); uint16_t src5d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); *(uint16_t *) (dst+0*dpitch+0) = src1a; *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a; *(uint32_t *) 
(dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a; *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a; *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a; *(uint16_t *) (dst+1*dpitch+0) = src1b; *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b; *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b; *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b; *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b; *(uint16_t *) (dst+2*dpitch+0) = src1c; *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2c; *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3c; *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4c; *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5c; *(uint16_t *) (dst+3*dpitch+0) = src1d; *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2d; *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3d; *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4d; *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5d; return TRUE; } else { uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T); uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T); uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T); uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T); uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T); uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T); uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T); uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T); uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T); uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T); uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T); uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T); uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T); uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T); uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T); uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T); uint16_t src1c = *(uint16_t *) (src+2*spitch+0*BYTES_PER_UINT16_T); uint16_t src2c = *(uint16_t *) (src+2*spitch+1*BYTES_PER_UINT16_T); uint16_t src3c = *(uint16_t *) (src+2*spitch+2*BYTES_PER_UINT16_T); uint16_t src4c = *(uint16_t *) (src+2*spitch+3*BYTES_PER_UINT16_T); uint16_t src5c = *(uint16_t *) (src+2*spitch+4*BYTES_PER_UINT16_T); uint16_t src6c = *(uint16_t *) (src+2*spitch+5*BYTES_PER_UINT16_T); uint16_t src7c = *(uint16_t *) (src+2*spitch+6*BYTES_PER_UINT16_T); uint16_t src8c = *(uint16_t *) (src+2*spitch+7*BYTES_PER_UINT16_T); uint16_t src1d = *(uint16_t *) (src+3*spitch+0*BYTES_PER_UINT16_T); uint16_t src2d = *(uint16_t *) (src+3*spitch+1*BYTES_PER_UINT16_T); uint16_t src3d = *(uint16_t *) (src+3*spitch+2*BYTES_PER_UINT16_T); uint16_t src4d = *(uint16_t *) (src+3*spitch+3*BYTES_PER_UINT16_T); uint16_t src5d = *(uint16_t *) (src+3*spitch+4*BYTES_PER_UINT16_T); uint16_t src6d = *(uint16_t *) (src+3*spitch+5*BYTES_PER_UINT16_T); uint16_t src7d = *(uint16_t *) (src+3*spitch+6*BYTES_PER_UINT16_T); uint16_t src8d = *(uint16_t *) (src+3*spitch+7*BYTES_PER_UINT16_T); *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a; *(uint16_t *) 
(dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a; *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a; *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a; *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T) = src5a; *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T) = src6a; *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T) = src7a; *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T) = src8a; *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b; *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b; *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b; *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b; *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T) = src5b; *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T) = src6b; *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T) = src7b; *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T) = src8b; *(uint16_t *) (dst+2*dpitch+0*BYTES_PER_UINT16_T) = src1c; *(uint16_t *) (dst+2*dpitch+1*BYTES_PER_UINT16_T) = src2c; *(uint16_t *) (dst+2*dpitch+2*BYTES_PER_UINT16_T) = src3c; *(uint16_t *) (dst+2*dpitch+3*BYTES_PER_UINT16_T) = src4c; *(uint16_t *) (dst+2*dpitch+4*BYTES_PER_UINT16_T) = src5c; *(uint16_t *) (dst+2*dpitch+5*BYTES_PER_UINT16_T) = src6c; *(uint16_t *) (dst+2*dpitch+6*BYTES_PER_UINT16_T) = src7c; *(uint16_t *) (dst+2*dpitch+7*BYTES_PER_UINT16_T) = src8c; *(uint16_t *) (dst+3*dpitch+0*BYTES_PER_UINT16_T) = src1d; *(uint16_t *) (dst+3*dpitch+1*BYTES_PER_UINT16_T) = src2d; *(uint16_t *) (dst+3*dpitch+2*BYTES_PER_UINT16_T) = src3d; *(uint16_t *) (dst+3*dpitch+3*BYTES_PER_UINT16_T) = src4d; *(uint16_t *) (dst+3*dpitch+4*BYTES_PER_UINT16_T) = src5d; *(uint16_t *) (dst+3*dpitch+5*BYTES_PER_UINT16_T) = src6d; *(uint16_t *) (dst+3*dpitch+6*BYTES_PER_UINT16_T) = src7d; *(uint16_t *) (dst+3*dpitch+7*BYTES_PER_UINT16_T) = src8d; return TRUE; } break; case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T)); uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T)); uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T)); uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T)); vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c); vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c); vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d); vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d); return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T); uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T); uint64_t src1b = *(uint64_t *) 
(src+1*spitch+0*BYTES_PER_UINT64_T); uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T); uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T); uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T); uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T); uint64_t src3c = *(uint64_t *) (src+2*spitch+2*BYTES_PER_UINT64_T); uint64_t src4c = *(uint64_t *) (src+2*spitch+3*BYTES_PER_UINT64_T); uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T); uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T); uint64_t src3d = *(uint64_t *) (src+3*spitch+2*BYTES_PER_UINT64_T); uint64_t src4d = *(uint64_t *) (src+3*spitch+3*BYTES_PER_UINT64_T); *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a; *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a; *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b; *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b; *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T) = src1c; *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T) = src2c; *(uint64_t *) (dst+2*dpitch+2*BYTES_PER_UINT64_T) = src3c; *(uint64_t *) (dst+2*dpitch+3*BYTES_PER_UINT64_T) = src4c; *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T) = src1d; *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T) = src2d; *(uint64_t *) (dst+3*dpitch+2*BYTES_PER_UINT64_T) = src3d; *(uint64_t *) (dst+3*dpitch+3*BYTES_PER_UINT64_T) = src4d; return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) { uint32_t src1a = *(uint32_t *) (src+0*spitch+0); uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); uint32_t src1b = *(uint32_t *) (src+1*spitch+0); uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); uint32_t src1c = *(uint32_t *) (src+2*spitch+0); uint64_t src2c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); uint64_t src3c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); uint64_t src4c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); uint32_t src1d = *(uint32_t *) (src+3*spitch+0); uint64_t src2d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); uint64_t src3d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); uint64_t src4d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); *(uint32_t *) (dst+0*dpitch+0) = src1a; *(uint64_t *) 
(dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2a; *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3a; *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4a; *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5a; *(uint32_t *) (dst+1*dpitch+0) = src1b; *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2b; *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3b; *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4b; *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5b; *(uint32_t *) (dst+2*dpitch+0) = src1c; *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2c; *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3c; *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4c; *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5c; *(uint32_t *) (dst+3*dpitch+0) = src1d; *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2d; *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3d; *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4d; *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5d; return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T); uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T); uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T); uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T); uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T); uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T); uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T); uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T); uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T); uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T); uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T); uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T); uint32_t src5c = *(uint32_t *) (src+2*spitch+4*BYTES_PER_UINT32_T); uint32_t src6c = *(uint32_t *) (src+2*spitch+5*BYTES_PER_UINT32_T); uint32_t src7c = *(uint32_t *) (src+2*spitch+6*BYTES_PER_UINT32_T); uint32_t src8c = *(uint32_t *) (src+2*spitch+7*BYTES_PER_UINT32_T); uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T); uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T); uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T); uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T); uint32_t src5d = *(uint32_t *) (src+3*spitch+4*BYTES_PER_UINT32_T); uint32_t src6d = *(uint32_t *) (src+3*spitch+5*BYTES_PER_UINT32_T); uint32_t src7d = *(uint32_t *) 
(src+3*spitch+6*BYTES_PER_UINT32_T); uint32_t src8d = *(uint32_t *) (src+3*spitch+7*BYTES_PER_UINT32_T); *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T) = src5a; *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T) = src6a; *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T) = src7a; *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T) = src8a; *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T) = src5b; *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T) = src6b; *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T) = src7b; *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T) = src8b; *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T) = src1c; *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T) = src2c; *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T) = src3c; *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T) = src4c; *(uint32_t *) (dst+2*dpitch+4*BYTES_PER_UINT32_T) = src5c; *(uint32_t *) (dst+2*dpitch+5*BYTES_PER_UINT32_T) = src6c; *(uint32_t *) (dst+2*dpitch+6*BYTES_PER_UINT32_T) = src7c; *(uint32_t *) (dst+2*dpitch+7*BYTES_PER_UINT32_T) = src8c; *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T) = src1d; *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T) = src2d; *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T) = src3d; *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T) = src4d; *(uint32_t *) (dst+3*dpitch+4*BYTES_PER_UINT32_T) = src5d; *(uint32_t *) (dst+3*dpitch+5*BYTES_PER_UINT32_T) = src6d; *(uint32_t *) (dst+3*dpitch+6*BYTES_PER_UINT32_T) = src7d; *(uint32_t *) (dst+3*dpitch+7*BYTES_PER_UINT32_T) = src8d; return TRUE; } else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { uint16_t src1a = *(uint16_t *) (src+0*spitch+0); uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); uint32_t src6a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T); uint32_t src7a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T); uint32_t src8a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T); uint16_t src9a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T); uint16_t src1b = *(uint16_t *) (src+1*spitch+0); uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); uint32_t src6b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T); uint32_t src7b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T); uint32_t src8b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T); uint16_t src9b = *(uint16_t *) 
(src+1*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T); uint16_t src1c = *(uint16_t *) (src+2*spitch+0); uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); uint32_t src6c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T); uint32_t src7c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T); uint32_t src8c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T); uint16_t src9c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T); uint16_t src1d = *(uint16_t *) (src+3*spitch+0); uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); uint32_t src6d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T); uint32_t src7d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T); uint32_t src8d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T); uint16_t src9d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T); *(uint16_t *) (dst+0*dpitch+0) = src1a; *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a; *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a; *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a; *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a; *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6a; *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7a; *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8a; *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9a; *(uint16_t *) (dst+1*dpitch+0) = src1b; *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b; *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b; *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b; *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b; *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6b; *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7b; *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8b; *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9b; *(uint16_t *) (dst+2*dpitch+0) = src1c; *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2c; *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3c; *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4c; *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5c; *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6c; *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7c; *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8c; *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9c; *(uint16_t *) 
(dst+3*dpitch+0) = src1d; *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2d; *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3d; *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4d; *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5d; *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6d; *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7d; *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8d; *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9d; return TRUE; } else { // Don't bother unrolling loops, since that won't help for more than around 8 operations. // Instead, just call multiple fixed functions. if (xdir >= 0) { swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); } else { swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); } return TRUE; } break; // TODO: Add more alignment checks for 32 pixel-wide cases for performance reasons? // For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,XXX)) and related cases could make a big difference here... case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T)); uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T)); uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T)); uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T)); uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T)); uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T)); uint32x4_t src3c = vld1q_u32((uint32_t *)(src+2*spitch+2*BYTES_PER_UINT32X4_T)); uint32x4_t src4c = vld1q_u32((uint32_t *)(src+2*spitch+3*BYTES_PER_UINT32X4_T)); uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T)); uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T)); uint32x4_t src3d = vld1q_u32((uint32_t *)(src+3*spitch+2*BYTES_PER_UINT32X4_T)); uint32x4_t src4d = vld1q_u32((uint32_t *)(src+3*spitch+3*BYTES_PER_UINT32X4_T)); vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a); vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a); vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b); vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b); vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c); vst1q_u32((uint32_t 
*)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c); vst1q_u32((uint32_t *)(dst+2*dpitch+2*BYTES_PER_UINT32X4_T),src3c); vst1q_u32((uint32_t *)(dst+2*dpitch+3*BYTES_PER_UINT32X4_T),src4c); vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d); vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d); vst1q_u32((uint32_t *)(dst+3*dpitch+2*BYTES_PER_UINT32X4_T),src3d); vst1q_u32((uint32_t *)(dst+3*dpitch+3*BYTES_PER_UINT32X4_T),src4d); return TRUE; } break; } return FALSE; } /* Draw multiple rows with a specific memory copy without narrow width functions. */ #define DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(MEMCPY_FUNCTION,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS) \ do { \ /* Draw four rows at a time, in the most efficient way. */ \ while (h >= FOUR_ROWS) { \ h -= FOUR_ROWS; \ \ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); \ MEMCPY_FUNCTION(dst + 0*dpitch, src + 0*spitch, w * BYTES_PER_UINT16_T); \ MEMCPY_FUNCTION(dst + 1*dpitch, src + 1*spitch, w * BYTES_PER_UINT16_T); \ MEMCPY_FUNCTION(dst + 2*dpitch, src + 2*spitch, w * BYTES_PER_UINT16_T); \ MEMCPY_FUNCTION(dst + 3*dpitch, src + 3*spitch, w * BYTES_PER_UINT16_T); \ UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); \ \ dst += FOUR_ROWS * dpitch; \ src += FOUR_ROWS * spitch; \ } \ \ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); \ \ /* Draw two rows at a time, in the most efficient way. */ \ while (h >= TWO_ROWS) { \ h -= TWO_ROWS; \ \ MEMCPY_FUNCTION(dst + 0*dpitch, src + 0*spitch, w * BYTES_PER_UINT16_T); \ MEMCPY_FUNCTION(dst + 1*dpitch, src + 1*spitch, w * BYTES_PER_UINT16_T); \ \ dst += TWO_ROWS * dpitch; \ src += TWO_ROWS * spitch; \ } \ \ /* Draw one row at a time, in the most efficient way. */ \ while (h >= ONE_ROW) { \ h -= ONE_ROW; \ \ MEMCPY_FUNCTION(dst, src, w * BYTES_PER_UINT16_T); \ \ dst += ONE_ROW * dpitch; \ src += ONE_ROW * spitch; \ } \ \ UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); \ \ } while (0) /* Draw multiple rows with a specific memory copy. */ #define DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(MEMCPY_FUNCTION,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS) \ do { \ /* Draw four rows at a time, in the most efficient way. */ \ while (h >= FOUR_ROWS) { \ h -= FOUR_ROWS; \ \ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); \ \ /* First, check if the blit can be done using unaligned fixed-size operations for four rows at a time. */ \ if (swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, w, xdir, dpitch, spitch)) \ ; \ else \ { \ MEMCPY_FUNCTION(dst + 0*dpitch, src + 0*spitch, w * BYTES_PER_UINT16_T); \ MEMCPY_FUNCTION(dst + 1*dpitch, src + 1*spitch, w * BYTES_PER_UINT16_T); \ MEMCPY_FUNCTION(dst + 2*dpitch, src + 2*spitch, w * BYTES_PER_UINT16_T); \ MEMCPY_FUNCTION(dst + 3*dpitch, src + 3*spitch, w * BYTES_PER_UINT16_T); \ } \ \ UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); \ \ dst += FOUR_ROWS * dpitch; \ src += FOUR_ROWS * spitch; \ } \ \ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); \ \ /* Draw two rows at a time, in the most efficient way. */ \ while (h >= TWO_ROWS) { \ h -= TWO_ROWS; \ \ /* First, check if the blit can be done using unaligned fixed-size operations for two rows at a time. */ \ if (swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, w, xdir, dpitch, spitch)) \ ; \ else \ { \ MEMCPY_FUNCTION(dst + 0*dpitch, src + 0*spitch, w * BYTES_PER_UINT16_T); \ MEMCPY_FUNCTION(dst + 1*dpitch, src + 1*spitch, w * BYTES_PER_UINT16_T); \ } \ \ dst += TWO_ROWS * dpitch; \ src += TWO_ROWS * spitch; \ } \ \ /* Draw one row at a time, in the most efficient way. 
*/ \ while (h >= ONE_ROW) { \ h -= ONE_ROW; \ \ /* First, check if the blit can be done using unaligned fixed-size operations for one row at a time. */ \ if (swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, w, xdir)) \ ; \ else \ { \ MEMCPY_FUNCTION(dst, src, w * BYTES_PER_UINT16_T); \ } \ \ dst += ONE_ROW * dpitch; \ src += ONE_ROW * spitch; \ } \ \ UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); \ \ } while (0) /* Draw multiple rows with small fixed width functions in a positive X direction. */ #define DRAW_MULTIPLE_ROWS_WITH_POSITIVE_XDIR(BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS) \ do { \ /* Draw four rows at a time, in the most efficient way. */ \ while (h >= FOUR_ROWS) { \ h -= FOUR_ROWS; \ \ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); \ \ /* First, check if the blit can be done using unaligned fixed-size operations for four rows at a time. */ \ if (swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, w, xdir, dpitch, spitch)) \ ; \ else \ { \ /* NOTE: Assumes that all copies of less than 8 pixels succeed, so return values are not checked. */ \ swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); \ swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, w % 8, xdir, dpitch, spitch); \ } \ \ UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); \ \ dst += FOUR_ROWS * dpitch; \ src += FOUR_ROWS * spitch; \ } \ \ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); \ \ /* Draw two rows at a time, in the most efficient way. */ \ while (h >= TWO_ROWS) { \ h -= TWO_ROWS; \ \ /* First, check if the blit can be done using unaligned fixed-size operations for two rows at a time. */ \ if (swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, w, xdir, dpitch, spitch)) \ ; \ else \ { \ /* NOTE: Assumes that all copies of less than 8 pixels succeed, so return values are not checked. */ \ swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); \ swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, w % 8, xdir, dpitch, spitch); \ } \ \ dst += TWO_ROWS * dpitch; \ src += TWO_ROWS * spitch; \ } \ \ /* Draw one row at a time, in the most efficient way. */ \ while (h >= ONE_ROW) { \ h -= ONE_ROW; \ \ /* First, check if the blit can be done using unaligned fixed-size operations for one row at a time. */ \ if (swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, w, xdir)) \ ; \ else \ { \ /* NOTE: Assumes that all copies of less than 8 pixels succeed, so return values are not checked. */ \ swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir); \ swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, w % 8, xdir); \ } \ \ dst += ONE_ROW * dpitch; \ src += ONE_ROW * spitch; \ } \ \ UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); \ \ } while (0) /* Draw multiple rows with small fixed width functions in a negative X direction. */ #define DRAW_MULTIPLE_ROWS_WITH_NEGATIVE_XDIR(BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS) \ do { \ /* Draw four rows at a time, in the most efficient way. */ \ while (h >= FOUR_ROWS) { \ h -= FOUR_ROWS; \ \ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); \ \ /* First, check if the blit can be done using unaligned fixed-size operations for four rows at a time. */ \ if (swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, w, xdir, dpitch, spitch)) \ ; \ else \ { \ /* NOTE: Assumes that all copies of less than 8 pixels succeed, so return values are not checked. 
*/ \ swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, w % 8, xdir, dpitch, spitch); \ swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); \ } \ \ UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); \ \ dst += FOUR_ROWS * dpitch; \ src += FOUR_ROWS * spitch; \ } \ \ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); \ \ /* Draw two rows at a time, in the most efficient way. */ \ while (h >= TWO_ROWS) { \ h -= TWO_ROWS; \ \ /* First, check if the blit can be done using unaligned fixed-size operations for two rows at a time. */ \ if (swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, w, xdir, dpitch, spitch)) \ ; \ else \ { \ /* NOTE: Assumes that all copies of less than 8 pixels succeed, so return values are not checked. */ \ swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, w % 8, xdir, dpitch, spitch); \ swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); \ } \ \ dst += TWO_ROWS * dpitch; \ src += TWO_ROWS * spitch; \ } \ \ /* Draw one row at a time, in the most efficient way. */ \ while (h >= ONE_ROW) { \ h -= ONE_ROW; \ \ /* First, check if the blit can be done using unaligned fixed-size operations for one row at a time. */ \ if (swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, w, xdir)) \ ; \ else \ { \ /* NOTE: Assumes that all copies of less than 8 pixels succeed, so return values are not checked. */ \ swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, w % 8, xdir); \ swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir); \ } \ \ dst += ONE_ROW * dpitch; \ src += ONE_ROW * spitch; \ } \ \ UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); \ \ } while (0) /* Copy a rectangle of 16bpp pixels, using fixed width functions only. */ /* (Pointers are assumed to be half-word-aligned, which should be guaranteed for 16bpp.) */ static inline void swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, int xdir, int ydir, int dpitch, int spitch, BOOL rowsOverlap, BOOL blockSignalsForVFP) { // It is critically important to keep in mind what the ordering requirements are for the functions below. The following rules must be followed: // (1) neon_memcpy() and memcpy() should only be called when 'rowsOverlap' is false. // (These functions are usually faster than neon_memmove() and memmove(), so they are preferred when possible.) // (2) neon_memmove() and memmove() *MUST* be called when 'rowsOverlap' is true. // (3) Calls to X_memcpy()/X_memmove() for multiple rows must be ordered by ydir. However, since spitch and dpitch have signs that match ydir, // it is sufficient to calculate source and destination addresses with spitch and dpitch to guarantee this ordering. // (4) Single calls to swCopy16BppSmallFixedWidths1Row_Unaligned(), swCopy16BppSmallFixedWidths2Rows_Unaligned(), and swCopy16BppSmallFixedWidths4Rows_Unaligned() // may be made for any combination of xdir and ydir. // (5) However, notwithstanding the point above, if multiple calls to the three swCopy16BppSmallFixedWidthsxRow(s)_Unaligned() functions are made // for the same row, then calls must be made from left-to-right for xdir=1 and right-to-left for xdir=-1. // Violating the above rules may not obviously generate incorrect results, but full compliance tests will probably fail for obscure corner cases. 
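// --------------------------------------------------------------------------
// Added worked example of rule (3) above: the *caller* gives spitch/dpitch
// their signs. swBlit() below does exactly this for overlapping bottom-up
// copies -- it biases the row pointers to the last row and negates both
// pitches, so the plain "dst += dpitch; src += spitch" row loops in the
// macros above still visit rows in the required ydir order:
//
//   if (dstLine >= srcLine) {     /* overlap forces a bottom-up copy */
//       ydir = -1;
//       src += (h - 1) * spitch;  /* start at the last source row */
//       dst += (h - 1) * dpitch;  /* start at the last destination row */
//       spitch = -spitch;         /* stepping by the pitch now moves upward */
//       dpitch = -dpitch;
//   }
// --------------------------------------------------------------------------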
const int FOUR_ROWS = 4; const int TWO_ROWS = 2; const int ONE_ROW = 1; // Prefer memcpy() and memmove() when copies are wide. if (w >= 16) { // Just check for rows overlapping, which is all that is needed to distinguish between memmove() and memcpy(). // NOTE: memcpy() is generally faster than neon_memcpy() up to about 128 bytes and specialized draw functions are faster for up to 32 bytes. if (rowsOverlap) { if (w > 64) { DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); } else if (w == 64) { DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); } else { DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP); } } else { if (w > 64) { DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); } else if (w == 64) { DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); } else { DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP); } } } // Handle remaining cases -- including reverse (buffered) copies. else { if (xdir >= 0) { DRAW_MULTIPLE_ROWS_WITH_POSITIVE_XDIR(BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); } else { DRAW_MULTIPLE_ROWS_WITH_NEGATIVE_XDIR(BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); } } } /* Copy a rectangle of 8bpp pixels. (No alignment assumed.) */ /* NOTE: This is probably only needed for 12bpp planar video copies. */ static inline void swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, int xdir, int ydir, int dpitch, int spitch, BOOL rowsOverlap, BOOL blockSignalsForVFP) { const int ONE_ROW = 1; while (h > 0) { h -= ONE_ROW; // Handle remaining cases -- including reverse (buffered) copies. // NOTE: memcpy() is generally faster than neon_memcpy() up to about 128 bytes. if (xdir >= 0 || !rowsOverlap) { if (w >= 128) { BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); neon_memcpy(dst, src, w); UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); } else memcpy(dst, src, w); } else { if (w >= 128) { BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); neon_memmove(dst, src, w); UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); } else memmove(dst, src, w); } dst += dpitch; src += spitch; } } /* Copy a rectangle of 16bpp pixels. */ /* (Pointers are assumed to be half-word-aligned, which should be guaranteed for 16bpp.) */ static inline void swCopyRect16Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, int xdir, int ydir, int dpitch, int spitch, BOOL rowsOverlap, BOOL blockSignalsForVFP) { // Handle single-pixel width columns as a special case. // Since this function only requires half-word-alignment, which is guaranteed at this point, // it's safe to call now with no further tests. if (w == 1) { swCopy2ByteWideRectangle_HalfWordAligned(dst, src, h, dpitch, spitch); return; } swCopyRect16BppFixedWidth_Unaligned(dst, src, w, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP); } /* Copy a rectangle of 24bpp pixels. */ /* (Pointers are not assumed to be aligned for 24bpp.) */ static inline void swCopyRect24Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, int xdir, int ydir, int dpitch, int spitch, BOOL rowsOverlap, BOOL blockSignalsForVFP) { // TODO: Make optimized draws for fixed sizes? 
Performance for this case is likely to be pretty poor. const int ONE_ROW = 1; while (h > 0) { h -= ONE_ROW; // Handle remaining cases -- including reverse (buffered) copies. // NOTE: memcpy() is generally faster than neon_memcpy() up to about 128 bytes. if (xdir >= 0 || !rowsOverlap) { if (w >= 42) { BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); neon_memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL); UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); } else memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL); } else { if (w >= 42) { BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); neon_memmove(dst, src, w * BYTES_PER_24BPP_PIXEL); UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); } else memmove(dst, src, w * BYTES_PER_24BPP_PIXEL); } dst += dpitch; src += spitch; } } /* Copy a line of 32bpp pixels. */ /* (Pointers are assumed to be word-aligned, which should be guaranteed for 32bpp.) */ static inline void swCopyRect32Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, int xdir, int ydir, int dpitch, int spitch, BOOL rowsOverlap, BOOL blockSignalsForVFP) { // As a pretty good first pass at optimization, use the 16bpp code to draw 32bpp rectangles. swCopyRect16Bpp_Unaligned(dst, src, w * BYTES_PER_UINT32_T / BYTES_PER_UINT16_T, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP); } /* Perform a software blit */ void swBlit(MSMPtr pMsm, MSMBlitRec *blit, int bpp, BOOL blockSignalsForVFP) { int h = blit->dstRect->h; int w = blit->dstRect->w; int cpp = bpp / 8; int spitch = blit->src->pitch; int dpitch = blit->dst->pitch; uint8_t *srcLine; uint8_t *src; uint8_t *dstLine; uint8_t *dst; int src_x = blit->srcRect->x; int dst_x = blit->dstRect->x; int xdir = 1; int ydir = 1; if (blit->src->flags & MSM_BLIT_GEM) { struct msm_drm_bo *bo = (struct msm_drm_bo *) blit->src->priv[0]; msm_drm_bo_map(bo); srcLine = (uint8_t *)bo->virt; } else { srcLine = (uint8_t *) pMsm->fbmem + blit->src->priv[0]; } if (blit->dst->flags & MSM_BLIT_GEM) { struct msm_drm_bo *bo = (struct msm_drm_bo *) blit->dst->priv[0]; msm_drm_bo_map(bo); dstLine = (uint8_t *) bo->virt; } else { dstLine = (uint8_t *) pMsm->fbmem + blit->dst->priv[0]; } if (srcLine == NULL || dstLine == NULL) { if (blit->src->flags & MSM_BLIT_GEM) msm_drm_bo_unmap((struct msm_drm_bo *) blit->src->priv[0]); if (blit->dst->flags & MSM_BLIT_GEM) msm_drm_bo_unmap((struct msm_drm_bo *) blit->dst->priv[0]); return; } srcLine += (blit->srcRect->y * spitch); dstLine += (blit->dstRect->y * dpitch); src = srcLine + blit->srcRect->x * cpp; dst = dstLine + blit->dstRect->x * cpp; // This trivial one-pixel copy is independent of xdir and ydir, so it can be done before the overlap check. // (This makes the 1x1 copy case significantly faster and there is reason to believe this is a common case.) if (h == 1 && w == 1) { switch (bpp) { case 8: *dst = *src; break; case 16: *(uint16_t *)dst = *(uint16_t *)src; break; case 24: { uint8_t src1a = *(src+0); uint8_t src1b = *(src+1); uint8_t src1c = *(src+2); *(dst+0) = src1a; *(dst+1) = src1b; *(dst+2) = src1c; } break; case 32: *(uint32_t *)dst = *(uint32_t *)src; break; default: break; } goto unmapbo; } // If starting destination and source lines are the same, // and rectangles overlap, then buffering of complete rows or reverse row-copies are required. BOOL rowsOverlap = FALSE; if (isOverlap(blit, bpp)) { // If destination and source rectangles overlap and destination is to right of source, then copies must be done from the right to the left. 
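// Worked example: with src_x = 0, dst_x = 2, w = 4 on a single row, a left-to-right copy would
// overwrite source pixels 2 and 3 before they are read; copying right-to-left (xdir = -1)
// reads each source pixel before the destination writes can clobber it.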
if (dst_x > src_x) { xdir = -1; } // If destination and source rectangles overlap and destination is lower than source (or same position as), then copies must be done from the bottom up. if (dstLine >= srcLine) { ydir = -1; src += ((h - 1) * spitch); dst += ((h - 1) * dpitch); spitch = -spitch; dpitch = -dpitch; } // If initial destination line is equal to the initial source line then each row overlaps (requiring memmove() or equivalent). if (dstLine == srcLine) rowsOverlap = TRUE; } // Call BPP-specific code to draw pixels. switch (bpp) { case 8: swCopyRect8Bpp_Unaligned(dst, src, w, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP); break; case 16: swCopyRect16Bpp_Unaligned(dst, src, w, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP); break; case 24: swCopyRect24Bpp_Unaligned(dst, src, w, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP); break; case 32: swCopyRect32Bpp_Unaligned(dst, src, w, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP); break; default: break; } unmapbo: if (blit->src->flags & MSM_BLIT_GEM) msm_drm_bo_unmap((struct msm_drm_bo *) blit->src->priv[0]); if (blit->dst->flags & MSM_BLIT_GEM) msm_drm_bo_unmap((struct msm_drm_bo *) blit->dst->priv[0]); } /* Perform a software blit, but assume no overlap. */ void swBlit_NoOverlap(unsigned char * __restrict__ dst, unsigned char * __restrict__ src, int w, int h, int dpitch, int spitch, int bpp, BOOL blockSignalsForVFP) { // Trivial one-pixel copy. // (This makes the 1x1 copy case significantly faster and there is reason to believe this is a common case.) if (h == 1 && w == 1) { switch (bpp) { case 8: *dst = *src; break; case 16: *(uint16_t *)dst = *(uint16_t *)src; break; case 24: { uint8_t src1a = *(src+0); uint8_t src1b = *(src+1); uint8_t src1c = *(src+2); *(dst+0) = src1a; *(dst+1) = src1b; *(dst+2) = src1c; } break; case 32: *(uint32_t *)dst = *(uint32_t *)src; break; default: break; } return; } // Call BPP-specific code to draw pixels. const int xdir = 1, ydir = 1, rowsOverlap = FALSE; switch (bpp) { case 8: swCopyRect8Bpp_Unaligned(dst, src, w, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP); break; case 16: swCopyRect16Bpp_Unaligned(dst, src, w, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP); break; case 24: swCopyRect24Bpp_Unaligned(dst, src, w, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP); break; case 32: swCopyRect32Bpp_Unaligned(dst, src, w, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP); break; default: return; } } xf86-video-msm/src/msm-driver.c0000755000175000017500000010023011615776600016507 0ustar paulliupaulliu/* msm-driver.c * * Copyright (c) 2009-2010, Code Aurora Forum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Code Aurora nor * the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include "xf86.h" #include "damage.h" #include "xf86_OSlib.h" #include "xf86Crtc.h" #include "mipointer.h" #include "mibstore.h" #include "micmap.h" #include "fb.h" #include "dixstruct.h" #include "msm.h" #include "msm-drm.h" #if USEDRI #define _XF86DRI_SERVER_ #endif #if USEDRI2 #include "xf86drm.h" #endif #define MSM_NAME "msm" #define MSM_DRIVER_NAME "msm" #define MSM_VERSION_MAJOR PACKAGE_VERSION_MAJOR #define MSM_VERSION_MINOR PACKAGE_VERSION_MINOR #define MSM_VERSION_PATCH PACKAGE_VERSION_PATCHLEVEL #define MSM_VERSION_CURRENT \ ((MSM_VERSION_MAJOR << 20) |\ (MSM_VERSION_MINOR << 10) | \ (MSM_VERSION_PATCH)) /* List of available strings for fbCache support */ static const char *fbCacheStrings[] = { #ifdef MSMFB_GET_PAGE_PROTECTION [MDP_FB_PAGE_PROTECTION_NONCACHED] = "Noncached", [MDP_FB_PAGE_PROTECTION_WRITECOMBINE] = "WriteCombine", [MDP_FB_PAGE_PROTECTION_WRITETHROUGHCACHE] = "WriteThroughCache", [MDP_FB_PAGE_PROTECTION_WRITEBACKCACHE] = "WriteBackCache", [MDP_FB_PAGE_PROTECTION_WRITEBACKWACACHE] = "WriteBackWACache", #endif NULL }; /* This enumerates all of the available options */ typedef enum { OPTION_FB, OPTION_NOACCEL, OPTION_SWBLIT, OPTION_DRI, OPTION_DRI2, OPTION_SWCURSOR, OPTION_VSYNC, OPTION_SOCKGID, OPTION_NOSIGBLOCK, OPTION_FASTFILL, OPTION_FASTCOMPOSITE, OPTION_FASTCOMPOSITEREPEAT, OPTION_FASTVIDEOMEMCOPY, OPTION_FASTAPPFBMEMCOPY, OPTION_FBCACHE, OPTION_PIXMAP_MEMTYPE, } MSMOpts; /* An aray containing the options that the user can configure in xorg.conf */ static const OptionInfoRec MSMOptions[] = { {OPTION_FB, "fb", OPTV_STRING, {0}, FALSE}, {OPTION_NOACCEL, "NoAccel", OPTV_BOOLEAN, {0}, FALSE}, {OPTION_SWBLIT, "SWBlit", OPTV_BOOLEAN, {0}, FALSE}, {OPTION_DRI, "DRI", OPTV_BOOLEAN, {0}, FALSE}, {OPTION_DRI2, "DRI2", OPTV_BOOLEAN, {0}, FALSE}, {OPTION_SWCURSOR, "SWCursor", OPTV_BOOLEAN, {0}, FALSE}, {OPTION_SOCKGID, "SocketGID", OPTV_STRING, {0}, FALSE}, {OPTION_VSYNC, "DefaultVsync", OPTV_INTEGER, {0}, FALSE}, {OPTION_NOSIGBLOCK, "NoSigBlock", OPTV_BOOLEAN, {0}, FALSE}, {OPTION_FASTFILL, "FastFill", OPTV_BOOLEAN, {0}, FALSE}, {OPTION_FASTCOMPOSITE, "FastComposite", OPTV_BOOLEAN, {0}, FALSE}, {OPTION_FASTCOMPOSITEREPEAT, "FastCompositeRepeat", OPTV_BOOLEAN, {0}, FALSE}, {OPTION_FASTVIDEOMEMCOPY, "FastVideoMemCopy", OPTV_BOOLEAN, {0}, FALSE}, {OPTION_FASTAPPFBMEMCOPY, "FastAppFBMemCopy", OPTV_BOOLEAN, {0}, FALSE}, {OPTION_FBCACHE, "FBCache", OPTV_STRING, {0}, FALSE}, {OPTION_PIXMAP_MEMTYPE, "PixmapMemtype", OPTV_STRING, {0}, FALSE}, {-1, NULL, OPTV_NONE, {0}, FALSE} }; /** Return a string for the given chipset type */ static char * msmGetChipset(int chipID) { switch (chipID) { case MSM_TYPE_7201: return "7201"; 
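/* NOTE: the break below is unreachable after the return above; it is harmless. */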
break; case MSM_TYPE_8X50: return "8x50"; } return ""; } #if USEDRI2 static Bool MSMInitDRM(ScrnInfoPtr pScrn) { MSMPtr pMsm = MSMPTR(pScrn); int i, fd; drmVersionPtr version; drmSetVersion sv; int ret; /* Ugly, huh? */ pMsm->drmFD = 0; pMsm->drmDevName[0] = '\0'; for(i = 0; i < DRM_MAX_MINOR; i++) { int ret = -1; snprintf(pMsm->drmDevName, sizeof(pMsm->drmDevName), DRM_DEV_NAME, DRM_DIR_NAME, i); fd = open(pMsm->drmDevName, O_RDWR); if (fd < 0) continue; version = drmGetVersion(fd); if (version) ret = strcmp(version->name, "kgsl"); drmFreeVersion(version); if (!ret) break; close(fd); } if (i ==DRM_MAX_MINOR) { xf86DrvMsg(pScrn->scrnIndex, X_WARNING, "Unable to open a DRM device\n"); close(fd); return FALSE; } sv.drm_di_major = 1; sv.drm_di_minor = 1; sv.drm_dd_major = -1; sv.drm_dd_minor = -1; ret = drmSetInterfaceVersion(fd, &sv); if (ret != 0) { xf86DrvMsg(pScrn->scrnIndex, X_WARNING, "Unable to set the DRM version: %d\n", ret); close(fd); return FALSE; } pMsm->drmFD = fd; return TRUE; } #endif /* Get the current mode from the framebuffer mode and * convert it into xfree86 timings */ static void MSMGetDefaultMode(MSMPtr pMsm) { char name[32]; sprintf(name, "%dx%d", pMsm->mode_info.xres, pMsm->mode_info.yres); pMsm->default_mode.name = strdup(name); if (pMsm->default_mode.name == NULL) pMsm->default_mode.name = ""; pMsm->default_mode.next = &pMsm->default_mode; pMsm->default_mode.prev = &pMsm->default_mode; pMsm->default_mode.type |= M_T_BUILTIN | M_T_PREFERRED; pMsm->default_mode.HDisplay = pMsm->mode_info.xres; pMsm->default_mode.HSyncStart = pMsm->default_mode.HDisplay + pMsm->mode_info.right_margin; pMsm->default_mode.HSyncEnd = pMsm->default_mode.HSyncStart + pMsm->mode_info.hsync_len; pMsm->default_mode.HTotal = pMsm->default_mode.HSyncEnd + pMsm->mode_info.left_margin; pMsm->default_mode.VDisplay = pMsm->mode_info.yres; pMsm->default_mode.VSyncStart = pMsm->default_mode.VDisplay + pMsm->mode_info.lower_margin; pMsm->default_mode.VSyncEnd = pMsm->default_mode.VSyncStart + pMsm->mode_info.vsync_len; pMsm->default_mode.VTotal = pMsm->default_mode.VSyncEnd + pMsm->mode_info.upper_margin; /* The clock number we get is not the actual pixclock for the display, * which automagically updates at a fixed rate. There is no good way * to automatically figure out the fixed rate, so we use a config * value */ pMsm->default_mode.Clock = (pMsm->defaultVsync * pMsm->default_mode.HTotal * pMsm->default_mode.VTotal) / 1000; pMsm->default_mode.CrtcHDisplay = pMsm->default_mode.HDisplay; pMsm->default_mode.CrtcHSyncStart = pMsm->default_mode.HSyncStart; pMsm->default_mode.CrtcHSyncEnd = pMsm->default_mode.HSyncEnd; pMsm->default_mode.CrtcHTotal = pMsm->default_mode.HTotal; pMsm->default_mode.CrtcVDisplay = pMsm->default_mode.VDisplay; pMsm->default_mode.CrtcVSyncStart = pMsm->default_mode.VSyncStart; pMsm->default_mode.CrtcVSyncEnd = pMsm->default_mode.VSyncEnd; pMsm->default_mode.CrtcVTotal = pMsm->default_mode.VTotal; pMsm->default_mode.CrtcHAdjusted = FALSE; pMsm->default_mode.CrtcVAdjusted = FALSE; } static Bool MSMCrtcResize(ScrnInfoPtr pScrn, int width, int height) { return TRUE; } static const xf86CrtcConfigFuncsRec MSMCrtcConfigFuncs = { MSMCrtcResize, }; static int _getgid(const char *gid, gid_t *ret) { struct group *grp; gid_t g; grp = getgrnam(gid); if (grp != NULL) { *ret = grp->gr_gid; return 0; } g = strtoul(gid, NULL, 0); if (g != 0) { grp = getgrgid(g); if (grp != NULL) { *ret = grp->gr_gid; return 0; } } return -1; } /* A simple case-insenstive string comparison function. 
*/ static int stricmp(const char *left, const char *right) { const int MAXSTRINGLEN = 100; char leftCopy[MAXSTRINGLEN], rightCopy[MAXSTRINGLEN]; int i; // Make temporary copies of comparison strings. strncpy(leftCopy,left, MAXSTRINGLEN); strncpy(rightCopy,right, MAXSTRINGLEN); // Convert English upper-case characters to lower-case. i = 0; while (leftCopy[i] != '\0') { if (leftCopy[i] >= 'A' && leftCopy[i] <= 'Z') leftCopy[i] += 'a' - 'A'; i++; } i = 0; while (rightCopy[i] != '\0') { if (rightCopy[i] >= 'A' && rightCopy[i] <= 'Z') rightCopy[i] += 'a' - 'A'; i++; } return strcmp(leftCopy, rightCopy); } /* This is the main initialization function for the screen */ static Bool MSMPreInit(ScrnInfoPtr pScrn, int flags) { MSMPtr pMsm; EntityInfoPtr pEnt; char *dev, *gid, *str; int mdpver, panelid; int depth, fbbpp; OptionInfoPtr options; rgb defaultWeight = { 0, 0, 0 }; int vsync; /* Omit ourselves from auto-probing (which is bound to * fail on our hardware anyway) */ if (flags & PROBE_DETECT) return FALSE; if (pScrn->numEntities != 1) { return FALSE; } /* Just use the current monitor specified in the * xorg.conf. This really means little to us since * we have no choice over which monitor is used, * but X needs this to be set */ pScrn->monitor = pScrn->confScreen->monitor; /* Allocate room for our private data */ if (pScrn->driverPrivate == NULL) pScrn->driverPrivate = xnfcalloc(sizeof(MSMRec), 1); pMsm = MSMPTR(pScrn); if (pMsm == NULL) { ErrorF("Unable to allocate memory\n"); return FALSE; } pEnt = xf86GetEntityInfo(pScrn->entityList[0]); /* Open the FB device specified by the user */ dev = xf86FindOptionValue(pEnt->device->options, "fb"); pMsm->fd = open(dev, O_RDWR, 0); if (pMsm->fd < 0) { xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Opening '%s' failed: %s\n", dev, strerror(errno)); xfree(pMsm); return FALSE; } /* Unblank the screen if it was previously blanked */ ioctl(pMsm->fd, FBIOBLANK, FB_BLANK_UNBLANK); /* Make sure the software refresher is on */ ioctl(pMsm->fd, MSMFB_RESUME_SW_REFRESHER, 0); /* Get the fixed info (par) structure */ if (ioctl(pMsm->fd, FBIOGET_FSCREENINFO, &pMsm->fixed_info)) { xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Unable to read hardware info from %s: %s\n", dev, strerror(errno)); xfree(pMsm); return FALSE; } /* Parse the ID and figure out what version of the MDP and what * panel ID we have */ if (sscanf(pMsm->fixed_info.id, "msmfb%d_%x", &mdpver, &panelid) < 2) { xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Unable to determine the MDP and panel type\n"); xfree(pMsm); return FALSE; } switch (mdpver) { case 22: pMsm->chipID = MSM_TYPE_7201; break; case 31: pMsm->chipID = MSM_TYPE_8X50; break; } /* FIXME: If we want to parse the panel type, it happens here */ /* Setup memory */ /* FIXME: This is where we will be in close communication with * the fbdev driver to allocate memory. In the mean time, we * just reuse the framebuffer memory */ pScrn->videoRam = pMsm->fixed_info.smem_len; /* Get the current screen setting */ if (ioctl(pMsm->fd, FBIOGET_VSCREENINFO, &pMsm->mode_info)) { xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Unable to read the current mode from %s: %s\n", dev, strerror(errno)); xfree(pMsm); return FALSE; } /* Fixme: 16bpp and 24bpp for now; other depths later */ /* -- but only when they can be tested. */ /* (Tested on 16bpp Alaska and SURF and 24bpp ASUS CAT.) 
*/ if (pMsm->mode_info.bits_per_pixel != 16 && pMsm->mode_info.bits_per_pixel != 24) { xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "The driver can only support 16bpp and 24bpp output\n"); xfree(pMsm); return FALSE; } switch(pMsm->mode_info.bits_per_pixel) { case 16: depth = 16; fbbpp = 16; break; case 24: depth = 24; if (pMsm->mode_info.transp.offset == 24 && pMsm->mode_info.transp.length == 8) fbbpp = 32; else fbbpp = 24; break; default: xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "The driver can only support 16bpp and 24bpp output\n"); xfree(pMsm); return FALSE; } if (!xf86SetDepthBpp(pScrn, depth, 16, fbbpp, Support24bppFb | Support32bppFb | SupportConvert32to24 | SupportConvert24to32)) { ErrorF("Unable to set bitdepth\n"); xfree(pMsm); return FALSE; } xf86PrintDepthBpp(pScrn); pScrn->rgbBits = 8; if (!xf86SetWeight(pScrn, defaultWeight, defaultWeight)) { xfree(pMsm); return FALSE; } /* Initialize default visual */ if (!xf86SetDefaultVisual(pScrn, -1)) { xfree(pMsm); return FALSE; } { Gamma zeros = { 0.0, 0.0, 0.0 }; if (!xf86SetGamma(pScrn, zeros)) { xfree(pMsm); return FALSE; } } pScrn->progClock = TRUE; pScrn->chipset = MSM_DRIVER_NAME; xf86DrvMsg(pScrn->scrnIndex, X_INFO, "MSM %s variant (video memory:" " %dkB)\n", msmGetChipset(pMsm->chipID), pScrn->videoRam / 1024); /* Default options with no conf settings */ pMsm->xvports = 3; xf86CollectOptions(pScrn, NULL); /* We need to allocate this memory here because we have multiple * screens, and we can't go writing on the MSMOptions structure */ options = xalloc(sizeof(MSMOptions)); if (options == NULL) { xfree(pMsm); return FALSE; } memcpy(options, MSMOptions, sizeof(MSMOptions)); xf86ProcessOptions(pScrn->scrnIndex, pScrn->options, options); /* NoAccel - default TRUE */ pMsm->accel = xf86ReturnOptValBool(options, OPTION_NOACCEL, TRUE); /* SWBlit - default FALSE */ pMsm->useSWBlit = xf86ReturnOptValBool(options, OPTION_SWBLIT, FALSE); #if USEDRI2 /* DRI2 - default TRUE */ pMsm->useDRI2 = xf86ReturnOptValBool(options, OPTION_DRI2, TRUE); #else pMsm->useDRI2 = FALSE; #endif /* DRI - default FALSE */ if (pMsm->useDRI2 == FALSE) pMsm->useDRI = xf86ReturnOptValBool(options, OPTION_DRI, FALSE); else pMsm->useDRI = FALSE; /* SWCursor - default FALSE */ pMsm->HWCursor = !xf86ReturnOptValBool(options, OPTION_SWCURSOR, FALSE); /* DefaultVsync - default 60 */ pMsm->defaultVsync = 60; if (xf86GetOptValInteger(options, OPTION_VSYNC, &vsync)) { if (vsync > 0 && vsync < 120) pMsm->defaultVsync = vsync; } /* NoSigBlock - default TRUE */ pMsm->NoSigBlock = xf86ReturnOptValBool(options, OPTION_NOSIGBLOCK, TRUE); /* SocketGID - default effective GID */ pMsm->socketGID = getegid(); gid = xf86GetOptValString(options, OPTION_SOCKGID); if (gid && _getgid(gid, &pMsm->socketGID)) xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Socket GID %s is not valid\n", gid); /* FastFill - default TRUE */ pMsm->FastFill = xf86ReturnOptValBool(options, OPTION_FASTFILL, TRUE); /* FastComposite - default FALSE */ pMsm->FastComposite = xf86ReturnOptValBool(options, OPTION_FASTCOMPOSITE, FALSE); /* FastCompositeRepeat - default FALSE */ pMsm->FastCompositeRepeat = xf86ReturnOptValBool(options, OPTION_FASTCOMPOSITEREPEAT, FALSE); /* FastVideoMemCopy - default FALSE */ pMsm->FastVideoMemCopy = xf86ReturnOptValBool(options, OPTION_FASTVIDEOMEMCOPY, FALSE); /* FastAppFBMemCopy - default FALSE */ pMsm->FastAppFBMemCopy = xf86ReturnOptValBool(options, OPTION_FASTAPPFBMEMCOPY, FALSE); /* FBCache - default WriteThroughCache */ pMsm->FBCache = MDP_FB_PAGE_PROTECTION_WRITETHROUGHCACHE; str = 
xf86GetOptValString(options, OPTION_FBCACHE); if (str) { int i; for(i = 0; fbCacheStrings[i] != NULL; i++) { if (!stricmp(str, fbCacheStrings[i])) { pMsm->FBCache = i; break; } } if (fbCacheStrings[i] == NULL) xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Invalid FBCache '%s'\n", str); } /* PixmapMemtype - default KMEM */ pMsm->pixmapMemtype = MSM_DRM_MEMTYPE_KMEM; str = xf86GetOptValString(options, OPTION_PIXMAP_MEMTYPE); if (str) { /* No for loop here because the memory types are masks, not indexes */ if (!stricmp(str, "KMEM")) pMsm->pixmapMemtype = MSM_DRM_MEMTYPE_KMEM; else if (!stricmp(str, "UncachedKMEM")) pMsm->pixmapMemtype = MSM_DRM_MEMTYPE_KMEM_NOCACHE; else if (!stricmp(str, "EBI")) pMsm->pixmapMemtype = MSM_DRM_MEMTYPE_EBI; else if (!stricmp(str, "SMI")) pMsm->pixmapMemtype = MSM_DRM_MEMTYPE_SMI; else xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Invalid pixmap memory type %s\n", str); } /* Other drivers will keep this copy of the options in the private rec. * I don't see any reason to do that unless we have other functions * that need it */ xfree(options); #if USEDRI2 if (pMsm->useDRI2 && !MSMInitDRM(pScrn)) { xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "DRI2: Unable to open DRM\n"); pMsm->useDRI2 = FALSE; } #endif /* Set up the binder socket */ MSMBinderInit(pMsm); /* Set up the virtual size */ pScrn->virtualX = pScrn->display->virtualX > pMsm->mode_info.xres ? pScrn->display->virtualX : pMsm->mode_info.xres; pScrn->virtualY = pScrn->display->virtualY > pMsm->mode_info.yres ? pScrn->display->virtualY : pMsm->mode_info.yres; if (pScrn->virtualX > pMsm->mode_info.xres_virtual) pScrn->virtualX = pMsm->mode_info.xres_virtual; if (pScrn->virtualY > pMsm->mode_info.yres_virtual) pScrn->virtualY = pMsm->mode_info.yres_virtual; /* displayWidth is the width of the line in pixels */ /* The framebuffer driver should always report the line length, * but in case it doesn't, we can calculate it ourselves */ if (pMsm->fixed_info.line_length) { pScrn->displayWidth = pMsm->fixed_info.line_length; } else { pScrn->displayWidth = pMsm->mode_info.xres_virtual * pMsm->mode_info.bits_per_pixel / 8; } pScrn->displayWidth /= (pScrn->bitsPerPixel / 8); /* Set up the view port */ pScrn->frameX0 = 0; pScrn->frameY0 = 0; pScrn->frameX1 = pMsm->mode_info.xres; pScrn->frameY1 = pMsm->mode_info.yres; MSMGetDefaultMode(pMsm); /* Make a copy of the mode - this is important, because some * where in the RandR setup, these modes get deleted */ pScrn->modes = xf86DuplicateMode(&pMsm->default_mode); pScrn->currentMode = pScrn->modes; /* Set up the colors - this is from fbdevhw, which implies * that it is important for TrueColor and DirectColor modes */ pScrn->offset.red = pMsm->mode_info.red.offset; pScrn->offset.green = pMsm->mode_info.green.offset; pScrn->offset.blue = pMsm->mode_info.blue.offset; pScrn->mask.red = ((1 << pMsm->mode_info.red.length) - 1) << pMsm->mode_info.red.offset; pScrn->mask.green = ((1 << pMsm->mode_info.green.length) - 1) << pMsm->mode_info.green.offset; pScrn->mask.blue = ((1 << pMsm->mode_info.blue.length) - 1) << pMsm->mode_info.blue.offset; xf86CrtcConfigInit(pScrn, &MSMCrtcConfigFuncs); MSMCrtcSetup(pScrn); xf86CrtcSetSizeRange(pScrn, pMsm->mode_info.xres, pMsm->mode_info.yres, pMsm->mode_info.xres, pMsm->mode_info.yres); /* Setup the output */ MSMOutputSetup(pScrn); if (!xf86InitialConfiguration(pScrn, FALSE)) { xfree(pMsm); return FALSE; } xf86PrintModes(pScrn); /* FIXME: We will probably need to be more exact when setting * the DPI. 
For now, we just use the default (96,96 I think) */ xf86SetDpi(pScrn, 0, 0); xf86DrvMsg(pScrn->scrnIndex, X_INFO, "MSM Options:\n"); xf86DrvMsg(pScrn->scrnIndex, X_INFO, " HW Accel: %s\n", pMsm->accel ? "Enabled" : "Disabled"); xf86DrvMsg(pScrn->scrnIndex, X_INFO, " SW Blit: %s\n", pMsm->useSWBlit ? "Enabled" : "Disabled"); xf86DrvMsg(pScrn->scrnIndex, X_INFO, " DRI: %s\n", pMsm->useDRI ? "Enabled" : "Disabled"); xf86DrvMsg(pScrn->scrnIndex, X_INFO, " DRI2: %s\n", pMsm->useDRI2 ? "Enabled" : "Disabled"); xf86DrvMsg(pScrn->scrnIndex, X_INFO, " HW Cursor: %s\n", pMsm->HWCursor ? "Enabled" : "Disabled"); xf86DrvMsg(pScrn->scrnIndex, X_INFO, " Default Vsync: %d\n", pMsm->defaultVsync); xf86DrvMsg(pScrn->scrnIndex, X_INFO, " NoSigBlock: %s\n", pMsm->NoSigBlock ? "Enabled" : "Disabled"); xf86DrvMsg(pScrn->scrnIndex, X_INFO, " FastFill: %s\n", pMsm->FastFill ? "Enabled" : "Disabled"); xf86DrvMsg(pScrn->scrnIndex, X_INFO, " FastComposite: %s\n", pMsm->FastComposite ? "Enabled" : "Disabled"); xf86DrvMsg(pScrn->scrnIndex, X_INFO, " FastCompositeRepeat: %s\n", pMsm->FastCompositeRepeat ? "Enabled" : "Disabled"); xf86DrvMsg(pScrn->scrnIndex, X_INFO, " FastVideoMemCopy: %s\n", pMsm->FastVideoMemCopy ? "Enabled" : "Disabled"); xf86DrvMsg(pScrn->scrnIndex, X_INFO, " FastAppFBMemCopy: %s\n", pMsm->FastAppFBMemCopy ? "Enabled" : "Disabled"); xf86DrvMsg(pScrn->scrnIndex, X_INFO, " FBCache: %s\n", fbCacheStrings[pMsm->FBCache]); switch(pMsm->pixmapMemtype) { case MSM_DRM_MEMTYPE_KMEM: xf86DrvMsg(pScrn->scrnIndex, X_INFO, " Pixmap: KMEM\n"); break; case MSM_DRM_MEMTYPE_KMEM_NOCACHE: xf86DrvMsg(pScrn->scrnIndex, X_INFO, " Pixmap: Uncached KMEM\n"); break; case MSM_DRM_MEMTYPE_EBI: xf86DrvMsg(pScrn->scrnIndex, X_INFO, " Pixmap: EBI\n"); break; case MSM_DRM_MEMTYPE_SMI: xf86DrvMsg(pScrn->scrnIndex, X_INFO, " Pixmap: SMI\n"); break; } return TRUE; } static Bool MSMSaveScreen(ScreenPtr pScreen, int mode) { /* Nothing to do here, yet */ return TRUE; } static Bool MSMCloseScreen(int scrnIndex, ScreenPtr pScreen) { ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum]; MSMPtr pMsm = MSMPTR(pScrn); /* Close EXA */ if (pMsm->accel && pMsm->pExa) { exaDriverFini(pScreen); xfree(pMsm->pExa); pMsm->pExa = NULL; } /* Close DRI2 */ #if USEDRI2 if (pMsm->useDRI2) MSMDRI2ScreenClose(pScreen); #endif /* Unmap the framebuffer memory */ munmap(pMsm->fbmem, pMsm->fixed_info.smem_len); pScreen->CloseScreen = pMsm->CloseScreen; return (*pScreen->CloseScreen) (scrnIndex, pScreen); } static Bool MSMScreenInit(int scrnIndex, ScreenPtr pScreen, int argc, char **argv) { ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum]; MSMPtr pMsm = MSMPTR(pScrn); #if defined (MSMFB_GET_PAGE_PROTECTION) && defined (MSMFB_SET_PAGE_PROTECTION) /* If the frame buffer can be cached, do so. */ /* CAUTION: This needs to be done *BEFORE* the mmap() call, or it has no effect. */ /* FIXME: The current page protection should ideally be saved here and restored */ /* when exiting the driver, but there may be little point in doing this */ /* since the XServer typically won't exit for most applications. */ { const int desired_fb_page_protection = pMsm->FBCache; struct mdp_page_protection fb_page_protection; // If the kernel supports the FB Caching settings infrastructure, // then set the frame buffer cache settings. // Otherwise, issue a warning and continue. 
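// The logic below is read-modify-write: query the current protection first, and only issue
// MSMFB_SET_PAGE_PROTECTION when it differs from the requested setting, avoiding a redundant
// set ioctl when the configuration already matches.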
if (ioctl(pMsm->fd, MSMFB_GET_PAGE_PROTECTION, &fb_page_protection)) { xf86DrvMsg(scrnIndex, X_WARNING, "MSMFB_GET_PAGE_PROTECTION IOCTL: Unable to get current FB cache settings.\n"); } else { if (fb_page_protection.page_protection != desired_fb_page_protection) { fb_page_protection.page_protection = desired_fb_page_protection; if (ioctl(pMsm->fd, MSMFB_SET_PAGE_PROTECTION, &fb_page_protection)) { xf86DrvMsg(scrnIndex, X_ERROR, "MSMFB_SET_PAGE_PROTECTION IOCTL: Unable to set requested FB cache settings: %s.\n", fbCacheStrings[desired_fb_page_protection]); return FALSE; } } } } #endif // defined (MSMFB_GET_PAGE_PROTECTION) && defined (MSMFB_SET_PAGE_PROTECTION) /* Map the framebuffer memory */ pMsm->fbmem = mmap(NULL, pMsm->fixed_info.smem_len, PROT_READ | PROT_WRITE, MAP_SHARED, pMsm->fd, 0); /* If we can't map the memory, then this is a short trip */ if (pMsm->fbmem == MAP_FAILED) { xf86DrvMsg(scrnIndex, X_ERROR, "Unable to map the " "framebuffer memory: %s\n", strerror(errno)); return FALSE; } /* Set up the mode - this doesn't actually touch the hardware, * but it makes RandR all happy */ if (!xf86SetDesiredModes(pScrn)) { xf86DrvMsg(scrnIndex, X_ERROR, "Unable to set the mode"); return FALSE; } /* Set up the X visuals */ miClearVisualTypes(); /* We only support TrueColor at the moment, and I suspect that is all * we will ever support */ if (!miSetVisualTypes(pScrn->depth, TrueColorMask, pScrn->rgbBits, TrueColor)) { xf86DrvMsg(scrnIndex, X_ERROR, "Unable to set up the visual" " for %d BPP\n", pScrn->bitsPerPixel); return FALSE; } if (!miSetPixmapDepths()) { xf86DrvMsg(scrnIndex, X_ERROR, "Unable to set the pixmap depth\n"); return FALSE; } #if USEDRI2 if (pMsm->useDRI2) pMsm->useDRI2 = MSMDRI2ScreenInit(pScreen); #endif #if USEDRI pMsm->DRIEnabled = FALSE; if (!pMsm->useDRI2 && pMsm->useDRI) { pMsm->dri = xcalloc(1, sizeof(struct msm_dri)); pMsm->dri->depthBits = pScrn->depth; pMsm->DRIEnabled = MSMDRIScreenInit(pScreen); } #endif /* Set up the X drawing area */ xf86LoadSubModule(pScrn, "fb"); if (!fbScreenInit(pScreen, pMsm->fbmem, pScrn->virtualX, pScrn->virtualY, pScrn->xDpi, pScrn->yDpi, pScrn->displayWidth, pScrn->bitsPerPixel)) { xf86DrvMsg(scrnIndex, X_ERROR, "fbScreenInit failed\n"); return FALSE; } /* Set up the color information for the visual(s) */ if (pScrn->bitsPerPixel > 8) { VisualPtr visual = pScreen->visuals + pScreen->numVisuals; while (--visual >= pScreen->visuals) { if ((visual->class | DynamicClass) == DirectColor) { visual->offsetRed = pScrn->offset.red; visual->offsetGreen = pScrn->offset.green; visual->offsetBlue = pScrn->offset.blue; visual->redMask = pScrn->mask.red; visual->greenMask = pScrn->mask.green; visual->blueMask = pScrn->mask.blue; } } } /* Set up the Render fallbacks */ if (!fbPictureInit(pScreen, NULL, 0)) { xf86DrvMsg(scrnIndex, X_ERROR, "fbPictureInit failed\n"); return FALSE; } /* Set default colors */ xf86SetBlackWhitePixels(pScreen); /* Set up the backing store */ miInitializeBackingStore(pScreen); xf86SetBackingStore(pScreen); if (pMsm->accel) { /* Set up EXA */ xf86LoadSubModule(pScrn, "exa"); if (!MSMSetupExa(pScreen)) ErrorF("Unable to setup EXA\n"); } /* Set up the software cursor */ miDCInitialize(pScreen, xf86GetPointerScreenFuncs()); /* Try to set up the HW cursor */ if (pMsm->HWCursor == TRUE) pMsm->HWCursor = MSMCursorInit(pScreen); /* Set up the default colormap */ if (!miCreateDefColormap(pScreen)) { xf86DrvMsg(scrnIndex, X_ERROR, "miCreateDefColormap failed\n"); return FALSE; } #if USEDRI if (pMsm->DRIEnabled) 
MSMDRIFinishScreenInit(pScreen); #endif /* Set up Xv */ MSMInitVideo(pScreen); /* FIXME: Set up DPMS here */ pScreen->SaveScreen = MSMSaveScreen; /* Set up our own CloseScreen function */ pMsm->CloseScreen = pScreen->CloseScreen; pScreen->CloseScreen = MSMCloseScreen; if (!xf86CrtcScreenInit(pScreen)) { xf86DrvMsg(scrnIndex, X_ERROR, "CRTCScreenInit failed\n"); return FALSE; } return TRUE; } static Bool MSMSwitchMode(int scrnIndex, DisplayModePtr mode, int flags) { /* FIXME: We should only have the one mode, so we shouldn't ever call * this function - regardless, it needs to be stubbed - so what * do we return, TRUE or FALSE? */ return TRUE; } static Bool MSMEnterVT(int ScrnIndex, int flags) { /* Nothing to do here yet - there might be some triggers that we need * to throw at the framebuffer */ return TRUE; } static void MSMLeaveVT(int ScrnIndex, int flags) { /* Restore any framebufferish things here */ } /* ------------------------------------------------------------ */ /* Following is the standard driver setup that probes for the */ /* hardware and sets up the structures. */ static SymTabRec MSMChipsets[] = { {0, "MSM7201"}, {1, "QSD8X50"}, {-1, NULL} }; static const OptionInfoRec * MSMAvailableOptions(int chipid, int busid) { return MSMOptions; } static void MSMIdentify(int flags) { xf86PrintChipsets(MSM_NAME, "Driver for Qualcomm MSM processors", MSMChipsets); } static Bool MSMProbe(DriverPtr drv, int flags) { GDevPtr *sections; int nsects; char *dev; Bool foundScreen = FALSE; ScrnInfoPtr pScrn = NULL; int fd, i; /* For now, just return false during a probe */ if (flags & PROBE_DETECT) return FALSE; /* Find all of the device sections in the config */ nsects = xf86MatchDevice(MSM_NAME, §ions); if (nsects <= 0) return FALSE; /* We know that we will only have at most 4 possible outputs */ for (i = 0; i < (nsects > 4 ? 4 : nsects); i++) { dev = xf86FindOptionValue(sections[i]->options, "fb"); xf86Msg(X_WARNING, "Section %d - looking for %s\n", i, dev); /* FIXME: There should be some discussion about how we * refer to devices - blindly matching to /dev/fbX files * seems like it could backfire on us. 
For now, force * the user to set the backing FB in the xorg.conf */ if (dev == NULL) { xf86Msg(X_WARNING, "no device specified in section %d\n", i); continue; } fd = open(dev, O_RDWR, 0); if (fd <= 0) { xf86Msg(X_WARNING, "Could not open '%s': %s\n", dev, strerror(errno)); continue; } else { struct fb_fix_screeninfo info; int entity; if (ioctl(fd, FBIOGET_FSCREENINFO, &info)) { xf86Msg(X_WARNING, "Unable to read hardware info " "from %s: %s\n", dev, strerror(errno)); close(fd); continue; } close(fd); /* Make sure that this is a MSM driver */ if (strncmp(info.id, "msmfb", 5)) { xf86Msg(X_WARNING, "%s is not a MSM device: %s\n", dev, info.id); continue; } foundScreen = TRUE; entity = xf86ClaimFbSlot(drv, 0, sections[i], TRUE); pScrn = xf86ConfigFbEntity(NULL, 0, entity, NULL, NULL, NULL, NULL); xf86Msg(X_WARNING, "Add screen %p\n", pScrn); /* Set up the hooks for the screen */ pScrn->driverVersion = MSM_VERSION_CURRENT; pScrn->driverName = MSM_NAME; pScrn->name = MSM_NAME; pScrn->Probe = MSMProbe; pScrn->PreInit = MSMPreInit; pScrn->ScreenInit = MSMScreenInit; pScrn->SwitchMode = MSMSwitchMode; pScrn->EnterVT = MSMEnterVT; pScrn->LeaveVT = MSMLeaveVT; } } xfree(sections); return foundScreen; } _X_EXPORT DriverRec msmDriver = { MSM_VERSION_CURRENT, MSM_DRIVER_NAME, MSMIdentify, MSMProbe, MSMAvailableOptions, NULL, 0, NULL }; MODULESETUPPROTO(msmSetup); /* Versioning information for the module - most of these variables will come from config.h generated by ./configure */ static XF86ModuleVersionInfo msmVersRec = { MSM_DRIVER_NAME, MODULEVENDORSTRING, MODINFOSTRING1, MODINFOSTRING2, XORG_VERSION_CURRENT, MSM_VERSION_MAJOR, MSM_VERSION_MINOR, MSM_VERSION_PATCH, ABI_CLASS_VIDEODRV, ABI_VIDEODRV_VERSION, NULL, {0, 0, 0, 0}, }; _X_EXPORT XF86ModuleData msmModuleData = { &msmVersRec, msmSetup, NULL }; pointer msmSetup(pointer module, pointer ops, int *errmaj, int *errmin) { static Bool initDone = FALSE; if (initDone == FALSE) { initDone = TRUE; xf86AddDriver(&msmDriver, module, HaveDriverFuncs); /* FIXME: Load symbol references here */ return (pointer) 1; } else { if (errmaj) *errmaj = LDR_ONCEONLY; return NULL; } } xf86-video-msm/src/msm_fourcc.h0000644000175000017500000000574411615776600016577 0ustar paulliupaulliu /* msm_fourcc.h * * Copyright (c) 2009, Code Aurora Forum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Code Aurora nor * the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NON-INFRINGEMENT ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef MSM_FOURCC_H_ #define MSM_FOURCC_H_ /* This is from the trident fourcc definition */ #define FOURCC_RGB565 0x36315652 #define XVIMAGE_RGB565 \ { \ FOURCC_RGB565, \ XvRGB,\ LSBFirst,\ {'R','V','1','6',\ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},\ 16,\ XvPacked,\ 1,\ 16, 0xF800, 0x07E0, 0x001F,\ 0, 0, 0,\ 0, 0, 0,\ 0, 0, 0,\ {'R','G','B',0,\ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},\ XvTopToBottom\ } #define FOURCC_NV12 0x3231564E #define XVIMAGE_NV12 \ { \ FOURCC_NV12, \ XvYUV, \ LSBFirst, \ {'N','V','1','2', \ 0x00,0x00,0x00,0x10,0x80,0x00,0x00,0xAA,0x00,0x38,0x9B,0x71}, \ 12, \ XvPlanar, \ 2, \ 0, 0, 0, 0 , \ 8, 8, 8, \ 1, 2, 2, \ 1, 2, 2, \ {'Y','U','V',0, \ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, \ XvTopToBottom \ } #define FOURCC_NV21 0x3132564E #define XVIMAGE_NV21 \ { \ FOURCC_NV21, \ XvYUV, \ LSBFirst, \ {'N','V','3','2', \ 0x00,0x00,0x00,0x10,0x80,0x00,0x00,0xAA,0x00,0x38,0x9B,0x71}, \ 12, \ XvPlanar, \ 2, \ 0, 0, 0, 0 , \ 8, 8, 8, \ 1, 2, 2, \ 1, 2, 2, \ {'Y','V','U',0, \ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, \ XvTopToBottom \ } #endif xf86-video-msm/src/msm-render.h0000644000175000017500000000515011615776600016502 0ustar paulliupaulliu/* msm-render.h * * Copyright (c) 2009, Code Aurora Forum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Code Aurora nor * the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef MSM_RENDER_H_ #define MSM_RENDER_H_ #include "msm.h" #define MSM_BLIT_GEM 0x01 #define MSM_BLIT_FB 0x02 #define MSM_BLIT_PMEM 0x04 typedef struct { int width; int height; int format; int pitch; unsigned long priv[2]; int flags; } MSMBlitSurface; typedef struct { int x, y, w, h; } MSMBlitRect; typedef struct { MSMBlitSurface *src; MSMBlitSurface *dst; MSMBlitRect *srcRect; MSMBlitRect *dstRect; } MSMBlitRec; BOOL isCopyMDPCompatible(MSMBlitRec *blit, int bpp); void swBlit(MSMPtr,MSMBlitRec *blit, int bpp, BOOL blockSignalsForVFP); void swBlit_NoOverlap(unsigned char * __restrict__ dst, unsigned char * __restrict__ src, int w, int h, int dpitch, int spitch, int bpp, BOOL blockSignalsForVFP); void swFill(MSMPtr pMsm, int byteOffset, int destSurfaceWidthPixels, int x, int y, int w, int h, uint32_t src, int bitsPerPixel, BOOL blockSignalsForVFP); void hwBlit(MSMPtr, MSMBlitRec *blit, int flags); void hwBlitFlush(MSMPtr); void hwBlitReset(void); #endif xf86-video-msm/src/neon_memmove.S0000644000175000017500000005556511615776600017107 0ustar paulliupaulliu/*************************************************************************** Copyright (c) 2009, Code Aurora Forum. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Code Aurora nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ***************************************************************************/ /*************************************************************************** * Neon memmove: Attempts to do a memmove with Neon registers if possible, * Inputs: * dest: The destination buffer * src: The source buffer * n: The size of the buffer to transfer * Outputs: * ***************************************************************************/ /* * General note: * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP * However, it looks like the 2006 CodeSourcery Assembler has issues generating * the correct object code for VPOP, resulting in horrific stack crashes. * As a result, I've temporarily move PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB, * and VPOP->VLDMIA. We can revert this back once we update our toolchain. * * Also, VSHL swaps the source register and the shift-amount register * around in 2006-q3. 
I've coded this incorrectly so it turns out correct * in the object code, but we'll need to undo that later... */ .code 32 .align 4 .globl neon_memmove .func neon_memmove: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) stmdb sp!, {r0} #else push {r0} #endif /* * The requirements for memmove state that the function should * operate as if data were being copied from the source to a * buffer, then to the destination. This is to allow a user * to copy data from a source and target that overlap. * * We can't just do byte copies front-to-back automatically, since * there's a good chance we may have an overlap (why else would someone * intentionally use memmove then?). * * We'll break this into two parts. Front-to-back, or back-to-front * copies. */ neon_memmove_cmf: cmp r0, r1 blt neon_front_to_back_copy bgt neon_back_to_front_copy b neon_memmove_done /* ############################################################# * Front to Back copy */ neon_front_to_back_copy: /* * For small copies, just do a quick memcpy. We can do this for * front-to-back copies, aligned or unaligned, since we're only * doing 1 byte at a time... */ cmp r2, #4 bgt neon_f2b_gt4 cmp r2, #0 neon_f2b_smallcopy_loop: beq neon_memmove_done ldrb r12, [r1], #1 subs r2, r2, #1 strb r12, [r0], #1 b neon_f2b_smallcopy_loop neon_f2b_gt4: /* Preload what we can...*/ pld [r0,#0] pld [r1,#0] /* The window size is in r3. */ sub r3, r1, r0 #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) stmdb sp!, {r4-r6} #else push {r4-r6} #endif neon_f2b_check_align: /* Check alignment. */ ands r12, r0, #0x3 beq neon_f2b_source_align_check cmp r12, #2 ldrb r4, [r1], #1 ldrleb r5, [r1], #1 ldrltb r6, [r1], #1 rsb r12, r12, #4 sub r2, r2, r12 strb r4, [r0], #1 strleb r5, [r0], #1 strltb r6, [r0], #1 neon_f2b_source_align_check: ands r12, r1, #0x3 bne neon_f2b_nonaligned neon_f2b_try_16_align: /* If we're >64, attempt to align on 16-bytes. Smaller amounts * don't seem to be worth handling. */ cmp r2, #64 blt neon_f2b_align_route /* This is where we try 16-byte alignment. */ ands r12, r0, #0xf beq neon_f2b_align_route rsb r12, r12, #16 neon_f2b_16_start: sub r2, r2, r12 lsrs r5, r12, #2 neon_f2b_align_16_4: ldr r4, [r1], #4 subs r5, r5, #1 str r4, [r0], #4 bne neon_f2b_align_16_4 neon_f2b_align_route: /* ############################################################# * Front to Back copy - aligned */ /* * Note that we can't just route based on the size in r2. If that's * larger than the overlap window in r3, we could potentially * (and likely!) destroy data we're copying. */ cmp r2, r3 movle r12, r2 movgt r12, r3 cmp r12, #256 bge neon_f2b_copy_128_a cmp r12, #64 bge neon_f2b_copy_32_a cmp r12, #16 bge neon_f2b_copy_16_a cmp r12, #8 bge neon_f2b_copy_8_a cmp r12, #4 bge neon_f2b_copy_4_a b neon_f2b_copy_1_a neon_f2b_copy_128_a: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vstmdb sp!, {q4-q7} #else vpush {q4-q7} #endif mov r12, r2, lsr #7 neon_f2b_copy_128_a_loop: vld1.32 {q0,q1}, [r1]! vld1.32 {q2,q3}, [r1]! vld1.32 {q4,q5}, [r1]! vld1.32 {q6,q7}, [r1]! pld [r1, #0] pld [r1, #128] vst1.32 {q0,q1}, [r0]! vst1.32 {q2,q3}, [r0]! vst1.32 {q4,q5}, [r0]! vst1.32 {q6,q7}, [r0]! 
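	/* Each pass of this loop moves 128 bytes through q0-q7 with
	 * post-incremented loads and stores; the PLDs prefetch ahead on
	 * both the source and destination streams to hide memory latency. */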
subs r12, r12, #1 pld [r0, #0] pld [r0, #128] bne neon_f2b_copy_128_a_loop #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vldmia sp!, {q4-q7} #else vpop {q4-q7} #endif ands r2, r2, #0x7f beq neon_f2b_finish cmp r2, #32 bge neon_f2b_copy_32_a b neon_f2b_copy_finish_a neon_f2b_copy_32_a: mov r12, r2, lsr #5 neon_f2b_copy_32_a_loop: vld1.32 {q0,q1}, [r1]! subs r12, r12, #1 pld [r1, #0] vst1.32 {q0,q1}, [r0]! bne neon_f2b_copy_32_a_loop ands r2, r2, #0x1f beq neon_f2b_finish neon_f2b_copy_finish_a: neon_f2b_copy_16_a: movs r12, r2, lsr #4 beq neon_f2b_copy_8_a neon_f2b_copy_16_a_loop: vld1.32 {q0}, [r1]! subs r12, r12, #1 vst1.32 {q0}, [r0]! bne neon_f2b_copy_16_a_loop ands r2, r2, #0xf beq neon_f2b_finish neon_f2b_copy_8_a: cmp r2, #8 blt neon_f2b_copy_4_a ldm r1!, {r4-r5} subs r2, r2, #8 stm r0!, {r4-r5} neon_f2b_copy_4_a: cmp r2, #4 blt neon_f2b_copy_1_a ldr r4, [r1], #4 subs r2, r2, #4 str r4, [r0], #4 neon_f2b_copy_1_a: cmp r2, #0 beq neon_f2b_finish neon_f2b_copy_1_a_loop: ldrb r12, [r1], #1 subs r2, r2, #1 strb r12, [r0], #1 bne neon_f2b_copy_1_a_loop b neon_f2b_finish /* ############################################################# * Front to Back copy - unaligned */ neon_f2b_nonaligned: /* * For sizes < 8, does it really make sense to do the whole shift * party? Note that we DON'T want to call neon_f2b_copy_1_u, * since we'll end up trying to pop r8-r11, and we DON'T want * to do that... */ cmp r2, #8 ble neon_f2b_copy_1_a #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) stmdb sp!, {r7-r9} #else push {r7-r9} #endif cmp r12, #2 ldrb r4, [r1], #1 ldrleb r5, [r1], #1 ldrltb r6, [r1], #1 rsb r8, r12, #4 sub r2, r2, r8 lsl r8, r8, #3 orrle r4, r4, r5, lsl #8 orrlt r4, r4, r6, lsl #16 rsb r9, r8, #32 /* * r4 = overflow bits * r8 = # of bits we copied into the r4 register to align source. * r9 = 32 - r8 * r12 = Index counter for each size, so we determine how many times * the given size will go into r2, then count down that # of * times in r12. */ cmp r2, #64 blt neon_f2b_unaligned_route ands r12, r0, #0xf beq neon_f2b_unaligned_route cmp r3, #4 blt neon_f2b_unaligned_route rsb r12, r12, #16 neon_f2b_16_start_u: sub r2, r2, r12 lsrs r6, r12, #2 neon_f2b_align_16_4_u: ldr r5, [r1], #4 subs r6, r6, #1 orr r4, r4, r5, lsl r8 str r4, [r0], #4 mov r4, r5, lsr r9 bne neon_f2b_align_16_4_u neon_f2b_unaligned_route: cmp r2, r3 movle r12, r2 movgt r12, r3 cmp r12, #256 bge neon_f2b_copy_64_u cmp r12, #64 bge neon_f2b_copy_32_u cmp r12, #16 bge neon_f2b_copy_16_u cmp r12, #8 bge neon_f2b_copy_8_u cmp r12, #4 bge neon_f2b_copy_4_u b neon_f2b_last_bits_u neon_f2b_copy_64_u: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vstmdb sp!, {q4} vstmdb sp!, {q5-q8} #else vpush {q4} vpush {q5-q8} #endif vdup.u32 q8, r8 mov r12, r2, lsr #6 and r2, r2, #0x3f neon_f2b_copy_64_u_loop: vld1.32 {q4, q5}, [r1]! vld1.32 {q6, q7}, [r1]! 
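	/* r8 holds the source misalignment in bits (8, 16, or 24); the
	 * lsls/bcc/bpl sequence below dispatches on it so that the vshr
	 * amount matches, letting the leftover bytes carried in r4 be
	 * stitched across iterations. */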
lsls r5, r8, #28 bcc neon_f2b_copy_64_u_b8 bpl neon_f2b_copy_64_u_b16 vshr.u64 q0, q4, #40 vshr.u64 q1, q5, #40 vshr.u64 q2, q6, #40 vshr.u64 q3, q7, #40 b neon_f2b_copy_64_unify neon_f2b_copy_64_u_b8: vshr.u64 q0, q4, #56 vshr.u64 q1, q5, #56 vshr.u64 q2, q6, #56 vshr.u64 q3, q7, #56 b neon_f2b_copy_64_unify neon_f2b_copy_64_u_b16: vshr.u64 q0, q4, #48 vshr.u64 q1, q5, #48 vshr.u64 q2, q6, #48 vshr.u64 q3, q7, #48 neon_f2b_copy_64_unify: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vshl.u64 q4, q8, q4 vshl.u64 q5, q8, q5 vshl.u64 q6, q8, q6 vshl.u64 q7, q8, q7 #else vshl.u64 q4, q4, q8 vshl.u64 q5, q5, q8 vshl.u64 q6, q6, q8 vshl.u64 q7, q7, q8 #endif vmov r5, s14 vorr d9, d9, d0 vmov s14, r4 vorr d10, d10, d1 vorr d11, d11, d2 vorr d12, d12, d3 vorr d13, d13, d4 vorr d14, d14, d5 vorr d15, d15, d6 vorr d8, d8, d7 subs r12, r12, #1 pld [r1, #0] pld [r1, #128] mov r4, r5 vst1.32 {q4, q5}, [r0]! vst1.32 {q6, q7}, [r0]! pld [r0, #0] pld [r0, #128] bne neon_f2b_copy_64_u_loop #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vldmia sp!, {q5-q8} vldmia sp!, {q4} #else vpop {q5-q8} vpop {q4} #endif cmp r2, #32 bge neon_f2b_copy_32_u b neon_f2b_copy_finish_u neon_f2b_copy_32_u: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vstmdb sp!, {q4} #else vpush {q4} #endif vdup.u32 q4, r8 mov r12, r2, lsr #5 and r2, r2, #0x1f neon_f2b_copy_32_u_loop: vld1.32 {q0, q1}, [r1]! lsls r5, r8, #28 bcc neon_f2b_copy_32_u_b8 bpl neon_f2b_copy_32_u_b16 vshr.u64 q2, q0, #40 vshr.u64 q3, q1, #40 b neon_f2b_copy_32_unify neon_f2b_copy_32_u_b8: vshr.u64 q2, q0, #56 vshr.u64 q3, q1, #56 b neon_f2b_copy_32_unify neon_f2b_copy_32_u_b16: vshr.u64 q2, q0, #48 vshr.u64 q3, q1, #48 neon_f2b_copy_32_unify: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vshl.u64 q0, q4, q0 vshl.u64 q1, q4, q1 #else vshl.u64 q0, q0, q4 vshl.u64 q1, q1, q4 #endif vmov r5, s14 vorr d1, d1, d4 vmov s14, r4 vorr d2, d2, d5 vorr d3, d3, d6 vorr d0, d0, d7 subs r12, r12, #1 pld [r1, #0] mov r4, r5 vst1.32 {q0, q1}, [r0]! bne neon_f2b_copy_32_u_loop #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vldmia sp!, {q4} #else vpop {q4} #endif neon_f2b_copy_finish_u: neon_f2b_copy_16_u: movs r12, r2, lsr #4 beq neon_f2b_copy_8_u vdup.u32 q2, r8 and r2, r2, #0xf neon_f2b_copy_16_u_loop: vld1.32 {q0}, [r1]! lsls r5, r8, #28 bcc neon_f2b_copy_16_u_b8 bpl neon_f2b_copy_16_u_b16 vshr.u64 q1, q0, #40 b neon_f2b_copy_16_unify neon_f2b_copy_16_u_b8: vshr.u64 q1, q0, #56 b neon_f2b_copy_16_unify neon_f2b_copy_16_u_b16: vshr.u64 q1, q0, #48 neon_f2b_copy_16_unify: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vshl.u64 q0, q2, q0 #else vshl.u64 q0, q0, q2 #endif vmov r5, s6 vorr d1, d1, d2 vmov s6, r4 vorr d0, d0, d3 subs r12, r12, #1 mov r4, r5 vst1.32 {q0}, [r0]! 
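	/* The vmov r5/s6 pair above rotates the leftover bytes through r4,
	 * stitching each 16-byte store to the next iteration's data. */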
bne neon_f2b_copy_16_u_loop neon_f2b_copy_8_u: cmp r2, #8 blt neon_f2b_copy_4_u ldm r1!, {r6-r7} subs r2, r2, #8 orr r4, r4, r6, lsl r8 mov r5, r6, lsr r9 orr r5, r5, r7, lsl r8 stm r0!, {r4-r5} mov r4, r7, lsr r9 neon_f2b_copy_4_u: cmp r2, #4 blt neon_f2b_last_bits_u ldr r5, [r1], #4 subs r2, r2, #4 orr r4, r4, r5, lsl r8 str r4, [r0], #4 mov r4, r5, lsr r9 neon_f2b_last_bits_u: lsr r8, r8, #0x3 neon_f2b_last_bits_u_loop: strb r4, [r0], #1 subs r8, r8, #1 lsr r4, r4, #8 bne neon_f2b_last_bits_u_loop neon_f2b_copy_1_u: cmp r2, #0 beq neon_f2b_finish_u neon_f2b_copy_1_u_loop: ldrb r12, [r1], #1 subs r2, r2, #1 strb r12, [r0], #1 bne neon_f2b_copy_1_u_loop neon_f2b_finish_u: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) ldmia sp!, {r7-r9} #else pop {r7-r9} #endif /* ############################################################# * Front to Back copy - finish */ neon_f2b_finish: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) ldmia sp!, {r4-r6} #else pop {r4-r6} #endif b neon_memmove_done /* ############################################################# * Back to Front copy */ neon_back_to_front_copy: /* * Here, we'll want to shift to the end of the buffers. This * actually points us one past where we need to go, but since * we'll pre-decrement throughout, this will be fine. */ add r0, r0, r2 add r1, r1, r2 cmp r2, #4 bgt neon_b2f_gt4 cmp r2, #0 neon_b2f_smallcopy_loop: beq neon_memmove_done ldrb r12, [r1, #-1]! subs r2, r2, #1 strb r12, [r0, #-1]! b neon_b2f_smallcopy_loop neon_b2f_gt4: pld [r0, #0] pld [r1, #0] /* * The minimum of the overlap window size and the copy size * is in r3. */ sub r3, r0, r1 #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) stmdb sp!, {r4-r5} #else push {r4-r5} #endif /* * Check alignment. Since we'll pre-decrement as we step thru, we'll * need to make sure we're on word-alignment. */ neon_b2f_check_align: ands r12, r0, #0x3 beq neon_b2f_source_align_check sub r2, r2, r12 neon_b2f_shift_align: ldrb r4, [r1, #-1]! subs r12, r12, #1 strb r4, [r0, #-1]! bne neon_b2f_shift_align neon_b2f_source_align_check: ands r4, r1, #0x3 bne neon_b2f_nonaligned neon_b2f_try_16_align: /* If we're >64, attempt to align on 16-bytes. Smaller amounts * don't seem to be worth handling. */ cmp r2, #64 blt neon_b2f_align_route ands r12, r0, #0xf beq neon_b2f_align_route /* In this case, r12 has the number of bytes to roll backward. */ neon_b2f_16_start: sub r2, r2, r12 lsrs r5, r12, #2 neon_b2f_align_16_4: ldr r4, [r1, #-4]! subs r5, r5, #1 str r4, [r0, #-4]! bne neon_b2f_align_16_4 neon_b2f_align_route: /* * ############################################################# * Back to Front copy - aligned */ cmp r2, r3 movle r12, r2 movgt r12, r3 cmp r12, #256 bge neon_b2f_copy_128_a cmp r12, #64 bge neon_b2f_copy_32_a cmp r12, #8 bge neon_b2f_copy_8_a cmp r12, #4 bge neon_b2f_copy_4_a b neon_b2f_copy_1_a neon_b2f_copy_128_a: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vstmdb sp!, {q4-q7} #else vpush {q4-q7} #endif movs r12, r2, lsr #7 /* * This irks me. There MUST be a better way to read these in and * scan the register backward instead of making it go forward. Then * we need to do two subtractions... */ neon_b2f_copy_128_a_loop: sub r1, r1, #128 sub r0, r0, #128 vld1.32 {q0, q1}, [r1]! vld1.32 {q2, q3}, [r1]! vld1.32 {q4, q5}, [r1]! vld1.32 {q6, q7}, [r1]! pld [r1, #-128] pld [r1, #-256] vst1.32 {q0, q1}, [r0]! 
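	/* Only the 128-byte blocks are visited back-to-front; within each
	 * block the loads/stores still run low-to-high, which is why r0/r1
	 * are rewound by #128 both before and after the transfers. */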
vst1.32 {q2, q3}, [r0]! vst1.32 {q4, q5}, [r0]! vst1.32 {q6, q7}, [r0]! subs r12, r12, #1 pld [r0, #-128] pld [r0, #-256] sub r1, r1, #128 sub r0, r0, #128 bne neon_b2f_copy_128_a_loop #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vldmia sp!, {q4-q7} #else vpop {q4-q7} #endif ands r2, r2, #0x7f beq neon_b2f_finish cmp r2, #32 bge neon_b2f_copy_32_a b neon_b2f_copy_finish_a neon_b2f_copy_32_a: mov r12, r2, lsr #5 neon_b2f_copy_32_a_loop: sub r1, r1, #32 sub r0, r0, #32 vld1.32 {q0,q1}, [r1] subs r12, r12, #1 vst1.32 {q0,q1}, [r0] pld [r1, #0] bne neon_b2f_copy_32_a_loop ands r2, r2, #0x1f beq neon_b2f_finish neon_b2f_copy_finish_a: neon_b2f_copy_8_a: movs r12, r2, lsr #0x3 beq neon_b2f_copy_4_a neon_b2f_copy_8_a_loop: ldmdb r1!, {r4-r5} subs r12, r12, #1 stmdb r0!, {r4-r5} bne neon_b2f_copy_8_a_loop and r2, r2, #0x7 neon_b2f_copy_4_a: movs r12, r2, lsr #0x2 beq neon_b2f_copy_1_a and r2, r2, #0x3 neon_b2f_copy_4_a_loop: ldr r4, [r1, #-4]! subs r12, r12, #1 str r4, [r0, #-4]! bne neon_b2f_copy_4_a_loop neon_b2f_copy_1_a: cmp r2, #0 beq neon_b2f_finish neon_b2f_copy_1_a_loop: ldrb r12, [r1, #-1]! subs r2, r2, #1 strb r12, [r0, #-1]! bne neon_b2f_copy_1_a_loop /* ############################################################# * Back to Front copy - unaligned */ neon_b2f_nonaligned: /* * For sizes < 8, does it really make sense to do the whole shift * party? */ cmp r2, #8 ble neon_b2f_copy_1_a #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) stmdb sp!, {r6-r11} #else push {r6-r11} #endif /* * r3 = max window size * r4 = overflow bytes * r5 = bytes we're reading into * r6 = # bytes we're off. * r10 = copy of r6 */ and r6, r1, #0x3 eor r4, r4, r4 mov r10, r6 neon_b2f_realign: ldrb r5, [r1, #-1]! subs r6, r6, #1 orr r4, r5, r4, lsl #8 bne neon_b2f_realign /* * r10 = # of bits we copied into the r4 register to align source. * r11 = 32 - r10 * r12 = Index counter for each size, so we determine how many times * the given size will go into r2, then count down that # of * times in r12. */ sub r2, r2, r10 lsl r10, r10, #0x3 rsb r11, r10, #32 cmp r2, r3 movle r12, r2 movgt r12, r3 cmp r12, #256 bge neon_b2f_copy_64_u cmp r12, #64 bge neon_b2f_copy_32_u cmp r12, #8 bge neon_b2f_copy_8_u cmp r12, #4 bge neon_b2f_copy_4_u b neon_b2f_last_bits_u neon_b2f_copy_64_u: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vstmdb sp!, {q4,q5} vstmdb sp!, {q6-q8} #else vpush {q4,q5} vpush {q6-q8} #endif add r7, r11, #32 movs r12, r2, lsr #6 vdup.u32 q8, r7 neon_b2f_copy_64_u_loop: sub r1, r1, #64 sub r0, r0, #64 vld1.32 {q0, q1}, [r1]! 
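	/* The vmovs below keep an unshifted copy of the data in q4-q7 while
	 * q0-q3 are shifted, so the realignment merge can combine both
	 * halves of each 64-bit lane. */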
vld1.32 {q2, q3}, [r1] sub r1, r1, #32 vmov q4, q0 vmov q5, q1 vmov q6, q2 vmov q7, q3 vmov r5, s0 mov r4, r4, lsl r11 lsls r6, r10, #28 bcc neon_b2f_copy_64_u_b8 bpl neon_b2f_copy_64_u_b16 vshr.u64 q0, q0, #24 vshr.u64 q1, q1, #24 vshr.u64 q2, q2, #24 vshr.u64 q3, q3, #24 b neon_b2f_copy_64_unify neon_b2f_copy_64_u_b8: vshr.u64 q0, q0, #8 vshr.u64 q1, q1, #8 vshr.u64 q2, q2, #8 vshr.u64 q3, q3, #8 b neon_b2f_copy_64_unify neon_b2f_copy_64_u_b16: vshr.u64 q0, q0, #16 vshr.u64 q1, q1, #16 vshr.u64 q2, q2, #16 vshr.u64 q3, q3, #16 neon_b2f_copy_64_unify: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vshl.u64 q4, q8, q4 vshl.u64 q5, q8, q5 vshl.u64 q6, q8, q6 vshl.u64 q7, q8, q7 #else vshl.u64 q4, q4, q8 vshl.u64 q5, q5, q8 vshl.u64 q6, q6, q8 vshl.u64 q7, q7, q8 #endif vmov s17, r4 vorr d7, d7, d8 vorr d6, d6, d15 vorr d5, d5, d14 vorr d4, d4, d13 vorr d3, d3, d12 vorr d2, d2, d11 vorr d1, d1, d10 vorr d0, d0, d9 mov r4, r5, lsl r11 subs r12, r12, #1 lsr r4, r4, r11 vst1.32 {q0, q1}, [r0]! vst1.32 {q2, q3}, [r0] pld [r1, #0] sub r0, r0, #32 bne neon_b2f_copy_64_u_loop #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vldmia sp!, {q6-q8} vldmia sp!, {q4,q5} #else vpop {q6-q8} vpop {q4,q5} #endif ands r2, r2, #0x3f cmp r2, #32 bge neon_b2f_copy_32_u b neon_b2f_copy_finish_u neon_b2f_copy_32_u: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vstmdb sp!, {q4} #else vpush {q4} #endif add r7, r11, #32 movs r12, r2, lsr #5 vdup.u32 q4, r7 and r2, r2, #0x1f neon_b2f_copy_32_u_loop: sub r1, r1, #32 sub r0, r0, #32 vld1.32 {q0, q1}, [r1] vmov q2, q0 vmov q3, q1 vmov r5, s0 mov r4, r4, lsl r11 lsls r6, r10, #28 bcc neon_b2f_copy_32_u_b8 bpl neon_b2f_copy_32_u_b16 vshr.u64 q0, q0, #24 vshr.u64 q1, q1, #24 b neon_b2f_copy_32_unify neon_b2f_copy_32_u_b8: vshr.u64 q0, q0, #8 vshr.u64 q1, q1, #8 b neon_b2f_copy_32_unify neon_b2f_copy_32_u_b16: vshr.u64 q0, q0, #16 vshr.u64 q1, q1, #16 neon_b2f_copy_32_unify: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vshl.u64 q2, q4, q2 vshl.u64 q3, q4, q3 #else vshl.u64 q2, q2, q4 vshl.u64 q3, q3, q4 #endif vmov s9, r4 vorr d3, d3, d4 vorr d2, d2, d7 vorr d1, d1, d6 vorr d0, d0, d5 mov r4, r5, lsl r11 subs r12, r12, #1 lsr r4, r4, r11 vst1.32 {q0, q1}, [r0] pld [r1, #0] bne neon_b2f_copy_32_u_loop #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) vldmia sp!, {q4} #else vpop {q4} #endif neon_b2f_copy_finish_u: neon_b2f_copy_8_u: movs r12, r2, lsr #0x3 beq neon_b2f_copy_4_u mov r5, r4, lsl r11 neon_b2f_copy_8_u_loop: ldmdb r1!, {r6-r7} subs r12, r12, #1 orr r5, r5, r7, lsr r10 mov r4, r7, lsl r11 orr r4, r4, r6, lsr r10 stmdb r0!, {r4-r5} mov r4, r6, lsl r11 lsr r4, r4, r11 mov r5, r4, lsl r11 bne neon_b2f_copy_8_u_loop ands r2, r2, #0x7 neon_b2f_copy_4_u: movs r12, r2, lsr #0x2 beq neon_b2f_last_bits_u mov r5, r4, lsl r11 neon_b2f_copy_4_u_loop: ldr r6, [r1, #-4]! subs r12, r12, #1 orr r5, r5, r6, lsr r10 str r5, [r0, #-4]! mov r4, r6, lsl r11 lsr r4, r4, r11 mov r5, r4, lsl r11 bne neon_b2f_copy_4_u_loop and r2, r2, #0x3 neon_b2f_last_bits_u: neon_b2f_last_bits_u_loop: subs r10, r10, #8 mov r5, r4, lsr r10 strb r5, [r0, #-1]! bne neon_b2f_last_bits_u_loop neon_b2f_copy_1_u: cmp r2, #0 beq neon_b2f_finish_u neon_b2f_copy_1_u_loop: ldrb r12, [r1, #-1]! subs r2, r2, #1 strb r12, [r0, #-1]! 
bne neon_b2f_copy_1_u_loop neon_b2f_finish_u: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) ldmia sp!, {r6-r11} #else pop {r6-r11} #endif neon_b2f_finish: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) ldmia sp!, {r4-r5} #else pop {r4-r5} #endif neon_memmove_done: #if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) ldmia sp!, {r0} #else pop {r0} #endif bx lr .endfunc .end xf86-video-msm/src/msm-exa.c0000755000175000017500000005221411615776600016001 0ustar paulliupaulliu/* msm-exa.c * * Copyright (c) 2009, Code Aurora Forum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Code Aurora nor * the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include "xf86.h" #include "exa.h" #include "msm.h" #include "msm-drm.h" #include "msm-render.h" unsigned int elapsed(struct timeval *start, struct timeval *end) { unsigned int ret; if (end->tv_sec > start->tv_sec) { ret = (end->tv_sec - start->tv_sec - 1) * 1000000; ret += (1000000 - start->tv_usec) + end->tv_usec; } else ret = end->tv_usec - start->tv_usec; return ret; } /* Specify how many blits we will execute at one time */ #define MSM_MAX_BLITS 16 /* Useful macros */ #define FIXED(_x) IntToxFixed(_x) #define INT(_x) xFixedToInt(_x) #define MSMPTR_FROM_PIXMAP(_x) \ MSMPTR(xf86Screens[(_x)->drawable.pScreen->myNum]) /* This is a local scratch structure used to store information */ static struct { unsigned int src_width; unsigned int src_height; unsigned long src_priv; int src_pitch; int src_format; int src_flags; int dst_priv; int dst_format; Pixel dst_fg_color; unsigned int flags; PictTransformPtr transform; int repeatType; } exaScratch; /* Defines for exaScratch.flags */ #define EXA_SCRATCH_REPEAT 0x01 /* Set to TRUE to create verbose error messages for MSM BLIT failures. 
*/
#define DEBUG_MSM_BLIT (TRUE)

/* This is a lookup table to convert between pictFormat and the format
   used by msm_fb */

static struct {
    int pictFormat;
    int msmFormat;
} msmFormats[] = {
    { PICT_a8r8g8b8, MDP_XRGB_8888 },
    { PICT_x8r8g8b8, MDP_XRGB_8888 },
    { PICT_r8g8b8, MDP_RGB_888 },
    { PICT_r5g6b5, MDP_RGB_565 },
    { PICT_b5g6r5, MDP_BGR_565 },
};

/* Given a picture format, return the MDP format */

static int
msm_lookup_format(PicturePtr p)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(msmFormats); i++) {
        if (msmFormats[i].pictFormat == p->format)
            return msmFormats[i].msmFormat;
    }

    /* Use this as the "error" value */
    return MDP_IMGTYPE_LIMIT;
}

/* Store solid fill info for Neon (or, in future, MDP) acceleration. */

static Bool
MSMPrepareSolid(PixmapPtr pxDst, int alu, Pixel planemask, Pixel fg)
{
    MSMPtr pMsm = MSMPTR_FROM_PIXMAP(pxDst);

    if (!(pMsm->FastFill))
        return FALSE;

    /* TODO: Support GXSet and GXClear for optimizing xterm background? */
    if (planemask != ~0U || alu != GXcopy)
        return FALSE;

    /* FIXME: Don't support 24bpp yet, since it's not so easy. */
    if (pxDst->drawable.bitsPerPixel == 16 ||
        pxDst->drawable.bitsPerPixel == 32) {
        exaScratch.dst_priv = msm_pixmap_offset(pxDst);
        exaScratch.dst_fg_color = fg;
        return TRUE;
    }

    return FALSE;
}

/* Do a solid fill */

void
MSMSolid(PixmapPtr pxDst, int x1, int y1, int x2, int y2)
{
    /* TODO: Since all of the following values depend on pxDst, they could
       be computed in MSMPrepareSolid() to make MSMSolid() more efficient. */
    MSMPtr pMsm = MSMPTR_FROM_PIXMAP(pxDst);
    int destSurfaceWidthPixels = pxDst->drawable.width;
    int bitsPerPixel = pxDst->drawable.bitsPerPixel;
    BOOL blockSignalsForVFP = !(pMsm->NoSigBlock);

    swFill(pMsm, exaScratch.dst_priv, destSurfaceWidthPixels,
           x1, y1, x2 - x1, y2 - y1,
           exaScratch.dst_fg_color, bitsPerPixel, blockSignalsForVFP);
}

/* Finish the solid fill */

void
MSMDoneSolid(PixmapPtr pxDst)
{
    /* Nothing to do at the moment. */
}

/* Prepare to execute an accelerated copy */

static Bool
MSMPrepareCopy(PixmapPtr pxSrc, PixmapPtr pxDst, int dx, int dy,
               int alu, Pixel planemask)
{
    /* FIXME: Do we support other raster operations? */
    if (planemask != ~0U || alu != GXcopy)
        return FALSE;

    /* Require that the source and destination have the same depth */
    if (pxSrc->drawable.bitsPerPixel != pxDst->drawable.bitsPerPixel)
        return FALSE;

    /* Figure out the format of the operation */
    switch (pxSrc->drawable.bitsPerPixel) {
    case 32:
        exaScratch.src_format = MDP_XRGB_8888;
        break;
    case 24:
        exaScratch.src_format = MDP_RGB_888;
        break;
    case 16:
        exaScratch.src_format = MDP_RGB_565;
        break;
    default:
        return FALSE;
    }

    hwBlitReset();

    /* TODO: Note that dx and dy contain the copy direction. Using that
       data may allow us to skip that calculation in the driver. */

    /* Remember the details of the source pixmap */
    exaScratch.src_width = pxSrc->drawable.width;
    exaScratch.src_height = pxSrc->drawable.height;
    exaScratch.src_pitch = msm_pixmap_get_pitch(pxSrc);

    if (msm_pixmap_in_gem(pxSrc)) {
        exaScratch.src_flags = MSM_BLIT_GEM;
        exaScratch.src_priv = (unsigned long) msm_get_pixmap_bo(pxSrc);
    } else {
        exaScratch.src_flags = MSM_BLIT_FB;
        exaScratch.src_priv = msm_pixmap_offset(pxSrc);
    }

    return TRUE;
}
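/*
 * Illustrative sketch (not driver code): the EXA core drives the copy
 * hooks registered later in MSMSetupExa() roughly as below, which is why
 * MSMPrepareCopy() caches the source details in exaScratch while
 * MSMDoCopy() receives only the destination pixmap. Variable names here
 * are hypothetical and the exact call pattern is up to the X server:
 *
 *   if (MSMPrepareCopy(pxSrc, pxDst, dx, dy, GXcopy, ~0U)) {
 *       MSMDoCopy(pxDst, srcX, srcY, dstX, dstY, w, h); // once per box
 *       MSMDone(pxDst);                                 // flush queued blits
 *   }
 */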
/* There are three scenarios in which we would use software -
 * 1) if SWBlit was set in the config file, 2) if the width and
 * height are less than a prearranged amount, or 3) if the source
 * and destination are overlapping.
 */
static inline void
HWOrSWBlit(MSMPtr pMsm, int w, int h, MSMBlitSurface *src,
           MSMBlitSurface *dst, MSMBlitRect *srcRect, MSMBlitRect *dstRect,
           int bpp)
{
    MSMBlitRec blit;

    blit.src = src;
    blit.dst = dst;
    blit.srcRect = srcRect;
    blit.dstRect = dstRect;

    if (pMsm->useSWBlit || (h * w <= 1500) ||
        !isCopyMDPCompatible(&blit, bpp)) {
        hwBlitFlush(pMsm);
        swBlit(pMsm, &blit, bpp, !(pMsm->NoSigBlock));
    } else
        hwBlit(pMsm, &blit, 0);
}

static void
MSMDoCopy(PixmapPtr pxDst, int srcX, int srcY, int dstX, int dstY,
          int w, int h)
{
    MSMPtr pMsm = MSMPTR_FROM_PIXMAP(pxDst);
    MSMBlitSurface srcSurface, dstSurface;
    MSMBlitRect srcRect, dstRect;
    int bpp = pxDst->drawable.bitsPerPixel;

    srcSurface.width = exaScratch.src_width;
    srcSurface.height = exaScratch.src_height;
    srcSurface.format = exaScratch.src_format;
    srcSurface.priv[0] = exaScratch.src_priv;
    srcSurface.flags = exaScratch.src_flags;
    srcSurface.pitch = exaScratch.src_pitch;

    dstSurface.width = pxDst->drawable.width;
    dstSurface.height = pxDst->drawable.height;
    dstSurface.format = exaScratch.src_format;
    dstSurface.pitch = msm_pixmap_get_pitch(pxDst);

    if (msm_pixmap_in_gem(pxDst)) {
        dstSurface.flags = MSM_BLIT_GEM;
        dstSurface.priv[0] = (unsigned long) msm_get_pixmap_bo(pxDst);
    } else {
        dstSurface.flags = MSM_BLIT_FB;
        dstSurface.priv[0] = msm_pixmap_offset(pxDst);
    }

    srcRect.x = srcX;
    srcRect.y = srcY;
    srcRect.w = w;
    srcRect.h = h;

    dstRect.x = dstX;
    dstRect.y = dstY;
    dstRect.w = w;
    dstRect.h = h;

    HWOrSWBlit(pMsm, w, h, &srcSurface, &dstSurface, &srcRect, &dstRect, bpp);
}

/* This function takes apart the transform attached to the picture,
 * and tries to figure out what we should do with it. The possible
 * options are: translate, rotate and scale. We can translate anywhere,
 * we can only rotate in 90 degree increments, and there is a limitation
 * on scaling (MDP supports a scaling range of 0.25X to 4.0X).
 */

static Bool
MSMCheckTransform(PicturePtr pPict)
{
    PictTransformPtr t = pPict->transform;

    /* Check for a simple translate */
    if (t->matrix[0][0] == FIXED(1) &&
        t->matrix[0][1] == FIXED(0) &&
        t->matrix[1][0] == FIXED(0) &&
        t->matrix[1][1] == FIXED(1) &&
        t->matrix[2][0] == FIXED(0) &&
        t->matrix[2][1] == FIXED(0) &&
        t->matrix[2][2] == FIXED(1))
        return TRUE;

    /* FIXME: Grok stretches and rotates too */
    return FALSE;
}

static void
MSMTransformPoint(PictTransform * t, int *x, int *y)
{
    PictVector v;

    if (t == NULL)
        return;

    v.vector[0] = FIXED(*x);
    v.vector[1] = FIXED(*y);
    v.vector[2] = xFixed1;

    /* PictureTransformPoint uses pixman which in turn uses fixed math
     * to calculate the point. This should be faster than using the
     * floating point emulation, even with the additional function calls */
    PictureTransformPoint(t, &v);

    *x = INT(v.vector[0]);
    *y = INT(v.vector[1]);
}
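/*
 * For illustration (assumed values, not driver data): the only transform
 * that MSMCheckTransform() above accepts is a pure translation, i.e. a
 * 3x3 matrix in 16.16 fixed point of the form
 *
 *   | FIXED(1)  FIXED(0)  tx       |
 *   | FIXED(0)  FIXED(1)  ty       |
 *   | FIXED(0)  FIXED(0)  FIXED(1) |
 *
 * where tx = t->matrix[0][2] and ty = t->matrix[1][2] may be any fixed
 * point values. For such a matrix, MSMTransformPoint() reduces to adding
 * tx and ty to the source coordinates.
 */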
static Bool
MSMCheckComposite(int op, PicturePtr pSrc, PicturePtr pMsk, PicturePtr pDst)
{
    /* Only support src copies - I guess technically, we could also support
     * PictOpDst?
     */
    if (op != PictOpSrc) {
        return FALSE;
    }

    /* Do not support masks */
    if (pMsk != NULL) {
        return FALSE;
    }

    exaScratch.src_format = msm_lookup_format(pSrc);
    exaScratch.dst_format = msm_lookup_format(pDst);

    /* Leave if we don't support the source or destination format */
    if (exaScratch.src_format == MDP_IMGTYPE_LIMIT ||
        exaScratch.dst_format == MDP_IMGTYPE_LIMIT) {
        return FALSE;
    }

    /* With DRM in KMEM, we can no longer use the MDP for color conversion
     * blits, sorry */
    if (exaScratch.src_format != exaScratch.dst_format)
        return FALSE;

    if (pDst->transform) {
        return FALSE;
    }

    /* Verify that we can accelerate the transform */
    if (pSrc->transform && !MSMCheckTransform(pSrc))
        return FALSE;

    exaScratch.transform = pSrc->transform;

    /* TODO: Exit if a filter is set (is this only needed with scaling?). */

    return TRUE;
}

static Bool
MSMPrepareComposite(int op, PicturePtr pSrc, PicturePtr pMsk,
                    PicturePtr pDst, PixmapPtr pxSrc, PixmapPtr pxMsk,
                    PixmapPtr pxDst)
{
    /* Check if repeat flag is set and fail if repeats are not allowed. */
    MSMPtr pMsm = MSMPTR_FROM_PIXMAP(pxDst);

    if (pSrc->repeat && !(pMsm->FastCompositeRepeat))
        return FALSE;

    hwBlitReset();

    /* Remember the surface information */
    exaScratch.src_width = pxSrc->drawable.width;
    exaScratch.src_height = pxSrc->drawable.height;
    exaScratch.src_pitch = msm_pixmap_get_pitch(pxSrc);

    if (msm_pixmap_in_gem(pxSrc)) {
        exaScratch.src_flags = MSM_BLIT_GEM;
        exaScratch.src_priv = (unsigned long) msm_get_pixmap_bo(pxSrc);
    } else {
        exaScratch.src_flags = MSM_BLIT_FB;
        exaScratch.src_priv = msm_pixmap_offset(pxSrc);
    }

    /* Set a flag if this operation needs to be repeated */
    exaScratch.flags |= (pSrc->repeat) ? EXA_SCRATCH_REPEAT : 0;
    exaScratch.repeatType = pSrc->repeatType;

    return TRUE;
}
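/*
 * Worked example of the repeat loop in MSMDoComposite() below (assumed
 * numbers, for illustration only): with an 8x8 repeating source and a
 * 20x10 destination area at (dstX,dstY) = (0,0), w = h = 8 and the loop
 * emits clipped blits at (0,0) 8x8, (8,0) 8x8 and (16,0) 4x8, then wraps
 * dx back to dstX and steps dy by dh, emitting (0,8) 8x2, (8,8) 8x2 and
 * (16,8) 4x2 before terminating.
 */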
static void
MSMDoComposite(PixmapPtr pxDst, int srcX, int srcY, int maskX, int maskY,
               int dstX, int dstY, int width, int height)
{
    MSMPtr pMsm = MSMPTR_FROM_PIXMAP(pxDst);
    int bpp = pxDst->drawable.bitsPerPixel;
    int dx = dstX;
    int dy = dstY;
    int w, h;

    /* Transform the source point */
    MSMTransformPoint(exaScratch.transform, &srcX, &srcY);

    /* FIXME: Sometimes srcX and srcY are negative, and
     * there is no accompanying transform. This is the best
     * we can do to avoid bugs, but this still fails the
     * cairo paint-repeat test */

    if (srcX < 0) {
        width += srcX;
        srcX = 0;
    }

    if (srcY < 0) {
        /* Clamping srcY shrinks the height, not the width */
        height += srcY;
        srcY = 0;
    }

    if (width < 0 || height < 0)
        return;

    w = (exaScratch.src_width < width) ? exaScratch.src_width : width;
    h = (exaScratch.src_height < height) ? exaScratch.src_height : height;

    while (1) {
        /* Clip the operation to make sure it stays within bounds */
        int dw = (dx + w > dstX + width) ? dstX + width - dx : w;
        int dh = (dy + h > dstY + height) ? dstY + height - dy : h;

        MSMBlitSurface srcSurface, dstSurface;
        MSMBlitRect srcRect, dstRect;

        srcSurface.width = exaScratch.src_width;
        srcSurface.height = exaScratch.src_height;
        srcSurface.format = exaScratch.src_format;
        srcSurface.priv[0] = exaScratch.src_priv;
        srcSurface.flags = exaScratch.src_flags;
        srcSurface.pitch = exaScratch.src_pitch;

        dstSurface.width = pxDst->drawable.width;
        dstSurface.height = pxDst->drawable.height;
        dstSurface.format = exaScratch.dst_format;
        dstSurface.pitch = msm_pixmap_get_pitch(pxDst);

        if (msm_pixmap_in_gem(pxDst)) {
            dstSurface.flags = MSM_BLIT_GEM;
            dstSurface.priv[0] = (unsigned long) msm_get_pixmap_bo(pxDst);
        } else {
            dstSurface.flags = MSM_BLIT_FB;
            dstSurface.priv[0] = msm_pixmap_offset(pxDst);
        }

        srcRect.x = srcX;
        srcRect.y = srcY;
        srcRect.w = dw;
        srcRect.h = dh;

        dstRect.x = dx;
        dstRect.y = dy;
        dstRect.w = dw;
        dstRect.h = dh;

        /* All the operations are straight copies, so use software or
         * hardware rendering, depending on which is most efficient. */
        HWOrSWBlit(pMsm, w, h, &srcSurface, &dstSurface, &srcRect, &dstRect,
                   bpp);

        /* If the repeat flag isn't set, then we are done */
        if (!(exaScratch.flags & EXA_SCRATCH_REPEAT))
            break;

        dx += dw;

        if (dx >= dstX + width) {
            dx = dstX;
            dy += dh;
        }

        if (dy >= dstY + height)
            break;
    }
}

static void
MSMDone(PixmapPtr ptr)
{
    MSMPtr pMsm = MSMPTR_FROM_PIXMAP(ptr);

    hwBlitFlush(pMsm);
}

/* Upload bytes from a source address to a pixmap in on-screen memory. */

Bool
MSMUploadToScreen(PixmapPtr pxDst, int dstX, int dstY, int w, int h,
                  char *src, int srcPitch)
{
    if (pxDst->drawable.bitsPerPixel == 16 ||
        pxDst->drawable.bitsPerPixel == 24 ||
        pxDst->drawable.bitsPerPixel == 32) {
        int bitsPerPixel = pxDst->drawable.bitsPerPixel;
        int bytesPerPixel = bitsPerPixel / 8;
        MSMPtr pMsm = MSMPTR_FROM_PIXMAP(pxDst);
        int dstPitch = msm_pixmap_get_pitch(pxDst);
        int dstOffset = msm_pixmap_offset(pxDst);
        char *dst = (char *) (pMsm->fbmem) + dstOffset +
            dstY * dstPitch + dstX * bytesPerPixel;
        BOOL blockSignalsForVFP = !(pMsm->NoSigBlock);

        swBlit_NoOverlap((unsigned char* __restrict__) dst,
                         (unsigned char* __restrict__) src,
                         w, h, dstPitch, srcPitch, bitsPerPixel,
                         blockSignalsForVFP);
        return TRUE;
    } else
        return FALSE;
}

/* Download bytes from a pixmap in on-screen memory to a destination
   address. */

Bool
MSMDownloadFromScreen(PixmapPtr pxSrc, int srcX, int srcY, int w, int h,
                      char *dst, int dstPitch)
{
    if (pxSrc->drawable.bitsPerPixel == 16 ||
        pxSrc->drawable.bitsPerPixel == 24 ||
        pxSrc->drawable.bitsPerPixel == 32) {
        int bitsPerPixel = pxSrc->drawable.bitsPerPixel;
        int bytesPerPixel = bitsPerPixel / 8;
        MSMPtr pMsm = MSMPTR_FROM_PIXMAP(pxSrc);
        int srcPitch = msm_pixmap_get_pitch(pxSrc);
        int srcOffset = msm_pixmap_offset(pxSrc);
        char *src = (char *) (pMsm->fbmem) + srcOffset +
            srcY * srcPitch + srcX * bytesPerPixel;
        BOOL blockSignalsForVFP = !(pMsm->NoSigBlock);

        swBlit_NoOverlap((unsigned char* __restrict__) dst,
                         (unsigned char* __restrict__) src,
                         w, h, dstPitch, srcPitch, bitsPerPixel,
                         blockSignalsForVFP);
        return TRUE;
    } else
        return FALSE;
}

static void
MSMWaitMarker(ScreenPtr pScreen, int marker)
{
}
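/*
 * Worked example of the address math in MSMUploadToScreen() above
 * (assumed numbers, for illustration only): at 16bpp with a destination
 * pitch of 2048 bytes, uploading to (dstX,dstY) = (10,3) gives
 *
 *   dst = fbmem + dstOffset + 3 * 2048 + 10 * 2
 *       = fbmem + dstOffset + 6164
 *
 * after which swBlit_NoOverlap() copies h rows of w * 2 bytes, advancing
 * the destination by dstPitch and the source by srcPitch per row.
 */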
static Bool
MSMPixmapIsOffscreen(PixmapPtr pPixmap)
{
    ScreenPtr pScreen = pPixmap->drawable.pScreen;
    struct msm_pixmap_priv *priv;

    if (pScreen->GetScreenPixmap(pScreen) == pPixmap)
        return TRUE;

    priv = exaGetPixmapDriverPrivate(pPixmap);

    if (priv && priv->bo) {
        if (msm_drm_bo_get_memtype(priv->bo) == MSM_DRM_MEMTYPE_EBI)
            return TRUE;

        return pPixmap->devPrivate.ptr ? FALSE : TRUE;
    }

    return FALSE;
}

static Bool
MSMPrepareAccess(PixmapPtr pPixmap, int index)
{
    struct msm_pixmap_priv *priv;

    priv = exaGetPixmapDriverPrivate(pPixmap);

    if (!priv)
        return FALSE;

    if (!priv->bo)
        return TRUE;

    if (msm_drm_bo_map(priv->bo))
        return FALSE;

    if (pPixmap->devPrivate.ptr == NULL)
        pPixmap->devPrivate.ptr = (void *) priv->bo->virt;

    /* Technically we should do this for all depths, but that seems to
       freak out the mouse cursor, so just do the adjustment for 16bpp */
    if (pPixmap->drawable.bitsPerPixel == 16) {
        priv->SavedPitch = pPixmap->devKind;

        pPixmap->devKind = ((pPixmap->drawable.width + 31) & ~31) *
            (pPixmap->drawable.bitsPerPixel >> 3);
    }

    return TRUE;
}

static void
MSMFinishAccess(PixmapPtr pPixmap, int index)
{
    struct msm_pixmap_priv *priv;

    priv = exaGetPixmapDriverPrivate(pPixmap);

    if (priv && priv->SavedPitch) {
        pPixmap->devKind = priv->SavedPitch;
        priv->SavedPitch = 0;
    }
}

static void *
MSMCreatePixmap(ScreenPtr pScreen, int size, int align)
{
    struct msm_pixmap_priv *priv;
    ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
    MSMPtr pMsm = MSMPTR(pScrn);

    priv = xcalloc(1, sizeof(struct msm_pixmap_priv));

    if (priv == NULL)
        return NULL;

    if (!size)
        return priv;

    priv->bo = msm_drm_bo_create_memtype(pMsm->drmFD, size,
                                         pMsm->pixmapMemtype);

    if (priv->bo)
        return priv;

    xfree(priv);
    return NULL;
}

static void
MSMDestroyPixmap(ScreenPtr pScreen, void *dpriv)
{
    struct msm_pixmap_priv *priv = dpriv;

    if (!dpriv)
        return;

    if (priv->bo)
        msm_drm_bo_free(priv->bo);

    xfree(dpriv);
}

Bool
MSMSetupExa(ScreenPtr pScreen)
{
    ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
    MSMPtr pMsm = MSMPTR(pScrn);
    ExaDriverPtr pExa;

    if (pMsm->pExa == NULL)
        pMsm->pExa = exaDriverAlloc();

    if (pMsm->pExa == NULL)
        return FALSE;

    pExa = pMsm->pExa;

    /* This is the current major/minor that we support */

    pExa->exa_major = 2;
    pExa->exa_minor = 2;

    pExa->memoryBase = pMsm->fbmem;

    pExa->maxX = pMsm->mode_info.xres_virtual;
    pExa->maxY = pMsm->mode_info.yres_virtual;

    pExa->flags = EXA_OFFSCREEN_PIXMAPS;

    pExa->offScreenBase =
        (pMsm->fixed_info.line_length * pMsm->mode_info.yres);

    pExa->memorySize = pMsm->fixed_info.smem_len;

    /* Align pixmap offsets along page boundaries */
    pExa->pixmapOffsetAlign = 4096;

    /* Align pixmap pitches to the maximum needed alignment for the
       GPU - this ensures that we have enough room, and we adjust the
       pitches down to the depth later */
    pExa->pixmapPitchAlign = 128;

    pExa->PrepareSolid = MSMPrepareSolid;
    if (pMsm->FastFill) {
        /* The performance of the solid fill functions may be tested with:
           x11perf -rectX (where "X" is 10, 100 or 500)
           x11perf -trapX (where "X" is 10, 100 or 300)
           x11perf -fcircleX (where "X" is 1, 10, 100 or 500)
           (there are many more x11perf tests that do solid fill) */
        pExa->Solid = MSMSolid;
        pExa->DoneSolid = MSMDoneSolid;
    }

    /* Accelerated copy function handlers.
       The performance of the copy functions may be tested with:
       x11perf -scrollX where "X" is 10, 100 or 500. */
    pExa->PrepareCopy = MSMPrepareCopy;
    pExa->Copy = MSMDoCopy;
    pExa->DoneCopy = MSMDone;

    pExa->WaitMarker = MSMWaitMarker;

    /* Accelerated compositing handler functions */
    if (pMsm->FastComposite) {
        pExa->CheckComposite = MSMCheckComposite;
        pExa->PrepareComposite = MSMPrepareComposite;
        pExa->Composite = MSMDoComposite;
        pExa->DoneComposite = MSMDone;
    }

    /* UploadToScreen and DownloadFromScreen implementations can copy
       rectangular regions much faster than memcpy() because they are
       using Neon optimizations.
       FIXME: Unfortunately, this code currently causes diagonal
       artifacts on the screen for some reason.
*/ if (pMsm->FastAppFBMemCopy) { /* The performance of these functions may be tested with: x11perf -copypixwinX (for UploadToScreen()) x11perf -copywinpixX (for DownloadFromScreen()) where "X" is 10, 100 or 500. */ pExa->DownloadFromScreen = MSMDownloadFromScreen; pExa->UploadToScreen = MSMUploadToScreen; } #if USEDRI2 if (pMsm->useDRI2) { pExa->flags |= EXA_HANDLES_PIXMAPS; pExa->PixmapIsOffscreen = MSMPixmapIsOffscreen; pExa->CreatePixmap = MSMCreatePixmap; pExa->DestroyPixmap = MSMDestroyPixmap; pExa->PrepareAccess = MSMPrepareAccess; pExa->FinishAccess = MSMFinishAccess; } #endif return exaDriverInit(pScreen, pMsm->pExa); } xf86-video-msm/src/msm-hwrender.c0000644000175000017500000002455211615776600017043 0ustar paulliupaulliu/* msm-hwrender.c * * Copyright (c) 2009, Code Aurora Forum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Code Aurora nor * the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include
#include

#include "msm.h"
#include "msm-drm.h"
#include "msm-render.h"

#define MSM_MAX_BLITS 8
#define DEBUG_MSM_BLIT 1

static struct {
    unsigned int count;
    struct mdp_blit_req req[MSM_MAX_BLITS];
} mdp_blit_list;

#if DEBUG_MSM_BLIT
static char * const
GetPixelFormatString(char * const dest, const uint32_t pixelFormat)
{
    switch (pixelFormat) {
    case MDP_RGB_565:     strcpy(dest, "rgb_565");        break;
    case MDP_XRGB_8888:   strcpy(dest, "xrgb_8888");      break;
    case MDP_Y_CBCR_H2V2: strcpy(dest, "YCbCr_H2V2");     break;
    case MDP_ARGB_8888:   strcpy(dest, "argb_8888");      break;
    case MDP_RGB_888:     strcpy(dest, "rgb_888");        break;
    case MDP_Y_CRCB_H2V2: strcpy(dest, "YCrCb_H2V2");     break;
    case MDP_YCRYCB_H2V1: strcpy(dest, "YCrYCb_H2V1");    break;
    case MDP_Y_CRCB_H2V1: strcpy(dest, "YCrCb_H2V1");     break;
    case MDP_Y_CBCR_H2V1: strcpy(dest, "YCbCr_H2V1");     break;
    case MDP_RGBA_8888:   strcpy(dest, "rgba_8888");      break;
    case MDP_BGRA_8888:   strcpy(dest, "bgra_8888");      break;
    case MDP_BGR_565:     strcpy(dest, "bgr_565");        break;
    case MDP_FB_FORMAT:   strcpy(dest, "FBFormat");       break;
    default:              strcpy(dest, "INVALID-FORMAT"); break;
    }
    return dest;
}
#endif

static void
DisplayMSMBlitError(MSMPtr pMsm, int count, struct mdp_blit_req *mdp_reqs)
{
#if DEBUG_MSM_BLIT
    // Report the error and give detailed info on exactly what was wrong.
    // Note that we don't show the complete blit request; e.g. some alpha,
    // mask and format details are omitted.
    ErrorF("Error %d while executing MSMFB_BLIT (BlitCount = %d):\n",
           errno, count);

    int bl = 0;
    for (bl = 0; bl < count; bl++) {
        // Display parameters if they differ from the default values, and
        // re-display them if they've changed from the previous setting.
        if ((bl == 0 && (mdp_reqs[bl].alpha != MDP_ALPHA_NOP
                         || mdp_reqs[bl].transp_mask != MDP_TRANSP_NOP
                         || mdp_reqs[bl].flags != 0
                         || mdp_reqs[bl].sharpening_strength != 64))
            || (bl > 0 && (mdp_reqs[bl].alpha != mdp_reqs[bl-1].alpha
                           || mdp_reqs[bl].transp_mask != mdp_reqs[bl-1].transp_mask
                           || mdp_reqs[bl].flags != mdp_reqs[bl-1].flags
                           || mdp_reqs[bl].sharpening_strength != mdp_reqs[bl-1].sharpening_strength))) {
            ErrorF(" Non-Default Params: ");
            if (mdp_reqs[bl].alpha != MDP_ALPHA_NOP
                || (bl > 0 && mdp_reqs[bl].alpha != mdp_reqs[bl-1].alpha))
                ErrorF("Alpha=0x%x, ", mdp_reqs[bl].alpha);
            if (mdp_reqs[bl].transp_mask != MDP_TRANSP_NOP
                || (bl > 0 && mdp_reqs[bl].transp_mask != mdp_reqs[bl-1].transp_mask))
                ErrorF("TranspMask=0x%x, ", mdp_reqs[bl].transp_mask);
            if (mdp_reqs[bl].flags != 0
                || (bl > 0 && mdp_reqs[bl].flags != mdp_reqs[bl-1].flags))
                ErrorF("Flags=0x%x, ", mdp_reqs[bl].flags);
            if (mdp_reqs[bl].sharpening_strength != 64
                || (bl > 0 && mdp_reqs[bl].sharpening_strength != mdp_reqs[bl-1].sharpening_strength))
                ErrorF("ShStrength=%d, ", mdp_reqs[bl].sharpening_strength);
            ErrorF("\n");
        }

        // Display the image portion of the request for the first request,
        // and re-display it when it changes.
        if (bl < 1
            || mdp_reqs[bl].src.offset != mdp_reqs[bl-1].src.offset
            || mdp_reqs[bl].src.width != mdp_reqs[bl-1].src.width
            || mdp_reqs[bl].src.height != mdp_reqs[bl-1].src.height
            || mdp_reqs[bl].src.format != mdp_reqs[bl-1].src.format
            || mdp_reqs[bl].dst.offset != mdp_reqs[bl-1].dst.offset
            || mdp_reqs[bl].dst.width != mdp_reqs[bl-1].dst.width
            || mdp_reqs[bl].dst.height != mdp_reqs[bl-1].dst.height
            || mdp_reqs[bl].dst.format != mdp_reqs[bl-1].dst.format) {
            char dstFormatString[100];
            char srcFormatString[100];
            GetPixelFormatString(dstFormatString, mdp_reqs[bl].dst.format);
            GetPixelFormatString(srcFormatString, mdp_reqs[bl].src.format);
            ErrorF(" Image: %u:(w=%u,h=%u)/%s --> %u:(w=%u,h=%u)/%s\n",
                   mdp_reqs[bl].src.offset, mdp_reqs[bl].src.width,
                   mdp_reqs[bl].src.height, srcFormatString,
                   mdp_reqs[bl].dst.offset, mdp_reqs[bl].dst.width,
                   mdp_reqs[bl].dst.height, dstFormatString);
        }

        // Display the rectangle coordinates and sizes that are actually
        // copied.
        ErrorF(" CopyRect: (%u,%u) --> (%u,%u) of w=%u,h=%u",
               mdp_reqs[bl].src_rect.x, mdp_reqs[bl].src_rect.y,
               mdp_reqs[bl].dst_rect.x, mdp_reqs[bl].dst_rect.y,
               mdp_reqs[bl].src_rect.w, mdp_reqs[bl].src_rect.h);
        if (mdp_reqs[bl].src_rect.w != mdp_reqs[bl].dst_rect.w
            || mdp_reqs[bl].src_rect.h != mdp_reqs[bl].dst_rect.h) {
            // This blit scales, so show the destination size.
            ErrorF(" (scaled to w=%u,h=%u)",
                   mdp_reqs[bl].dst_rect.w, mdp_reqs[bl].dst_rect.h);
        }

        // Check for a few invalid conditions and print an error for each
        // one that is present.
        if (mdp_reqs[bl].src_rect.x + mdp_reqs[bl].src_rect.w > mdp_reqs[bl].src.width)
            ErrorF(" -Err: sr.x+sr.w>s.width!");
        if (mdp_reqs[bl].dst_rect.x + mdp_reqs[bl].dst_rect.w > mdp_reqs[bl].dst.width)
            ErrorF(" -Err: dr.x+dr.w>d.width!");
        if (mdp_reqs[bl].src_rect.y + mdp_reqs[bl].src_rect.h > mdp_reqs[bl].src.height)
            ErrorF(" -Err: sr.y+sr.h>s.height!");
        if (mdp_reqs[bl].dst_rect.y + mdp_reqs[bl].dst_rect.h > mdp_reqs[bl].dst.height)
            ErrorF(" -Err: dr.y+dr.h>d.height!");
        ErrorF("\n");
    }
#else
    ErrorF("Error while executing MSMFB_BLIT\n");
#endif // DEBUG_MSM_BLIT
}

void
hwBlitFlush(MSMPtr pMsm)
{
    if (ioctl(pMsm->fd, MSMFB_BLIT, &mdp_blit_list))
        DisplayMSMBlitError(pMsm, mdp_blit_list.count,
                            &(mdp_blit_list.req[0]));

    mdp_blit_list.count = 0;
}

void
hwBlitReset(void)
{
    mdp_blit_list.count = 0;
}

static int
formatToCpp(MSMPtr pMsm, int format)
{
    switch (format) {
    case MDP_XRGB_8888:
        return 4;
    case MDP_RGB_888:
        return 3;
    case MDP_RGB_565:
        return 2;
    case MDP_YCRYCB_H2V1:
    case MDP_Y_CRCB_H2V2:
    case MDP_Y_CBCR_H2V2:
        return 2;
    case MDP_FB_FORMAT:
        return (pMsm->mode_info.bits_per_pixel >> 3);
    }

    return 0;
}

void
hwBlit(MSMPtr pMsm, MSMBlitRec *blit, int flags)
{
    int index = mdp_blit_list.count;

    mdp_blit_list.req[index].flags = flags;
    mdp_blit_list.req[index].alpha = 0xFF;
    mdp_blit_list.req[index].transp_mask = 0xFFFFFFFF;

    /* The width is actually the pitch / cpp */
    mdp_blit_list.req[index].src.width =
        blit->src->pitch / formatToCpp(pMsm, blit->src->format);
    mdp_blit_list.req[index].src.height = blit->src->height;
    mdp_blit_list.req[index].src.format = blit->src->format;

    if (blit->src->flags & MSM_BLIT_FB) {
        mdp_blit_list.req[index].src.offset = blit->src->priv[0];
        mdp_blit_list.req[index].src.memory_id = pMsm->fd;
    } else if (blit->src->flags & MSM_BLIT_PMEM) {
        mdp_blit_list.req[index].src.offset = blit->src->priv[1];
        mdp_blit_list.req[index].src.memory_id = blit->src->priv[0];
    }
#if MDP_BLIT_REQ_VERSION >= 2
    else {
        struct msm_drm_bo *bo = (struct msm_drm_bo *) blit->src->priv[0];
        mdp_blit_list.req[index].flags |= MDP_BLIT_SRC_GEM;
        mdp_blit_list.req[index].src.offset = 0;
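        /* For a GEM-backed source there is no framebuffer offset; the
         * kernel resolves the buffer from the DRM descriptor and the GEM
         * handle passed in src.priv below. */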
mdp_blit_list.req[index].src.memory_id = pMsm->drmFD; mdp_blit_list.req[index].src.priv = bo->handle; } #endif mdp_blit_list.req[index].dst.width = blit->dst->pitch / formatToCpp(pMsm, blit->dst->format); mdp_blit_list.req[index].dst.height = blit->dst->height; mdp_blit_list.req[index].dst.format = blit->dst->format; if (blit->dst->flags & MSM_BLIT_FB) { mdp_blit_list.req[index].dst.offset = blit->dst->priv[0]; mdp_blit_list.req[index].dst.memory_id = pMsm->fd; } #if MDP_BLIT_REQ_VERSION >= 2 else { struct msm_drm_bo *bo = (struct msm_drm_bo *) blit->dst->priv[0]; mdp_blit_list.req[index].flags |= MDP_BLIT_DST_GEM; mdp_blit_list.req[index].dst.offset = 0; mdp_blit_list.req[index].dst.memory_id = pMsm->drmFD; mdp_blit_list.req[index].dst.priv = bo->handle; } #endif mdp_blit_list.req[index].src_rect.x = blit->srcRect->x; mdp_blit_list.req[index].src_rect.y = blit->srcRect->y; mdp_blit_list.req[index].src_rect.w = blit->srcRect->w; mdp_blit_list.req[index].src_rect.h = blit->srcRect->h; mdp_blit_list.req[index].dst_rect.x = blit->dstRect->x; mdp_blit_list.req[index].dst_rect.y = blit->dstRect->y; mdp_blit_list.req[index].dst_rect.w = blit->dstRect->w; mdp_blit_list.req[index].dst_rect.h = blit->dstRect->h; mdp_blit_list.count++; if (mdp_blit_list.count == MSM_MAX_BLITS) hwBlitFlush(pMsm); } xf86-video-msm/src/msm-drm.h0000644000175000017500000000474711615776600016020 0ustar paulliupaulliu/* msm-drm.h * * Copyright (c) 2009, Code Aurora Forum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Code Aurora nor * the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef MSM_DRM_H_ #define MSM_DRM_H_ #define MSM_DRM_MEMTYPE_EBI 0 #define MSM_DRM_MEMTYPE_SMI 1 #define MSM_DRM_MEMTYPE_KMEM 2 #define MSM_DRM_MEMTYPE_KMEM_NOCACHE 3 struct msm_drm_bo { int fd; unsigned int name; int memtype; unsigned int size; unsigned int handle; void *virt; int ref; unsigned long long offset; }; int msm_drm_init(int fd); struct msm_drm_bo *msm_drm_bo_create(int fd, int size); int msm_drm_bo_flink(struct msm_drm_bo *bo, unsigned int *name); void msm_drm_bo_free(struct msm_drm_bo *bo); void msm_drm_bo_unmap(struct msm_drm_bo *bo); int msm_drm_bo_map(struct msm_drm_bo *bo); int msm_drm_bo_alloc(struct msm_drm_bo *bo); int msm_drm_bo_set_memtype(struct msm_drm_bo *bo, int type); int msm_drm_bo_get_memtype(struct msm_drm_bo *bo); struct msm_drm_bo *msm_drm_bo_create_memtype(int fd, int size, int type); #endif
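/*
 * Usage sketch (illustrative only; error handling omitted): the typical
 * lifetime of a buffer object as driven by the EXA layer in msm-exa.c.
 * "drmFD" stands in for an already-open DRM file descriptor and "size"
 * for the pixmap allocation size; both are assumptions for this example.
 *
 *   struct msm_drm_bo *bo;
 *
 *   msm_drm_init(drmFD);
 *   bo = msm_drm_bo_create_memtype(drmFD, size, MSM_DRM_MEMTYPE_EBI);
 *   if (bo != NULL) {
 *       if (msm_drm_bo_map(bo) == 0) {
 *           memset(bo->virt, 0, bo->size);  // CPU access via the mapping
 *           msm_drm_bo_unmap(bo);
 *       }
 *       msm_drm_bo_free(bo);
 *   }
 */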