Release_v0.3/ 0000775 0000000 0000000 00000000000 12231421770 0013203 5 ustar 00root root 0000000 0000000 Release_v0.3/.gitignore 0000664 0000000 0000000 00000000074 12231421770 0015174 0 ustar 00root root 0000000 0000000 *.o
CMakeCache.txt
CMakeFiles/
Makefile
cmake_install.cmake
Release_v0.3/CMake/ 0000775 0000000 0000000 00000000000 12231421770 0014163 5 ustar 00root root 0000000 0000000 Release_v0.3/CMake/CMakeConfigTemplate.hpp 0000664 0000000 0000000 00000001700 12231421770 0020474 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/* Include guard for this configuration header. */
#ifndef CMAKE_CONFIG_HPP
#define CMAKE_CONFIG_HPP
/* Map the ON / OFF spellings onto the C++ boolean literals.
 * NOTE(review): presumably so that CMake option values substituted into this
 * template compile as booleans - confirm against the users of this header. */
#define ON true
#define OFF false
/* Path ending in lib/i965/ below the install prefix. The prefix is written
 * as a CMake variable reference inside the string literal.
 * NOTE(review): presumably expanded when the build system generates the real
 * header from this template - confirm. */
#define GEN_INSTALLATION_PATH "${CMAKE_INSTALL_PREFIX}/lib/i965/"
#endif /* CMAKE_CONFIG_HPP */
Release_v0.3/CMake/FindDRM.cmake 0000664 0000000 0000000 00000001413 12231421770 0016407 0 ustar 00root root 0000000 0000000 #
# Locate the DRM (libdrm) header directory and library.
#
# Results:
#   DRM_INCLUDE_PATH - directory that contains drm.h
#   DRM_LIBRARY      - location of the DRM library
#   DRM_FOUND        - cached as 1 when drm.h was located, 0 otherwise
#
# DRM_FOUND is derived from the header lookup only; DRM_LIBRARY is not
# consulted. When the header is found its directory is also added to the
# directory-wide include path.
#
find_path(DRM_INCLUDE_PATH
  NAMES drm.h
  PATHS
    ~/include/libdrm/
    /usr/include/libdrm/
    /usr/local/include/libdrm/
    /sw/include/libdrm/
    /opt/local/include/libdrm/
  DOC "The directory where drm.h resides")

find_library(DRM_LIBRARY
  NAMES DRM drm
  PATHS
    ~/lib/
    /usr/lib64
    /usr/lib
    /usr/local/lib64
    /usr/local/lib
    /sw/lib
    /opt/local/lib
  DOC "The DRM library")

# Publish the result of the header lookup.
if(NOT DRM_INCLUDE_PATH)
  set(DRM_FOUND 0 CACHE STRING "Set to 1 if DRM is found, 0 otherwise")
else()
  include_directories(${DRM_INCLUDE_PATH})
  set(DRM_FOUND 1 CACHE STRING "Set to 1 if DRM is found, 0 otherwise")
endif()

mark_as_advanced(DRM_FOUND)
Release_v0.3/CMake/FindDRMIntel.cmake 0000664 0000000 0000000 00000001613 12231421770 0017405 0 ustar 00root root 0000000 0000000 #
# Locate the Intel DRM (libdrm_intel) header directory and library.
#
# Results:
#   DRM_INTEL_INCLUDE_PATH - directory that contains intel_bufmgr.h
#   DRM_INTEL_LIBRARY      - location of the drm_intel library
#   DRM_INTEL_FOUND        - cached as 1 when intel_bufmgr.h was located,
#                            0 otherwise
#
# DRM_INTEL_FOUND is derived from the header lookup only; DRM_INTEL_LIBRARY is
# not consulted. When the header is found its directory is also added to the
# directory-wide include path.
#
find_path(DRM_INTEL_INCLUDE_PATH
  NAMES intel_bufmgr.h
  PATHS
    ~/include/libdrm/
    /usr/include/libdrm/
    /usr/local/include/libdrm/
    /sw/include/libdrm/
    /opt/local/include/libdrm/
  DOC "The directory where intel_bufmgr.h resides")

find_library(DRM_INTEL_LIBRARY
  NAMES DRM_INTEL drm_intel
  PATHS
    ~/lib/
    /usr/lib64
    /usr/lib
    /usr/local/lib64
    /usr/local/lib
    /sw/lib
    /opt/local/lib
    /usr/lib/i386-linux-gnu/
  DOC "The DRM_INTEL library")

# Publish the result of the header lookup.
if(NOT DRM_INTEL_INCLUDE_PATH)
  set(DRM_INTEL_FOUND 0 CACHE STRING "Set to 1 if DRM_INTEL is found, 0 otherwise")
else()
  include_directories(${DRM_INTEL_INCLUDE_PATH})
  set(DRM_INTEL_FOUND 1 CACHE STRING "Set to 1 if DRM_INTEL is found, 0 otherwise")
endif()

mark_as_advanced(DRM_INTEL_FOUND)
Release_v0.3/CMake/FindEGL.cmake 0000664 0000000 0000000 00000003021 12231421770 0016371 0 ustar 00root root 0000000 0000000 #
# Try to find the EGL library and include path, and optionally a mesa
# source tree.
# Once done this will define
#
# EGL_FOUND            - cached as 1 when EGL/egl.h was located, 0 otherwise
# EGL_INCLUDE_PATH     - directory that contains EGL/egl.h
# EGL_LIBRARY          - location of the EGL library
# MESA_SOURCE_PREFIX   - root of a mesa source tree, when one was located
# MESA_SOURCE_INCLUDES - include directories inside that tree (only set
#                        when MESA_SOURCE_PREFIX was located)
# MESA_SOURCE_FOUND    - cached as 1 when the mesa tree was located,
#                        0 otherwise
#
# EGL_FOUND is derived from the header lookup only; EGL_LIBRARY is not
# consulted.
#
find_path(EGL_INCLUDE_PATH EGL/egl.h
  ~/include/
  /usr/include/
  /usr/local/include/
  /sw/include/
  /opt/local/include/
  DOC "The directory where EGL/egl.h resides")
find_library(EGL_LIBRARY
  NAMES EGL egl
  PATHS
  ~/lib/
  /usr/lib64
  /usr/lib
  /usr/local/lib64
  /usr/local/lib
  /sw/lib
  /opt/local/lib
  DOC "The EGL library")
if(EGL_INCLUDE_PATH)
  include_directories(${EGL_INCLUDE_PATH})
  set(EGL_FOUND 1 CACHE STRING "Set to 1 if EGL is found, 0 otherwise")
else()
  set(EGL_FOUND 0 CACHE STRING "Set to 1 if EGL is found, 0 otherwise")
endif()
# Find mesa source code. Candidates, in order: the MESA_SOURCE_DIR environment
# variable (read at configure time), a "mesa" directory next to the current
# source directory, and ~/mesa. The tree is recognised by the presence of
# src/mesa/main/texobj.c.
find_path(MESA_SOURCE_PREFIX src/mesa/main/texobj.c
  $ENV{MESA_SOURCE_DIR}
  ${CMAKE_CURRENT_SOURCE_DIR}/../mesa
  ~/mesa
  DOC "The mesa source directory which is needed for cl_khr_gl_sharing.")
if(MESA_SOURCE_PREFIX)
  set(MESA_SOURCE_INCLUDES ${MESA_SOURCE_PREFIX}/src/mesa
                           ${MESA_SOURCE_PREFIX}/include
                           ${MESA_SOURCE_PREFIX}/src/mapi
                           ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/i965/
                           ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/common/)
  set(MESA_SOURCE_FOUND 1 CACHE STRING "Set to 1 if mesa source code is found, 0 otherwise")
else()
  set(MESA_SOURCE_FOUND 0 CACHE STRING "Set to 1 if mesa source code is found, 0 otherwise")
endif()
mark_as_advanced(EGL_FOUND)
Release_v0.3/CMake/FindGBE.cmake 0000664 0000000 0000000 00000001370 12231421770 0016364 0 ustar 00root root 0000000 0000000 #
# Locate the GBE header directory and library.
#
# Results:
#   GBE_INCLUDE_PATH - directory that contains gen/program.h
#   GBE_LIBRARY      - location of the GBE library
#   GBE_FOUND        - cached as 1 when gen/program.h was located, 0 otherwise
#
# GBE_FOUND is derived from the header lookup only; GBE_LIBRARY is not
# consulted. When the header is found its directory is also added to the
# directory-wide include path.
#
find_path(GBE_INCLUDE_PATH
  NAMES gen/program.h
  PATHS
    ~/include/
    /usr/include/
    /usr/local/include/
    /sw/include/
    /opt/local/include/
  DOC "The directory where gen/program.h resides")

find_library(GBE_LIBRARY
  NAMES GBE gbe
  PATHS
    ~/lib/
    /usr/lib64
    /usr/lib
    /usr/local/lib64
    /usr/local/lib
    /sw/lib
    /opt/local/lib
  DOC "The GBE library")

# Publish the result of the header lookup.
if(NOT GBE_INCLUDE_PATH)
  set(GBE_FOUND 0 CACHE STRING "Set to 1 if GBE is found, 0 otherwise")
else()
  include_directories(${GBE_INCLUDE_PATH})
  set(GBE_FOUND 1 CACHE STRING "Set to 1 if GBE is found, 0 otherwise")
endif()

mark_as_advanced(GBE_FOUND)
Release_v0.3/CMake/FindLLVM.cmake 0000664 0000000 0000000 00000006654 12231421770 0016553 0 ustar 00root root 0000000 0000000 # Find the native LLVM includes and library
#
# LLVM_INCLUDE_DIR - where to find llvm include files
# LLVM_LIBRARY_DIR - where to find llvm libs
# LLVM_CFLAGS - llvm compiler flags
# LLVM_LFLAGS - llvm linker flags
# LLVM_MODULE_LIBS - list of llvm libs for working with modules.
# LLVM_FOUND - True if llvm found.
# Locate llvm-config. When LLVM_INSTALL_DIR is given only that directory is
# searched (NO_DEFAULT_PATH); otherwise the default program search is used.
if (LLVM_INSTALL_DIR)
  find_program(LLVM_CONFIG_EXECUTABLE NAMES llvm-config-32 llvm-config-3.2 llvm-config-31 llvm-config-3.1 llvm-config-3.4 llvm-config DOC "llvm-config executable" PATHS ${LLVM_INSTALL_DIR} NO_DEFAULT_PATH)
else ()
  find_program(LLVM_CONFIG_EXECUTABLE NAMES llvm-config-32 llvm-config-3.2 llvm-config-31 llvm-config-3.1 llvm-config-3.4 llvm-config DOC "llvm-config executable")
endif ()

# Everything below depends on llvm-config, so a missing tool is fatal.
if (LLVM_CONFIG_EXECUTABLE)
  message(STATUS "LLVM llvm-config found at: ${LLVM_CONFIG_EXECUTABLE}")
else ()
  message(FATAL_ERROR "Could NOT find LLVM executable, please add -DLLVM_INSTALL_DIR=/path/to/llvm-config/ in cmake command")
endif ()

# Version check, performed only when the caller requested a version.
# NOTE(review): a requested minor version of 0 makes this condition false, so
# the check (and the -DLLVM_xx definition) is skipped for "x.0" requests.
if (LLVM_FIND_VERSION_MAJOR AND LLVM_FIND_VERSION_MINOR)
  # Versions are compared as "<major><minor>" without the dot, e.g. 3.1 -> 31.
  set(LLVM_FIND_VERSION_NODOT "${LLVM_FIND_VERSION_MAJOR}${LLVM_FIND_VERSION_MINOR}")
  execute_process(
    COMMAND ${LLVM_CONFIG_EXECUTABLE} --version
    OUTPUT_VARIABLE LLVM_VERSION
    OUTPUT_STRIP_TRAILING_WHITESPACE
  )
  # Keep only the leading "<major>.<minor>" and drop anything after it (patch
  # level, "svn" suffix, ...), so "3.4.1" and "3.4svn" both become "34".
  # The expansion is quoted so that empty llvm-config output still yields a
  # well-formed string() call; an empty result then fails the check below.
  string(REGEX REPLACE "^([0-9]+)\\.([0-9]+).*$" "\\1\\2" LLVM_VERSION_NODOT "${LLVM_VERSION}")
  if (LLVM_VERSION_NODOT VERSION_LESS LLVM_FIND_VERSION_NODOT)
    message(FATAL_ERROR "incompatible LLVM version ${LLVM_VERSION} required ${LLVM_FIND_VERSION}")
  else ()
    if (LLVM_VERSION_NODOT VERSION_EQUAL LLVM_FIND_VERSION_NODOT)
      message(STATUS "find stable LLVM version ${LLVM_VERSION}")
    else ()
      message(STATUS "find unstable LLVM version ${LLVM_VERSION}")
    endif ()
    # Expose the detected version to the sources as -DLLVM_<major><minor>.
    add_definitions("-DLLVM_${LLVM_VERSION_NODOT}")
  endif ()
endif ()
# Ask llvm-config for the build settings. Each query stores the tool's output,
# with trailing whitespace removed, in the named variable.

# Header directory.
execute_process(COMMAND ${LLVM_CONFIG_EXECUTABLE} --includedir
                OUTPUT_VARIABLE LLVM_INCLUDE_DIR
                OUTPUT_STRIP_TRAILING_WHITESPACE)

# Library directory.
execute_process(COMMAND ${LLVM_CONFIG_EXECUTABLE} --libdir
                OUTPUT_VARIABLE LLVM_LIBRARY_DIR
                OUTPUT_STRIP_TRAILING_WHITESPACE)

# Preprocessor flags.
execute_process(COMMAND ${LLVM_CONFIG_EXECUTABLE} --cppflags
                OUTPUT_VARIABLE LLVM_CFLAGS
                OUTPUT_STRIP_TRAILING_WHITESPACE)

# Linker flags.
execute_process(COMMAND ${LLVM_CONFIG_EXECUTABLE} --ldflags
                OUTPUT_VARIABLE LLVM_LFLAGS
                OUTPUT_STRIP_TRAILING_WHITESPACE)

# Libraries to link against.
execute_process(COMMAND ${LLVM_CONFIG_EXECUTABLE} --libs
                OUTPUT_VARIABLE LLVM_MODULE_LIBS
                OUTPUT_STRIP_TRAILING_WHITESPACE)
# add_one_lib(<name>)
#
# Looks up library <name>, with LLVM_LIBRARY_DIR as an extra search path, and
# appends the lookup result to CLANG_LIBRARIES in the caller's scope. The
# temporary CLANG_LIB cache entry is removed afterwards so that the next call
# performs a fresh search instead of reusing the cached result.
macro(add_one_lib name)
  find_library(CLANG_LIB NAMES ${name} PATHS ${LLVM_LIBRARY_DIR})
  set(CLANG_LIBRARIES ${CLANG_LIBRARIES} ${CLANG_LIB})
  unset(CLANG_LIB CACHE)
endmacro()

# Assume the clang libraries live in the same directory as the LLVM ones.
# The list is processed in order and clangSema appears twice on purpose of
# preserving the original sequence.
# NOTE(review): the repetition presumably satisfies static link ordering -
# confirm before de-duplicating.
foreach(clang_component
    clangFrontend
    clangSerialization
    clangDriver
    clangCodeGen
    clangSema
    clangStaticAnalyzerFrontend
    clangStaticAnalyzerCheckers
    clangStaticAnalyzerCore
    clangAnalysis
    clangEdit
    clangAST
    clangParse
    clangSema
    clangLex
    clangBasic)
  add_one_lib("${clang_component}")
endforeach()
Release_v0.3/CMake/FindOCLIcd.cmake 0000664 0000000 0000000 00000001140 12231421770 0017017 0 ustar 00root root 0000000 0000000 #
# Locate the ocl_icd header directory.
#
# Results:
#   OCLIcd_INCLUDE_PATH - directory that contains ocl_icd.h
#   OCLIcd_FOUND        - cached as 1 when ocl_icd.h was located, 0 otherwise
#
# Only the header is searched for; no library lookup is performed here. When
# the header is found its directory is also added to the directory-wide
# include path.
#
find_path(OCLIcd_INCLUDE_PATH
  NAMES ocl_icd.h
  PATHS
    ~/include/
    /usr/include/
    /usr/local/include/
    /sw/include/
    /opt/local/include/
  DOC "The directory where ocl_icd.h resides")

# Publish the result of the header lookup.
if(NOT OCLIcd_INCLUDE_PATH)
  set(OCLIcd_FOUND 0 CACHE STRING "Set to 1 if OCLIcd is found, 0 otherwise")
else()
  include_directories(${OCLIcd_INCLUDE_PATH})
  set(OCLIcd_FOUND 1 CACHE STRING "Set to 1 if OCLIcd is found, 0 otherwise")
endif()

mark_as_advanced(OCLIcd_FOUND)
Release_v0.3/CMake/FindXext.cmake 0000664 0000000 0000000 00000001324 12231421770 0016716 0 ustar 00root root 0000000 0000000 #
# Locate the Xext header directory and library.
#
# Results:
#   XEXT_INCLUDE_PATH - directory that contains X11/extensions/Xext.h
#   XEXT_LIBRARY      - location of the Xext library
#   XEXT_FOUND        - cached as 1 when the header was located, 0 otherwise
#
# XEXT_FOUND is derived from the header lookup only; XEXT_LIBRARY is not
# consulted. When the header is found its directory is also added to the
# directory-wide include path.
#
find_path(XEXT_INCLUDE_PATH
  NAMES X11/extensions/Xext.h
  PATHS
    /usr/include
    /usr/local/include
    /sw/include
    /opt/local/include
  DOC "The directory where Xext.h resides")

find_library(XEXT_LIBRARY
  NAMES XEXT Xext
  PATHS
    /usr/lib64
    /usr/lib
    /usr/local/lib64
    /usr/local/lib
    /sw/lib
    /opt/local/lib
  DOC "The XEXT library")

# Publish the result of the header lookup.
if(NOT XEXT_INCLUDE_PATH)
  set(XEXT_FOUND 0 CACHE STRING "Set to 1 if XEXT is found, 0 otherwise")
else()
  include_directories(${XEXT_INCLUDE_PATH})
  set(XEXT_FOUND 1 CACHE STRING "Set to 1 if XEXT is found, 0 otherwise")
endif()

mark_as_advanced(XEXT_FOUND)
Release_v0.3/CMake/FindXfixes.cmake 0000664 0000000 0000000 00000001372 12231421770 0017237 0 ustar 00root root 0000000 0000000 #
# Locate the Xfixes header directory and library.
#
# Results:
#   XFIXES_INCLUDE_PATH - directory that contains X11/extensions/Xfixes.h
#   XFIXES_LIBRARY      - location of the Xfixes library
#   XFIXES_FOUND        - cached as 1 when the header was located, 0 otherwise
#
# XFIXES_FOUND is derived from the header lookup only; XFIXES_LIBRARY is not
# consulted. When the header is found its directory is also added to the
# directory-wide include path.
#
find_path(XFIXES_INCLUDE_PATH
  NAMES X11/extensions/Xfixes.h
  PATHS
    /usr/include
    /usr/local/include
    /sw/include
    /opt/local/include
  DOC "The directory where Xfixes.h resides")

find_library(XFIXES_LIBRARY
  NAMES XFIXES Xfixes
  PATHS
    /usr/lib64
    /usr/lib
    /usr/local/lib64
    /usr/local/lib
    /sw/lib
    /opt/local/lib
  DOC "The XFIXES library")

# Publish the result of the header lookup.
if(NOT XFIXES_INCLUDE_PATH)
  set(XFIXES_FOUND 0 CACHE STRING "Set to 1 if XFIXES is found, 0 otherwise")
else()
  include_directories(${XFIXES_INCLUDE_PATH})
  set(XFIXES_FOUND 1 CACHE STRING "Set to 1 if XFIXES is found, 0 otherwise")
endif()

mark_as_advanced(XFIXES_FOUND)
Release_v0.3/CMakeLists.txt 0000664 0000000 0000000 00000010314 12231421770 0015742 0 ustar 00root root 0000000 0000000 #############################################################################
# INTEL CORPORATION PROPRIETARY INFORMATION #
# This software is supplied under the terms of a license agreement or #
# nondisclosure agreement with Intel Corporation and may not be copied #
# or disclosed except in accordance with the terms of that agreement. #
# Copyright (C) 2009 Intel Corporation. All Rights Reserved. #
#############################################################################
cmake_minimum_required(VERSION 2.6.0)
project(OCL)

# Driver and OpenCL C version numbers.
# NOTE(review): presumably consumed by the src/OCLConfig.h.in template
# configured below - confirm against the template.
set (LIBCL_DRIVER_VERSION_MAJOR 0)
set (LIBCL_DRIVER_VERSION_MINOR 3)
set (LIBCL_C_VERSION_MAJOR 1)
set (LIBCL_C_VERSION_MINOR 1)

# Relative paths: the input is taken from the current source directory and
# the output is written below the current binary directory.
configure_file (
  "src/OCLConfig.h.in"
  "src/OCLConfig.h"
)
include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})

set(CMAKE_VERBOSE_MAKEFILE "false")

# Make the bundled Find*.cmake modules visible to find_package(). The path is
# anchored at this project's own root so it also resolves when the project is
# added to a larger build with add_subdirectory(); for a top-level build it is
# the same directory as before.
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/CMake/")

# Hardware emulation switches; all are off by default.
set(EMULATE_IVB false CACHE BOOL "To emulate IVB")
set(EMULATE_SNB false CACHE BOOL "To emulate SNB")
set(EMULATE_HSW false CACHE BOOL "To emulate HSW")

# NOTE(review): $(USER) is make-variable syntax, not a CMake reference, so it
# is passed through to the generated build files unexpanded. Presumably it is
# meant to be resolved by make at build time - confirm before using other
# generators.
add_definitions(-D__$(USER)__)

# Force Release with debug info when the user did not choose a build type,
# then store the effective value in the cache.
if (NOT CMAKE_BUILD_TYPE)
  set (CMAKE_BUILD_TYPE RelWithDebInfo)
endif ()
set (CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "assure config" FORCE)
message(STATUS "Building mode: " ${CMAKE_BUILD_TYPE})

# Flags for the custom "DebugO0" build type.
set(CMAKE_CXX_FLAGS_DEBUGO0 "-O0 -g")
set(CMAKE_C_FLAGS_DEBUGO0 "-O0 -g")
# Pick the emulated GPU generation. The options are tested in the order
# HSW, IVB, SNB and the first enabled one wins; with none enabled the
# generation is defined as 0 and the simulator is not used.
if(EMULATE_HSW)
  set(USE_FULSIM "true")
  add_definitions(-DEMULATE_GEN=75)
elseif(EMULATE_IVB)
  set(USE_FULSIM "true")
  add_definitions(-DEMULATE_GEN=7)
elseif(EMULATE_SNB)
  set(USE_FULSIM "true")
  add_definitions(-DEMULATE_GEN=6)
else()
  set(USE_FULSIM "false")
  add_definitions(-DEMULATE_GEN=0)
endif()

# XXX now hard coded to enable the clamp to border workaround for IVB.
add_definitions(-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)

# Mirror the USE_FULSIM decision into a 0/1 preprocessor definition.
if(NOT USE_FULSIM)
  add_definitions(-DUSE_FULSIM=0)
else()
  add_definitions(-DUSE_FULSIM=1)
endif()

# Project-wide compiler flags; any flags already present are kept at the end.
set(CMAKE_CXX_FLAGS "-Wall -Wno-invalid-offsetof -mfpmath=sse -fno-rtti -Wcast-align -std=c++0x -msse2 -msse3 -mssse3 -msse4.1 ${CMAKE_CXX_FLAGS}")
set(CMAKE_C_FLAGS "-Wall -mfpmath=sse -msse2 -Wcast-align -msse2 -msse3 -mssse3 -msse4.1 ${CMAKE_C_FLAGS}")
# ocl_report_package(<found_var> <label>)
#
# Prints the status line "Looking for <label> - found" when the variable named
# by <found_var> evaluates to true, and "Looking for <label> - not found"
# otherwise.
macro(ocl_report_package found_var label)
  if(${found_var})
    message(STATUS "Looking for ${label} - found")
  else()
    message(STATUS "Looking for ${label} - not found")
  endif()
endmacro()

# Front end stuff we need
find_package(LLVM 3.1)

# XLib
find_package(X11)
ocl_report_package(X11_FOUND "XLib")

# DRM
find_package(DRM)
ocl_report_package(DRM_FOUND "DRM")

# OpenGL
find_package(OpenGL)

# Threads
find_package(Threads)

# DRM Intel
find_package(DRMIntel)
ocl_report_package(DRM_INTEL_FOUND "DRM Intel")

# Xext
find_package(Xext)
ocl_report_package(XEXT_FOUND "Xext")

# Xfixes
find_package(Xfixes)
ocl_report_package(XFIXES_FOUND "Xfixes")

# Gen-backend (compiler)
find_package(GBE)
ocl_report_package(GBE_FOUND "Gen-Backend")

# EGL; the same lookup also reports whether a mesa source tree is available.
find_package(EGL)
ocl_report_package(EGL_FOUND "EGL")
if(MESA_SOURCE_FOUND)
  message(STATUS "Looking for mesa source code - found")
else()
  message(STATUS "Looking for mesa source code - not found, cl_khr_gl_sharing will be disabled.")
endif()

# OpenCL ICD header
find_package(OCLIcd)
ocl_report_package(OCLIcd_FOUND "OCL ICD header file")

# Python interpreter
find_package(PythonInterp)

# Sub-projects, processed in this order.
add_subdirectory(include)
add_subdirectory(backend)
add_subdirectory(src)
add_subdirectory(utests)
Release_v0.3/COPYING 0000664 0000000 0000000 00000063642 12231421770 0014251 0 ustar 00root root 0000000 0000000 GNU LESSER GENERAL PUBLIC LICENSE
Version 2.1, February 1999
Copyright (C) 1991, 1999 Free Software Foundation, Inc.
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
[This is the first released version of the Lesser GPL. It also counts
as the successor of the GNU Library Public License, version 2, hence
the version number 2.1.]
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
Licenses are intended to guarantee your freedom to share and change
free software--to make sure the software is free for all its users.
This license, the Lesser General Public License, applies to some
specially designated software packages--typically libraries--of the
Free Software Foundation and other authors who decide to use it. You
can use it too, but we suggest you first think carefully about whether
this license or the ordinary General Public License is the better
strategy to use in any particular case, based on the explanations below.
When we speak of free software, we are referring to freedom of use,
not price. Our General Public Licenses are designed to make sure that
you have the freedom to distribute copies of free software (and charge
for this service if you wish); that you receive source code or can get
it if you want it; that you can change the software and use pieces of
it in new free programs; and that you are informed that you can do
these things.
To protect your rights, we need to make restrictions that forbid
distributors to deny you these rights or to ask you to surrender these
rights. These restrictions translate to certain responsibilities for
you if you distribute copies of the library or if you modify it.
For example, if you distribute copies of the library, whether gratis
or for a fee, you must give the recipients all the rights that we gave
you. You must make sure that they, too, receive or can get the source
code. If you link other code with the library, you must provide
complete object files to the recipients, so that they can relink them
with the library after making changes to the library and recompiling
it. And you must show them these terms so they know their rights.
We protect your rights with a two-step method: (1) we copyright the
library, and (2) we offer you this license, which gives you legal
permission to copy, distribute and/or modify the library.
To protect each distributor, we want to make it very clear that
there is no warranty for the free library. Also, if the library is
modified by someone else and passed on, the recipients should know
that what they have is not the original version, so that the original
author's reputation will not be affected by problems that might be
introduced by others.
Finally, software patents pose a constant threat to the existence of
any free program. We wish to make sure that a company cannot
effectively restrict the users of a free program by obtaining a
restrictive license from a patent holder. Therefore, we insist that
any patent license obtained for a version of the library must be
consistent with the full freedom of use specified in this license.
Most GNU software, including some libraries, is covered by the
ordinary GNU General Public License. This license, the GNU Lesser
General Public License, applies to certain designated libraries, and
is quite different from the ordinary General Public License. We use
this license for certain libraries in order to permit linking those
libraries into non-free programs.
When a program is linked with a library, whether statically or using
a shared library, the combination of the two is legally speaking a
combined work, a derivative of the original library. The ordinary
General Public License therefore permits such linking only if the
entire combination fits its criteria of freedom. The Lesser General
Public License permits more lax criteria for linking other code with
the library.
We call this license the "Lesser" General Public License because it
does Less to protect the user's freedom than the ordinary General
Public License. It also provides other free software developers Less
of an advantage over competing non-free programs. These disadvantages
are the reason we use the ordinary General Public License for many
libraries. However, the Lesser license provides advantages in certain
special circumstances.
For example, on rare occasions, there may be a special need to
encourage the widest possible use of a certain library, so that it becomes
a de-facto standard. To achieve this, non-free programs must be
allowed to use the library. A more frequent case is that a free
library does the same job as widely used non-free libraries. In this
case, there is little to gain by limiting the free library to free
software only, so we use the Lesser General Public License.
In other cases, permission to use a particular library in non-free
programs enables a greater number of people to use a large body of
free software. For example, permission to use the GNU C Library in
non-free programs enables many more people to use the whole GNU
operating system, as well as its variant, the GNU/Linux operating
system.
Although the Lesser General Public License is Less protective of the
users' freedom, it does ensure that the user of a program that is
linked with the Library has the freedom and the wherewithal to run
that program using a modified version of the Library.
The precise terms and conditions for copying, distribution and
modification follow. Pay close attention to the difference between a
"work based on the library" and a "work that uses the library". The
former contains code derived from the library, whereas the latter must
be combined with the library in order to run.
GNU LESSER GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License Agreement applies to any software library or other
program which contains a notice placed by the copyright holder or
other authorized party saying it may be distributed under the terms of
this Lesser General Public License (also called "this License").
Each licensee is addressed as "you".
A "library" means a collection of software functions and/or data
prepared so as to be conveniently linked with application programs
(which use some of those functions and data) to form executables.
The "Library", below, refers to any such software library or work
which has been distributed under these terms. A "work based on the
Library" means either the Library or any derivative work under
copyright law: that is to say, a work containing the Library or a
portion of it, either verbatim or with modifications and/or translated
straightforwardly into another language. (Hereinafter, translation is
included without limitation in the term "modification".)
"Source code" for a work means the preferred form of the work for
making modifications to it. For a library, complete source code means
all the source code for all modules it contains, plus any associated
interface definition files, plus the scripts used to control compilation
and installation of the library.
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running a program using the Library is not restricted, and output from
such a program is covered only if its contents constitute a work based
on the Library (independent of the use of the Library in a tool for
writing it). Whether that is true depends on what the Library does
and what the program that uses the Library does.
1. You may copy and distribute verbatim copies of the Library's
complete source code as you receive it, in any medium, provided that
you conspicuously and appropriately publish on each copy an
appropriate copyright notice and disclaimer of warranty; keep intact
all the notices that refer to this License and to the absence of any
warranty; and distribute a copy of this License along with the
Library.
You may charge a fee for the physical act of transferring a copy,
and you may at your option offer warranty protection in exchange for a
fee.
2. You may modify your copy or copies of the Library or any portion
of it, thus forming a work based on the Library, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) The modified work must itself be a software library.
b) You must cause the files modified to carry prominent notices
stating that you changed the files and the date of any change.
c) You must cause the whole of the work to be licensed at no
charge to all third parties under the terms of this License.
d) If a facility in the modified Library refers to a function or a
table of data to be supplied by an application program that uses
the facility, other than as an argument passed when the facility
is invoked, then you must make a good faith effort to ensure that,
in the event an application does not supply such function or
table, the facility still operates, and performs whatever part of
its purpose remains meaningful.
(For example, a function in a library to compute square roots has
a purpose that is entirely well-defined independent of the
application. Therefore, Subsection 2d requires that any
application-supplied function or table used by this function must
be optional: if the application does not supply it, the square
root function must still compute square roots.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Library,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Library, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote
it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Library.
In addition, mere aggregation of another work not based on the Library
with the Library (or with a work based on the Library) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may opt to apply the terms of the ordinary GNU General Public
License instead of this License to a given copy of the Library. To do
this, you must alter all the notices that refer to this License, so
that they refer to the ordinary GNU General Public License, version 2,
instead of to this License. (If a newer version than version 2 of the
ordinary GNU General Public License has appeared, then you can specify
that version instead if you wish.) Do not make any other change in
these notices.
Once this change is made in a given copy, it is irreversible for
that copy, so the ordinary GNU General Public License applies to all
subsequent copies and derivative works made from that copy.
This option is useful when you wish to copy part of the code of
the Library into a program that is not a library.
4. You may copy and distribute the Library (or a portion or
derivative of it, under Section 2) in object code or executable form
under the terms of Sections 1 and 2 above provided that you accompany
it with the complete corresponding machine-readable source code, which
must be distributed under the terms of Sections 1 and 2 above on a
medium customarily used for software interchange.
If distribution of object code is made by offering access to copy
from a designated place, then offering equivalent access to copy the
source code from the same place satisfies the requirement to
distribute the source code, even though third parties are not
compelled to copy the source along with the object code.
5. A program that contains no derivative of any portion of the
Library, but is designed to work with the Library by being compiled or
linked with it, is called a "work that uses the Library". Such a
work, in isolation, is not a derivative work of the Library, and
therefore falls outside the scope of this License.
However, linking a "work that uses the Library" with the Library
creates an executable that is a derivative of the Library (because it
contains portions of the Library), rather than a "work that uses the
library". The executable is therefore covered by this License.
Section 6 states terms for distribution of such executables.
When a "work that uses the Library" uses material from a header file
that is part of the Library, the object code for the work may be a
derivative work of the Library even though the source code is not.
Whether this is true is especially significant if the work can be
linked without the Library, or if the work is itself a library. The
threshold for this to be true is not precisely defined by law.
If such an object file uses only numerical parameters, data
structure layouts and accessors, and small macros and small inline
functions (ten lines or less in length), then the use of the object
file is unrestricted, regardless of whether it is legally a derivative
work. (Executables containing this object code plus portions of the
Library will still fall under Section 6.)
Otherwise, if the work is a derivative of the Library, you may
distribute the object code for the work under the terms of Section 6.
Any executables containing that work also fall under Section 6,
whether or not they are linked directly with the Library itself.
6. As an exception to the Sections above, you may also combine or
link a "work that uses the Library" with the Library to produce a
work containing portions of the Library, and distribute that work
under terms of your choice, provided that the terms permit
modification of the work for the customer's own use and reverse
engineering for debugging such modifications.
You must give prominent notice with each copy of the work that the
Library is used in it and that the Library and its use are covered by
this License. You must supply a copy of this License. If the work
during execution displays copyright notices, you must include the
copyright notice for the Library among them, as well as a reference
directing the user to the copy of this License. Also, you must do one
of these things:
a) Accompany the work with the complete corresponding
machine-readable source code for the Library including whatever
changes were used in the work (which must be distributed under
Sections 1 and 2 above); and, if the work is an executable linked
with the Library, with the complete machine-readable "work that
uses the Library", as object code and/or source code, so that the
user can modify the Library and then relink to produce a modified
executable containing the modified Library. (It is understood
that the user who changes the contents of definitions files in the
Library will not necessarily be able to recompile the application
to use the modified definitions.)
b) Use a suitable shared library mechanism for linking with the
Library. A suitable mechanism is one that (1) uses at run time a
copy of the library already present on the user's computer system,
rather than copying library functions into the executable, and (2)
will operate properly with a modified version of the library, if
the user installs one, as long as the modified version is
interface-compatible with the version that the work was made with.
c) Accompany the work with a written offer, valid for at
least three years, to give the same user the materials
specified in Subsection 6a, above, for a charge no more
than the cost of performing this distribution.
d) If distribution of the work is made by offering access to copy
from a designated place, offer equivalent access to copy the above
specified materials from the same place.
e) Verify that the user has already received a copy of these
materials or that you have already sent this user a copy.
For an executable, the required form of the "work that uses the
Library" must include any data and utility programs needed for
reproducing the executable from it. However, as a special exception,
the materials to be distributed need not include anything that is
normally distributed (in either source or binary form) with the major
components (compiler, kernel, and so on) of the operating system on
which the executable runs, unless that component itself accompanies
the executable.
It may happen that this requirement contradicts the license
restrictions of other proprietary libraries that do not normally
accompany the operating system. Such a contradiction means you cannot
use both them and the Library together in an executable that you
distribute.
7. You may place library facilities that are a work based on the
Library side-by-side in a single library together with other library
facilities not covered by this License, and distribute such a combined
library, provided that the separate distribution of the work based on
the Library and of the other library facilities is otherwise
permitted, and provided that you do these two things:
a) Accompany the combined library with a copy of the same work
based on the Library, uncombined with any other library
facilities. This must be distributed under the terms of the
Sections above.
b) Give prominent notice with the combined library of the fact
that part of it is a work based on the Library, and explaining
where to find the accompanying uncombined form of the same work.
8. You may not copy, modify, sublicense, link with, or distribute
the Library except as expressly provided under this License. Any
attempt otherwise to copy, modify, sublicense, link with, or
distribute the Library is void, and will automatically terminate your
rights under this License. However, parties who have received copies,
or rights, from you under this License will not have their licenses
terminated so long as such parties remain in full compliance.
9. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Library or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Library (or any work based on the
Library), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Library or works based on it.
10. Each time you redistribute the Library (or any work based on the
Library), the recipient automatically receives a license from the
original licensor to copy, distribute, link with or modify the Library
subject to these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties with
this License.
11. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Library at all. For example, if a patent
license would not permit royalty-free redistribution of the Library by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Library.
If any portion of this section is held invalid or unenforceable under any
particular circumstance, the balance of the section is intended to apply,
and the section as a whole is intended to apply in other circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
12. If the distribution and/or use of the Library is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Library under this License may add
an explicit geographical distribution limitation excluding those countries,
so that distribution is permitted only in or among countries not thus
excluded. In such case, this License incorporates the limitation as if
written in the body of this License.
13. The Free Software Foundation may publish revised and/or new
versions of the Lesser General Public License from time to time.
Such new versions will be similar in spirit to the present version,
but may differ in detail to address new problems or concerns.
Each version is given a distinguishing version number. If the Library
specifies a version number of this License which applies to it and
"any later version", you have the option of following the terms and
conditions either of that version or of any later version published by
the Free Software Foundation. If the Library does not specify a
license version number, you may choose any version ever published by
the Free Software Foundation.
14. If you wish to incorporate parts of the Library into other free
programs whose distribution conditions are incompatible with these,
write to the author to ask for permission. For software which is
copyrighted by the Free Software Foundation, write to the Free
Software Foundation; we sometimes make exceptions for this. Our
decision will be guided by the two goals of preserving the free status
of all derivatives of our free software and of promoting the sharing
and reuse of software generally.
NO WARRANTY
15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
DAMAGES.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Libraries
If you develop a new library, and you want it to be of the greatest
possible use to the public, we recommend making it free software that
everyone can redistribute and change. You can do so by permitting
redistribution under these terms (or, alternatively, under the terms of the
ordinary General Public License).
To apply these terms, attach the following notices to the library. It is
safest to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least the
"copyright" line and a pointer to where the full notice is found.
<one line to give the library's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Also add information on how to contact you by electronic and paper mail.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the library, if
necessary. Here is a sample; alter the names:
Yoyodyne, Inc., hereby disclaims all copyright interest in the
library `Frob' (a library for tweaking knobs) written by James Random Hacker.
<signature of Ty Coon>, 1 April 1990
Ty Coon, President of Vice
That's all there is to it!
Release_v0.3/README.md 0000664 0000000 0000000 00000000223 12231421770 0014457 0 ustar 00root root 0000000 0000000 We host documents at the following wiki page:
[http://wiki.freedesktop.org/www/Software/Beignet](http://wiki.freedesktop.org/www/Software/Beignet)
Release_v0.3/backend/ 0000775 0000000 0000000 00000000000 12231421770 0014572 5 ustar 00root root 0000000 0000000 Release_v0.3/backend/CMakeLists.txt 0000664 0000000 0000000 00000011366 12231421770 0017341 0 ustar 00root root 0000000 0000000 project (GBE)
# Library version; substituted into GBEConfig.h by configure_file()
set(LIBGBE_VERSION_MAJOR 0)
set(LIBGBE_VERSION_MINOR 2)

# NOTE(review): cmake_minimum_required() is conventionally called before
# project(); here it follows it.
cmake_minimum_required(VERSION 2.6.0)

# Make the project's own CMake modules visible to include()/find_package()
set(GBE_CMAKE_DIR "${GBE_SOURCE_DIR}/cmake")
list(APPEND CMAKE_MODULE_PATH "${GBE_CMAKE_DIR}")
##############################################################
# Compilation directives
##############################################################
# Cache entry types are case-sensitive keywords. An unknown type such as
# "bool" is treated as STRING (recent CMake versions warn about the implicit
# conversion), so the type is spelled BOOL here.
set (GBE_DEBUG_MEMORY false CACHE BOOL "Activate the memory debugger")
set (GBE_USE_BLOB false CACHE BOOL "Compile everything from one big file")
##############################################################
# Compiler
##############################################################
if (UNIX)
  # The value is a compiler name, so the entry is a STRING; "INT" is not a
  # cache entry type.
  set (COMPILER "GCC" CACHE STRING "Compiler to choose on Linux (GCC,ICC,CLANG)")
endif (UNIX)
# Fall back to an optimised build with debug information when the user did
# not pick a build type
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE RelWithDebInfo)
endif()
# Record the effective build type in the cache
set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "assure config" FORCE)
message(STATUS "Building mode: ${CMAKE_BUILD_TYPE}")

# Preprocessor switch matching the GBE_DEBUG_MEMORY option
if(NOT GBE_DEBUG_MEMORY)
  set(GBE_DEBUG_MEMORY_FLAG "-DGBE_DEBUG_MEMORY=0")
else()
  set(GBE_DEBUG_MEMORY_FLAG "-DGBE_DEBUG_MEMORY=1")
endif()
# Hide all symbols and allows the symbols declared as visible to be exported
set (CMAKE_C_CXX_FLAGS "-fvisibility=hidden ${CMAKE_C_CXX_FLAGS}")

# Per-compiler flag sets. Each branch defines the common flags plus one flag
# set for every build configuration (Debug, RelWithDebInfo, MinSizeRel,
# Release).
if (COMPILER STREQUAL "GCC")
  # Flags shared by the C and C++ compilers
  set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -funroll-loops -Wstrict-aliasing=2 -fstrict-aliasing -msse2 -msse3 -mssse3 -msse4.1 -fPIC -Wall")
  set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} ${LLVM_CFLAGS}")
  # C++
  set (CMAKE_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -Wno-invalid-offsetof -fno-rtti -std=c++0x")
  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_DEBUG_MEMORY_FLAG}")
  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_COMPILE_UTESTS_FLAG}")
  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-E")
  set (CMAKE_SHARED_LINKER_FLAGS "-Wl,--no-undefined ${LLVM_LFLAGS}")
  set (CMAKE_CXX_FLAGS_DEBUG "-g -DGBE_DEBUG=1")
  set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
  set (CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG -DGBE_DEBUG=0")
  set (CMAKE_CXX_FLAGS_RELEASE "-O2 -DNDEBUG -DGBE_DEBUG=0")
  # C
  set (CMAKE_C_FLAGS "${CMAKE_C_CXX_FLAGS}")
  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${GBE_DEBUG_MEMORY_FLAG}")
  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${GBE_COMPILE_UTESTS_FLAG}")
  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wl,-E")
  set (CMAKE_C_FLAGS_DEBUG "-g -DGBE_DEBUG=1")
  set (CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
  set (CMAKE_C_FLAGS_MINSIZEREL "-Os -DNDEBUG -DGBE_DEBUG=0")
  set (CMAKE_C_FLAGS_RELEASE "-O2 -DNDEBUG -DGBE_DEBUG=0")
elseif (COMPILER STREQUAL "CLANG")
  # C
  set (CMAKE_C_COMPILER "clang")
  set (CMAKE_C_FLAGS "-Wall -std=c99")
  set (CMAKE_C_FLAGS_DEBUG "-g -DGBE_DEBUG=1")
  set (CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
  set (CMAKE_C_FLAGS_MINSIZEREL "-Os -DNDEBUG -DGBE_DEBUG=0")
  set (CMAKE_C_FLAGS_RELEASE "-O2 -DNDEBUG -DGBE_DEBUG=0")
  # C++
  set (CMAKE_CXX_COMPILER "clang++")
  set (CMAKE_CXX_FLAGS "-fstrict-aliasing -msse2 -fPIC -Wall -Wno-format-security -Wno-invalid-offsetof -std=c++0x")
  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_DEBUG_MEMORY_FLAG}")
  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_COMPILE_UTESTS_FLAG}")
  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${VISIBILITY_FLAG}")
  set (CMAKE_CXX_FLAGS_DEBUG "-g -DGBE_DEBUG=1")
  set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
  set (CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG -DGBE_DEBUG=0")
  set (CMAKE_CXX_FLAGS_RELEASE "-O2 -DNDEBUG -DGBE_DEBUG=0")
  # LLVM binutils replacements
  set (CMAKE_AR "/usr/bin/llvm-ar")
  set (CMAKE_LINKER "/usr/bin/llvm-ld")
  set (CMAKE_NM "/usr/bin/llvm-nm")
  set (CMAKE_OBJDUMP "/usr/bin/llvm-objdump")
  set (CMAKE_RANLIB "ranlib")
elseif (COMPILER STREQUAL "ICC")
  set (CMAKE_CXX_COMPILER "icpc")
  set (CMAKE_C_COMPILER "icc")
  set (CMAKE_CXX_FLAGS "-std=c++0x -wd2928 -Wall -fPIC -fstrict-aliasing -fp-model fast -xSSE2")
  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_DEBUG_MEMORY_FLAG}")
  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_COMPILE_UTESTS_FLAG}")
  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${VISIBILITY_FLAG} -Wl,-E")
  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_DEBUG_MODE_FLAG}")
  set (CMAKE_CXX_FLAGS_DEBUG "-g -O0 -DGBE_DEBUG=1")
  # These two variables were misspelled "CCMAKE_...", so the flags below were
  # never applied to the RelWithDebInfo and MinSizeRel configurations.
  set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "-g -O2 -DGBE_DEBUG=1")
  set (CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG -O2 -DGBE_DEBUG=0")
  set (CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DGBE_DEBUG=0")
  set (CMAKE_EXE_LINKER_FLAGS "")
endif ()
# Add this directory's build tree to the include path of every target
# defined here and in the sub-directories below
include_directories (${CMAKE_CURRENT_BINARY_DIR})
##############################################################
# Project source code
##############################################################
# The library itself is described in src/CMakeLists.txt
add_subdirectory (src)
Release_v0.3/backend/kernels/ 0000775 0000000 0000000 00000000000 12231421770 0016235 5 ustar 00root root 0000000 0000000 Release_v0.3/backend/kernels/compile.sh 0000775 0000000 0000000 00000000145 12231421770 0020224 0 ustar 00root root 0000000 0000000 #!/bin/bash
# Compile the source file given as $1 to LLVM bitcode, disassemble it and
# leave the textual IR in "$1.ll". llvm-dis writes "$1.o.ll" next to its
# input, which is then renamed.
# Every expansion is quoted so that paths containing spaces or glob
# characters are passed through as a single argument.
clang -emit-llvm -O3 -target nvptx -c "$1" -o "$1.o"
llvm-dis "$1.o"
rm "$1.o"
mv "$1.o.ll" "$1.ll"
Release_v0.3/backend/src/ 0000775 0000000 0000000 00000000000 12231421770 0015361 5 ustar 00root root 0000000 0000000 Release_v0.3/backend/src/.gitignore 0000664 0000000 0000000 00000000157 12231421770 0017354 0 ustar 00root root 0000000 0000000 GBEConfig.h
libgbe.so
ocl_common_defines_str.cpp
ocl_stdlib.h
ocl_stdlib.h.pch
ocl_stdlib_str.cpp
ocl_vector.h
Release_v0.3/backend/src/CMakeLists.txt 0000664 0000000 0000000 00000012661 12231421770 0020127 0 ustar 00root root 0000000 0000000 set (ocl_vector_spec_file ${GBE_SOURCE_DIR}/src/builtin_vector_proto.def)
# Generator scripts
set(ocl_gen_blob_cmd "${GBE_SOURCE_DIR}/src/update_blob_ocl_header.py")
set(ocl_gen_vector_cmd "${GBE_SOURCE_DIR}/src/gen_builtin_vector.py")

# Headers consumed when assembling ocl_stdlib.h
set(ocl_as_file "${GBE_SOURCE_DIR}/src/ocl_as.h")
set(ocl_convert_file "${GBE_SOURCE_DIR}/src/ocl_convert.h")
set(ocl_stdlib_tmpl_file "${GBE_SOURCE_DIR}/src/ocl_stdlib.tmpl.h")
set(ocl_common_header_file "${GBE_SOURCE_DIR}/src/ocl_common_defines.h")

# Files produced by the custom commands below
set(ocl_vector_file "${GBE_SOURCE_DIR}/src/ocl_vector.h")
set(ocl_blob_file "${CMAKE_CURRENT_BINARY_DIR}/ocl_stdlib.h")
set(ocl_blob_cpp_file "${GBE_SOURCE_DIR}/src/ocl_stdlib_str.cpp")

# Escaped header name echoed into the generated .cpp file
set(string_header "\\\"string\\\"")
# Build ${ocl_blob_cpp_file}: a C++ source that embeds the text of
# ${ocl_blob_file} as the string gbe::ocl_stdlib_str. The file is rewritten
# from scratch with shell redirections whenever ${ocl_blob_file} changes.
add_custom_command(
OUTPUT ${ocl_blob_cpp_file}
# Start from a clean file, then write the include line and open the namespace
COMMAND rm -rf ${ocl_blob_cpp_file}
COMMAND echo "\\\#include ${string_header}" >> ${ocl_blob_cpp_file}
COMMAND echo "namespace gbe {" >> ${ocl_blob_cpp_file}
COMMAND echo "std::string ocl_stdlib_str = " >> ${ocl_blob_cpp_file}
# Yeah!!! welcome to back slash hell
# The pipeline doubles every backslash, escapes every double quote, and has
# awk wrap each input line into a string literal that ends with \n
COMMAND cat ${ocl_blob_file} |sed 's/\\\\/\\\\\\\\/g' | sed 's/\\\"/\\\\\\\"/g' | awk '{ printf \(\"\\"%s\\\\n\\"\\n\", $$0\) }' >> ${ocl_blob_cpp_file}
# Terminate the string definition and close the namespace
COMMAND echo "\;" >> ${ocl_blob_cpp_file}
COMMAND echo "}" >> ${ocl_blob_cpp_file}
COMMAND echo "" >> ${ocl_blob_cpp_file}
DEPENDS ${ocl_blob_file})
# Ask "make clean" to also remove the two generated headers
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES "ocl_vector.h;ocl_stdlib.h")
# Generate the vector builtin header from its prototype description
add_custom_command(
  OUTPUT
    ${ocl_vector_file}
  COMMAND
    ${PYTHON_EXECUTABLE} ${ocl_gen_vector_cmd}
    ${ocl_vector_spec_file} ${ocl_vector_file}
  DEPENDS
    ${ocl_gen_vector_cmd}
    ${ocl_vector_spec_file})
# Assemble ocl_stdlib.h from its template; the remaining headers are listed
# as dependencies so the header is rebuilt when any of them changes
add_custom_command(
  OUTPUT
    ${ocl_blob_file}
  COMMAND
    ${PYTHON_EXECUTABLE} ${ocl_gen_blob_cmd}
    ${ocl_stdlib_tmpl_file} ${ocl_blob_file}
  DEPENDS
    ${ocl_gen_blob_cmd}
    ${ocl_stdlib_tmpl_file}
    ${ocl_common_header_file}
    ${ocl_vector_file}
    ${ocl_as_file}
    ${ocl_convert_file})
# Precompiled form of the assembled stdlib header
set(pch_object "${ocl_blob_file}.pch")

# generate pch object
# The clang -cc1 arguments (target triple, fp-contract switch) depend on the
# LLVM release in use
if(LLVM_VERSION_NODOT VERSION_GREATER 32)
  set(clang_cmd -cc1 -x cl -triple spir -ffp-contract=off -emit-pch)
elseif(LLVM_VERSION_NODOT VERSION_GREATER 31)
  set(clang_cmd -cc1 -x cl -triple nvptx -ffp-contract=off -emit-pch)
else()
  set(clang_cmd -cc1 -x cl -triple ptx32 -emit-pch)
endif()
# Options common to every LLVM release
list(APPEND clang_cmd -fno-builtin -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
# Rule producing the precompiled header with clang
add_custom_command(
  OUTPUT  ${pch_object}
  COMMAND rm -f ${pch_object}
  COMMAND clang ${clang_cmd} ${ocl_blob_file} -o ${pch_object}
  DEPENDS ${ocl_blob_file})

# Named target so that other targets can depend on the precompiled header
add_custom_target(pch_object DEPENDS ${pch_object})
# Source list of libgbe: either the single "blob" translation unit or the
# individual files
if(GBE_USE_BLOB)
  set(GBE_SRC
      blob.cpp
      backend/gen/gen_mesa_disasm.c)
else()
  set(GBE_SRC
      # generated OpenCL standard library
      ocl_stdlib.h
      ocl_stdlib_str.cpp # this file is auto-generated.
      # sys/
      sys/vector.hpp
      sys/hash_map.hpp
      sys/map.hpp
      sys/set.hpp
      sys/intrusive_list.hpp
      sys/intrusive_list.cpp
      sys/exception.hpp
      sys/assert.cpp
      sys/assert.hpp
      sys/alloc.cpp
      sys/alloc.hpp
      sys/mutex.cpp
      sys/mutex.hpp
      sys/platform.cpp
      sys/platform.hpp
      sys/cvar.cpp
      sys/cvar.hpp
      # ir/
      ir/context.cpp
      ir/context.hpp
      ir/profile.cpp
      ir/profile.hpp
      ir/type.cpp
      ir/type.hpp
      ir/unit.cpp
      ir/unit.hpp
      ir/constant.cpp
      ir/constant.hpp
      ir/sampler.cpp
      ir/sampler.hpp
      ir/image.cpp
      ir/image.hpp
      ir/instruction.cpp
      ir/instruction.hpp
      ir/liveness.cpp
      ir/register.cpp
      ir/register.hpp
      ir/function.cpp
      ir/function.hpp
      ir/value.cpp
      ir/value.hpp
      ir/lowering.cpp
      ir/lowering.hpp
      # backend/ (generic part)
      backend/context.cpp
      backend/context.hpp
      backend/program.cpp
      backend/program.hpp
      backend/program.h
      # llvm/
      llvm/llvm_gen_backend.cpp
      llvm/llvm_passes.cpp
      llvm/llvm_scalarize.cpp
      llvm/llvm_to_gen.cpp
      llvm/llvm_gen_backend.hpp
      llvm/llvm_gen_ocl_function.hxx
      llvm/llvm_to_gen.hpp
      # backend/ (Gen specific part)
      backend/gen/gen_mesa_disasm.c
      backend/gen_insn_selection.cpp
      backend/gen_insn_selection.hpp
      backend/gen_insn_scheduling.cpp
      backend/gen_insn_scheduling.hpp
      backend/gen_reg_allocation.cpp
      backend/gen_reg_allocation.hpp
      backend/gen_context.cpp
      backend/gen_context.hpp
      backend/gen_program.cpp
      backend/gen_program.hpp
      backend/gen_program.h
      backend/gen_defs.hpp
      backend/gen_encoder.hpp
      backend/gen_encoder.cpp)
endif()
# Search paths used by the targets defined below
include_directories(.)
link_directories(${LLVM_LIBRARY_DIRS})
include_directories(${LLVM_INCLUDE_DIRS})

# The backend shared library; it is only built once the precompiled
# header target is up to date
add_library(gbe SHARED ${GBE_SRC})
add_dependencies(gbe pch_object)
target_link_libraries(gbe
  ${DRM_INTEL_LIBRARY}
  ${DRM_LIBRARY}
  ${OPENGL_LIBRARIES}
  ${CLANG_LIBRARIES}
  ${LLVM_MODULE_LIBS}
  ${CMAKE_THREAD_LIBS_INIT}
  ${CMAKE_DL_LIBS})

# Generator executable linked against the library
link_directories(${LLVM_LIBRARY_DIR})
add_executable(gbe_bin_generater gbe_bin_generater.cpp)
target_link_libraries(gbe_bin_generater gbe)
# Installed artefacts: the library, the precompiled header and the public
# program header
install(TARGETS gbe LIBRARY DESTINATION lib)
install(FILES ${pch_object} DESTINATION lib)
install(FILES backend/program.h DESTINATION include/gen)

# Two candidate locations of the precompiled header (build tree and install
# tree), substituted into GBEConfig.h
set(PCH_OBJECT_DIR "${pch_object};${CMAKE_INSTALL_PREFIX}/lib/ocl_stdlib.h.pch")
configure_file("GBEConfig.h.in" "GBEConfig.h")
Release_v0.3/backend/src/GBEConfig.h.in 0000664 0000000 0000000 00000000304 12231421770 0017657 0 ustar 00root root 0000000 0000000 // the configured options and settings for LIBGBE
// Library version numbers, substituted by configure_file()
#define LIBGBE_VERSION_MAJOR @LIBGBE_VERSION_MAJOR@
#define LIBGBE_VERSION_MINOR @LIBGBE_VERSION_MINOR@
// Semicolon-separated candidate paths of the precompiled ocl_stdlib header
#define PCH_OBJECT_DIR "@PCH_OBJECT_DIR@"
Release_v0.3/backend/src/backend/ 0000775 0000000 0000000 00000000000 12231421770 0016750 5 ustar 00root root 0000000 0000000 Release_v0.3/backend/src/backend/context.cpp 0000664 0000000 0000000 00000061100 12231421770 0021136 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file context.cpp
* \author Benjamin Segovia
*/
#include "backend/context.hpp"
#include "backend/program.hpp"
#include "backend/gen_encoder.hpp"
#include "ir/unit.hpp"
#include "ir/function.hpp"
#include "ir/profile.hpp"
#include "ir/liveness.hpp"
#include "ir/value.hpp"
#include "ir/image.hpp"
#include "ir/sampler.hpp"
#include "sys/cvar.hpp"
#include <algorithm>
namespace gbe
{
/*! Structure that keeps track of allocation in the register file. This is
 *  actually needed by Context (and not only by GenContext) because both
 *  simulator and hardware have to deal with constant pushing which uses the
 *  register file
 *
 *  Since Gen is pretty flexible, we just maintain a free list for the
 *  register file (as a classical allocator) and coalesce blocks when required
 */
class RegisterFilePartitioner
{
public:
  RegisterFilePartitioner(void);
  ~RegisterFilePartitioner(void);
  /*! Allocate some memory in the register file. Return 0 if out-of-memory. By
   *  the way, zero is not a valid offset since r0 is always preallocated by
   *  the hardware.
   *  \param size number of bytes requested
   *  \param alignment alignment of the returned offset
   *  \param bFwd when true, the free list is scanned from its head and the
   *         left most fitting block is used (this is what constant pushing
   *         wants); when false (the default), the list is scanned from its
   *         tail and the space is taken from the end of the block
   */
  int16_t allocate(int16_t size, int16_t alignment, bool bFwd=false);
  /*! Free the given register file piece */
  void deallocate(int16_t offset);
  /*! Split an allocated block into 2 blocks. subOffset is relative to offset */
  void splitBlock(int16_t offset, int16_t subOffset);
private:
  /*! May need to make that run-time in the future */
  static const int16_t RegisterFileSize = 4*KB;
  /*! Double chained list of free spaces */
  struct Block {
    Block(int16_t offset, int16_t size) :
      prev(NULL), next(NULL), offset(offset), size(size) {}
    Block *prev, *next; //!< Previous and next free blocks
    int16_t offset;     //!< Where the free block starts
    int16_t size;       //!< Size of the free block
  };
  /*! Try to coalesce two blocks (left and right). They must be in that order.
   *  If the coalescing was done, the left block is deleted
   */
  void coalesce(Block *left, Block *right);
  /*! Head and tail of the free list */
  Block *head;
  Block *tail;
  /*! Handle free list element allocation */
  DECL_POOL(Block, blockPool);
  /*! Track allocated memory blocks (offset -> size) */
  map<int16_t, int16_t> allocatedBlocks;
  /*! Use custom allocators */
  GBE_CLASS(RegisterFilePartitioner);
};
RegisterFilePartitioner::RegisterFilePartitioner(void) {
  // r0 is always set by the HW and used at the end by EOT, so the single
  // initial free block starts right after it and spans the rest of the file
  const int16_t firstFree = GEN_REG_SIZE;
  const int16_t freeBytes = RegisterFileSize - firstFree;
  Block *whole = this->newBlock(firstFree, freeBytes);
  head = whole;
  tail = whole;
}
RegisterFilePartitioner::~RegisterFilePartitioner(void) {
  // Give every block still in the free list back to the pool
  for (Block *following = NULL; this->head != NULL; this->head = following) {
    following = this->head->next;
    this->deleteBlock(this->head);
  }
}
/*! Carve size bytes, aligned on alignment, out of the free list.
 *  - bFwd == true: walk the list from head to tail and place the allocation
 *    at the aligned start of the first block that can hold it.
 *  - bFwd == false: walk the list from tail to head and place the allocation
 *    at the end of the block (aligned downwards, assuming ALIGN rounds up --
 *    the macro is defined elsewhere).
 *  The selected free block is removed from the list; whatever remains unused
 *  on its left and right is re-inserted as new free blocks. The allocation
 *  is recorded in allocatedBlocks so that deallocate can retrieve its size.
 *  Returns the allocated offset, or 0 when no block fits.
 */
int16_t RegisterFilePartitioner::allocate(int16_t size, int16_t alignment, bool bFwd)
{
// Make it simple and just use the first block we find
Block *list = bFwd ? head : tail;
while (list) {
int16_t aligned;
int16_t spaceOnLeft;
int16_t spaceOnRight;
if(bFwd) {
// Forward scan: align the start of the block
aligned = ALIGN(list->offset, alignment);
spaceOnLeft = aligned - list->offset;
spaceOnRight = list->size - size - spaceOnLeft;
// Not enough space in this block
if (spaceOnRight < 0) {
list = list->next;
continue;
}
} else {
// Backward scan: start from the last position where size bytes still fit
int16_t unaligned = list->offset + list->size - size - (alignment-1);
if(unaligned < 0) {
list = list->prev;
continue;
}
aligned = ALIGN(unaligned, alignment); //alloc from block's tail
spaceOnLeft = aligned - list->offset;
spaceOnRight = list->size - size - spaceOnLeft;
// Not enough space in this block
if (spaceOnLeft < 0) {
list = list->prev;
continue;
}
}
// Cool we can use this block
// From here on, left and right are the neighbours that will surround the
// hole left by the removed block
Block *left = list->prev;
Block *right = list->next;
// If we left a hole on the left, create a new block
if (spaceOnLeft) {
Block *newBlock = this->newBlock(list->offset, spaceOnLeft);
if (left) {
left->next = newBlock;
newBlock->prev = left;
}
if (right) {
newBlock->next = right;
right->prev = newBlock;
}
left = newBlock;
}
// If we left a hole on the right, create a new block as well
if (spaceOnRight) {
Block *newBlock = this->newBlock(aligned + size, spaceOnRight);
if (left) {
left->next = newBlock;
newBlock->prev = left;
}
if (right) {
right->prev = newBlock;
newBlock->next = right;
}
right = newBlock;
}
// Chain both successors and predecessors when the entire block was
// allocated
if (spaceOnLeft == 0 && spaceOnRight == 0) {
if (left) left->next = right;
if (right) right->prev = left;
}
// Update the head of the free blocks
if (list == head) {
if (left)
head = left;
else if (right)
head = right;
else
head = NULL;
}
// Update the tail of the free blocks
if (list == tail) {
if (right)
tail = right;
else if (left)
tail = left;
else
tail = NULL;
}
// Free the block and check the consistency
this->deleteBlock(list);
if (head && head->next) GBE_ASSERT(head->next->prev == head);
if (tail && tail->prev) GBE_ASSERT(tail->prev->next == tail);
// Track the allocation to retrieve the size later
allocatedBlocks.insert(std::make_pair(aligned, size));
// We have a valid offset now
return aligned;
}
// No free block could satisfy the request
return 0;
}
void RegisterFilePartitioner::deallocate(int16_t offset)
{
// Retrieve the size in the allocation map
auto it = allocatedBlocks.find(offset);
GBE_ASSERT(it != allocatedBlocks.end());
const int16_t size = it->second;
// Find the two blocks where to insert the new block
Block *list = tail, *next = NULL;
while (list != NULL) {
if (list->offset < offset)
break;
next = list;
list = list->prev;
}
// Create the block and insert it
Block *newBlock = this->newBlock(offset, size);
if (list) {
GBE_ASSERT(list->offset + list->size <= offset);
list->next = newBlock;
newBlock->prev = list;
} else
this->head = newBlock; // list is NULL means newBlock should be the head.
if (next) {
GBE_ASSERT(offset + size <= next->offset);
next->prev = newBlock;
newBlock->next = next;
} else
this->tail = newBlock; // next is NULL means newBlock should be the tail.
if (list != NULL || next != NULL)
{
// Coalesce the blocks if possible
this->coalesce(list, newBlock);
this->coalesce(newBlock, next);
}
// Do not track this allocation anymore
allocatedBlocks.erase(it);
}
/*! Merge two consecutive free blocks when they are contiguous. The right
 *  block absorbs the left one, which is then deleted.
 */
void RegisterFilePartitioner::coalesce(Block *left, Block *right) {
  // A missing neighbour means there is nothing to merge
  if (left == NULL) return;
  if (right == NULL) return;
  GBE_ASSERT(left->offset < right->offset);
  GBE_ASSERT(left->next == right);
  GBE_ASSERT(right->prev == left);

  // Blocks separated by allocated space are left alone
  const bool contiguous = (left->offset + left->size == right->offset);
  if (!contiguous) return;

  // Extend the right block over the left one
  right->size += left->size;
  right->offset = left->offset;

  // Unlink the left block from the list
  Block *predecessor = left->prev;
  if (predecessor) predecessor->next = right;
  right->prev = predecessor;
  if (this->head == left)
    this->head = right;
  this->deleteBlock(left);
}
/*! Split the bookkeeping of an allocation in two, so that both parts can be
 *  deallocated independently.
 *  \param offset start of an allocation recorded in allocatedBlocks
 *  \param subOffset split point, relative to offset. If the original
 *         allocation was already split, the walk below moves on to the piece
 *         that contains the split point.
 *  Nothing is done when the split point already lies on the boundary of a
 *  recorded allocation.
 */
void RegisterFilePartitioner::splitBlock(int16_t offset, int16_t subOffset) {
  // Retrieve the size in the allocation map
  auto it = allocatedBlocks.find(offset);
  GBE_ASSERT(it != allocatedBlocks.end());
  // Skip the pieces that end before the split point
  while(subOffset > it->second) {
    subOffset -= it->second;
    offset += it->second;
    it = allocatedBlocks.find(offset);
    GBE_ASSERT(it != allocatedBlocks.end());
  }
  // The split point is the start or the end of this piece: splitting would
  // record a zero-sized allocation at offset + size, which would later hide
  // the real size of an allocation made at that offset (map::insert does not
  // overwrite an existing key)
  if(subOffset == 0 || subOffset == it->second)
    return;
  int16_t size = it->second;
  allocatedBlocks.erase(it);
  // Track the allocation to retrieve the size later
  allocatedBlocks.insert(std::make_pair(offset, subOffset));
  allocatedBlocks.insert(std::make_pair(offset + subOffset, size - subOffset));
}
/*! Round size up to the next multiple of 1024. Zero and negative sizes
 *  give 0.
 */
static int
alignScratchSize(int size){
  const int granularity = 1024;
  if (size <= 0)
    return 0;
  return ((size - 1) / granularity + 1) * granularity;
}
///////////////////////////////////////////////////////////////////////////
// Generic Context (shared by the simulator and the HW context)
///////////////////////////////////////////////////////////////////////////
// Tunable SIMD width. NOTE(review): the arguments look like
// (name, min, default, max) -- confirm against sys/cvar.hpp. The Context
// constructor compares the value with 15 before honouring the width
// requested by the function.
IVAR(OCL_SIMD_WIDTH, 8, 15, 16);
/*! Build the compilation context of the function called name in unit:
 *  runs the liveness analysis, builds the function DAG, creates the register
 *  file partitioner and selects the SIMD width.
 */
Context::Context(const ir::Unit &unit, const std::string &name) :
  unit(unit), fn(*unit.getFunction(name)), name(name), liveness(NULL), dag(NULL)
{
  GBE_ASSERT(unit.getPointerSize() == ir::POINTER_32_BITS);
  // fn is held as a const reference; the cast hands a mutable reference to
  // the liveness analysis
  this->liveness = GBE_NEW(ir::Liveness, const_cast<ir::Function&>(fn));
  this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
  this->partitioner = GBE_NEW_NO_ARG(RegisterFilePartitioner);
  // The width requested by the function is only used when it is non-zero
  // and OCL_SIMD_WIDTH still holds the value 15; otherwise the width is
  // derived from OCL_SIMD_WIDTH
  if (fn.getSimdWidth() == 0 || OCL_SIMD_WIDTH != 15)
    this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH);
  else
    this->simdWidth = fn.getSimdWidth();
  this->scratchOffset = 0;
}
/*! Release the helper objects created by the constructor, in the reverse
 *  order of their creation (partitioner, then dag, then liveness)
 */
Context::~Context(void) {
GBE_SAFE_DELETE(this->partitioner);
GBE_SAFE_DELETE(this->dag);
GBE_SAFE_DELETE(this->liveness);
}
/*! Allocate a kernel, run every preparation step on it and emit its code.
 *  Returns the kernel, or NULL when the code emission failed.
 */
Kernel *Context::compileKernel(void) {
  this->kernel = this->allocateKernel();
  this->kernel->simdWidth = this->simdWidth;

  // Preparation steps, in the order the code emission expects them
  this->buildPatchList();
  this->buildArgList();
  this->buildUsedLabels();
  this->buildJIPs();
  this->buildStack();
  this->handleSLM();

  // A failed emission discards the kernel
  const bool emitted = this->emitCode();
  if (!emitted) {
    GBE_DELETE(this->kernel);
    this->kernel = NULL;
  }
  if (this->kernel == NULL)
    return this->kernel;

  // Finalize the kernel: scratch space requirement and owning context
  this->kernel->scratchSize = alignScratchSize(this->scratchOffset);
  this->kernel->ctx = this;
  return this->kernel;
}
/*! Allocate size bytes in the register file through the partitioner. The
 *  partitioner's bFwd argument keeps its default (false). Returns the
 *  offset given by the partitioner.
 */
int16_t Context::allocate(int16_t size, int16_t alignment) {
return partitioner->allocate(size, alignment);
}
/*! Release the register file range that was allocated at offset */
void Context::deallocate(int16_t offset) { partitioner->deallocate(offset); }
/*! Split the allocation starting at offset in two at subOffset; forwarded
 *  to the partitioner
 */
void Context::splitBlock(int16_t offset, int16_t subOffset) {
partitioner->splitBlock(offset, subOffset);
}
int32_t Context::allocConstBuf(uint32_t argID) {
GBE_ASSERT(kernel->args[argID].type == GBE_ARG_CONSTANT_PTR);
//free previous
int32_t offset = kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, argID+GBE_CONSTANT_BUFFER);
if(offset >= 0)
deallocate(offset+GEN_REG_SIZE);
if(kernel->args[argID].bufSize > 0) {
//use 32 alignment here as GEN_REG_SIZE, need dynamic by type?
newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_CONSTANT_BUFFER+argID, kernel->args[argID].bufSize, 32);
}
std::sort(kernel->patches.begin(), kernel->patches.end());
offset = kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, argID+GBE_CONSTANT_BUFFER);
GBE_ASSERT(offset>=0);
kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE);
return offset + GEN_REG_SIZE;
}
uint32_t Context::allocateScratchMem(uint32_t size) {
uint32_t offset = scratchOffset;
scratchOffset += size;
return offset;
}
/*! Set the stack size of the kernel when the function uses the stack
 *  pointer
 */
void Context::buildStack(void) {
  // no stack is used if stackptr is unused
  if (dag->getUse(ir::ocl::stackptr).size() == 0)
    return;
  // The stack pointer must have been given a curbe slot already
  GBE_ASSERT(this->kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0);
  // XXX compute that in a better way
  this->kernel->stackSize = 1*KB;
}
/*! Allocate register file space for a curbe entry and register the matching
 *  patch in the kernel.
 *  \param value, subValue identify the entry
 *  \param size number of bytes to allocate
 *  \param alignment alignment of the entry; 0 means "use size"
 *  \return the register file offset of the entry
 */
uint32_t Context::newCurbeEntry(gbe_curbe_type value,
                                uint32_t subValue,
                                uint32_t size,
                                uint32_t alignment)
{
  if (alignment == 0)
    alignment = size;
  // Curbe entries are taken from the front of the free list
  const uint32_t offset = partitioner->allocate(size, alignment, true);
  GBE_ASSERT(offset >= GEN_REG_SIZE);
  // The patch stores the offset with GEN_REG_SIZE subtracted
  const uint32_t patchOffset = offset - GEN_REG_SIZE;
  kernel->patches.push_back(PatchInfo(value, subValue, patchOffset));
  // Grow the curbe so that it covers the new entry
  kernel->curbeSize = std::max(kernel->curbeSize, offset + size - GEN_REG_SIZE);
  return offset;
}
/*! Get the curbe offset of the image information identified by `key`,
 *  allocating a new curbe entry of `size` bytes (4 byte aligned) when the
 *  image set does not know the key yet.
 *
 *  The image set stores the *patch* offset (the value returned by
 *  kernel->getCurbeOffset), while this function returns that value plus
 *  GEN_REG_SIZE. Both the lookup path and the allocation path now apply the
 *  same bias; previously the lookup path returned the raw patch offset, so a
 *  second query for the same key yielded a different value than the first.
 *
 *  \param key  image index / info type pair identifying the information
 *  \param size size in bytes of the curbe entry to create on a miss
 *  \return patch offset of the entry plus GEN_REG_SIZE
 */
uint32_t Context::getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size)
{
  int32_t offset = fn.getImageSet()->getInfoOffset(key);
  if (offset >= 0)
    return offset + GEN_REG_SIZE;
  newCurbeEntry(GBE_CURBE_IMAGE_INFO, key.data, size, 4);
  // The patch list must be sorted before it can be searched
  std::sort(kernel->patches.begin(), kernel->patches.end());
  offset = kernel->getCurbeOffset(GBE_CURBE_IMAGE_INFO, key.data);
  GBE_ASSERT(offset >= 0); // XXX do we need to spill it out to bo?
  fn.getImageSet()->appendInfo(key, offset);
  return offset + GEN_REG_SIZE;
}
/*! Remember that register `reg` is associated with `offset` by adding the
 *  pair to curbeRegs
 */
void Context::insertCurbeReg(ir::Register reg, uint32_t offset) {
  const auto entry = std::make_pair(reg, offset);
  curbeRegs.insert(entry);
}
/*! Build the curbe patch list of the kernel.
 *
 *  The curbe size is reset, then curbe entries are created (in this order)
 *  for: the block IP, every kernel argument, the three local IDs, the sampler
 *  information, every special register read by an instruction, the thread
 *  number and, when the stack pointer is read, the stack buffer. Each entry
 *  is also recorded in curbeRegs. Finally the patch list is sorted and the
 *  curbe size is rounded up to GEN_REG_SIZE.
 */
void Context::buildPatchList(void) {
const uint32_t ptrSize = unit.getPointerSize() == ir::POINTER_32_BITS ? 4u : 8u;
kernel->curbeSize = 0u;
// We insert the block IP mask first
this->insertCurbeReg(ir::ocl::blockip, this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t)));
// Go over the arguments and find the related patch locations
const uint32_t argNum = fn.argNum();
for (uint32_t argID = 0u; argID < argNum; ++argID) {
const ir::FunctionArgument &arg = fn.getArg(argID);
// For pointers and values, we have nothing to do. We just push the values
if (arg.type == ir::FunctionArgument::GLOBAL_POINTER ||
arg.type == ir::FunctionArgument::LOCAL_POINTER ||
arg.type == ir::FunctionArgument::CONSTANT_POINTER ||
arg.type == ir::FunctionArgument::VALUE ||
arg.type == ir::FunctionArgument::STRUCTURE ||
arg.type == ir::FunctionArgument::IMAGE ||
arg.type == ir::FunctionArgument::SAMPLER)
this->insertCurbeReg(arg.reg, this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, arg.size, ptrSize));
}
// Already inserted registers go here
const size_t localIDSize = sizeof(uint32_t) * this->simdWidth;
insertCurbeReg(ir::ocl::lid0, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_X, 0, localIDSize));
insertCurbeReg(ir::ocl::lid1, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Y, 0, localIDSize));
insertCurbeReg(ir::ocl::lid2, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Z, 0, localIDSize));
insertCurbeReg(ir::ocl::samplerinfo, this->newCurbeEntry(GBE_CURBE_SAMPLER_INFO, 0, 32));
// Go over all the instructions and find the special register we need
// to push
// Each INSERT_REG expands to "if (reg == ...) { ... } else", so the uses
// below form one if / else-if chain that is closed by the final
// "do {} while(0);". The "continue" inside the macro applies to the loop
// over the instruction sources.
#define INSERT_REG(SPECIAL_REG, PATCH, WIDTH) \
if (reg == ir::ocl::SPECIAL_REG) { \
if (curbeRegs.find(reg) != curbeRegs.end()) continue; \
insertCurbeReg(reg, this->newCurbeEntry(GBE_CURBE_##PATCH, 0, ptrSize * WIDTH)); \
} else
bool useStackPtr = false;
fn.foreachInstruction([&](ir::Instruction &insn) {
const uint32_t srcNum = insn.getSrcNum();
for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
const ir::Register reg = insn.getSrc(srcID);
if (insn.getOpcode() == ir::OP_GET_IMAGE_INFO) {
// Only the first source is rewritten for this opcode
if (srcID != 0) continue;
const unsigned char bti = fn.getImageSet()->getIdx(insn.getSrc(srcID));
const unsigned char type = ir::cast(insn).getInfoType();;
ir::ImageInfoKey key;
key.index = bti;
key.type = type;
// Pseudo register derived from the key. It is only used as a curbeRegs key
// that remembers which real register was picked for this image information
const ir::Register imageInfo(key.data | 0x8000);
ir::Register realImageInfo;
if (curbeRegs.find(imageInfo) == curbeRegs.end()) {
// First time this key is seen: allocate the curbe entry, bind it to the
// second source of the instruction and remember that register for the key
uint32_t offset = this->getImageInfoCurbeOffset(key, 4);
realImageInfo = insn.getSrc(1);
insertCurbeReg(realImageInfo, offset);
insertCurbeReg(imageInfo, (uint32_t)realImageInfo);
} else
realImageInfo = ir::Register(curbeRegs.find(imageInfo)->second);
insn.setSrc(srcID, realImageInfo);
continue;
} else if (insn.getOpcode() == ir::OP_GET_SAMPLER_INFO) {
/* change the src to sampler information register. */
if (curbeRegs.find(ir::ocl::samplerinfo) == curbeRegs.end())
insertCurbeReg(ir::ocl::samplerinfo, this->newCurbeEntry(GBE_CURBE_SAMPLER_INFO, 0, 32));
continue;
}
if (fn.isSpecialReg(reg) == false) continue;
// Registers that already own an entry are skipped here, before the
// INSERT_REG chain performs the same test again
if (curbeRegs.find(reg) != curbeRegs.end()) continue;
if (reg == ir::ocl::stackptr) useStackPtr = true;
INSERT_REG(lsize0, LOCAL_SIZE_X, 1)
INSERT_REG(lsize1, LOCAL_SIZE_Y, 1)
INSERT_REG(lsize2, LOCAL_SIZE_Z, 1)
INSERT_REG(gsize0, GLOBAL_SIZE_X, 1)
INSERT_REG(gsize1, GLOBAL_SIZE_Y, 1)
INSERT_REG(gsize2, GLOBAL_SIZE_Z, 1)
INSERT_REG(goffset0, GLOBAL_OFFSET_X, 1)
INSERT_REG(goffset1, GLOBAL_OFFSET_Y, 1)
INSERT_REG(goffset2, GLOBAL_OFFSET_Z, 1)
INSERT_REG(workdim, WORK_DIM, 1)
INSERT_REG(numgroup0, GROUP_NUM_X, 1)
INSERT_REG(numgroup1, GROUP_NUM_Y, 1)
INSERT_REG(numgroup2, GROUP_NUM_Z, 1)
INSERT_REG(stackptr, STACK_POINTER, this->simdWidth)
do {} while(0);
}
});
#undef INSERT_REG
// Insert the number of threads
insertCurbeReg(ir::ocl::threadn, this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 0, sizeof(uint32_t)));
// Insert the stack buffer if used
if (useStackPtr)
insertCurbeReg(ir::ocl::stackptr, this->newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER, ptrSize));
// After this point the vector is immutable. Sorting it will make
// research faster
std::sort(kernel->patches.begin(), kernel->patches.end());
kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE);
}
/*! Describe every function argument in kernel->args.
 *
 *  The array holds fn.argNum() entries (it is NULL when the function takes no
 *  argument). Values and structures keep the size reported by the IR, local
 *  pointers get a size of 0 and every other kind is sized like a host pointer.
 */
void Context::buildArgList(void) {
  kernel->argNum = fn.argNum();
  if (kernel->argNum == 0)
    kernel->args = NULL;
  else
    kernel->args = GBE_NEW_ARRAY_NO_ARG(KernelArgument, kernel->argNum);

  for (uint32_t argID = 0; argID < kernel->argNum; ++argID) {
    const auto &irArg = fn.getArg(argID);
    auto &out = kernel->args[argID];
    switch (irArg.type) {
      case ir::FunctionArgument::GLOBAL_POINTER:
        out.type = GBE_ARG_GLOBAL_PTR;
        out.size = sizeof(void*);
        break;
      case ir::FunctionArgument::CONSTANT_POINTER:
        out.type = GBE_ARG_CONSTANT_PTR;
        out.size = sizeof(void*);
        break;
      case ir::FunctionArgument::LOCAL_POINTER:
        out.type = GBE_ARG_LOCAL_PTR;
        out.size = 0;
        break;
      case ir::FunctionArgument::IMAGE:
        out.type = GBE_ARG_IMAGE;
        out.size = sizeof(void*);
        break;
      case ir::FunctionArgument::SAMPLER:
        out.type = GBE_ARG_SAMPLER;
        out.size = sizeof(void*);
        break;
      case ir::FunctionArgument::VALUE:
      case ir::FunctionArgument::STRUCTURE:
        out.type = GBE_ARG_VALUE;
        out.size = irArg.size;
        break;
    }
  }
}
/*! Rebuild usedLabels from scratch: the set is cleared, then the label index
 *  carried by every OP_BRA instruction of the function is inserted. All other
 *  opcodes are ignored.
 */
void Context::buildUsedLabels(void) {
usedLabels.clear();
fn.foreachInstruction([this](const ir::Instruction &insn) {
using namespace ir;
// Only branches contribute a label
if (insn.getOpcode() != OP_BRA) return;
const LabelIndex index = cast(insn).getLabelIndex();
usedLabels.insert(index);
});
}
/*! Fill the JIPs map.
 *
 *  Three passes are made:
 *   1. for every block, record its own label and the label targeted by its
 *      last instruction when that instruction is an OP_BRA (noTarget
 *      otherwise);
 *   2. for every backward branch (target <= own label), record the label of
 *      the block that follows the branching block, keyed by the branch target;
 *   3. walk the blocks again while maintaining the set of outstanding forward
 *      targets, and associate a label index with the first instruction of a
 *      block and/or with its branch.
 */
void Context::buildJIPs(void) {
using namespace ir;
// Linearly store the branch target for each block and its own label
// fn.labelNum() is used as the "no label" marker
const LabelIndex noTarget(fn.labelNum());
vector> braTargets;
int32_t curr = 0, blockNum = fn.blockNum();
braTargets.resize(blockNum);
// If some blocks are unused we mark them as such by setting their own label
// as "invalid" (== noTarget)
for (auto &bb : braTargets) bb = std::make_pair(noTarget, noTarget);
fn.foreachBlock([&](const BasicBlock &bb) {
const LabelIndex ownLabel = bb.getLabelIndex();
const Instruction *last = bb.getLastInstruction();
if (last->getOpcode() != OP_BRA)
braTargets[curr++] = std::make_pair(ownLabel, noTarget);
else {
const BranchInstruction *bra = cast(last);
braTargets[curr++] = std::make_pair(ownLabel, bra->getLabelIndex());
}
});
// Backward jumps are special. We must insert the label of the next block
// when we hit the "DO" i.e. the target label of the backward branch (as in
// do { } while) . So, we store the bwd jumps per targets
// XXX does not use custom allocator
std::multimap bwdTargets;
for (int32_t blockID = 0; blockID < blockNum; ++blockID) {
const LabelIndex ownLabel = braTargets[blockID].first;
const LabelIndex target = braTargets[blockID].second;
if (ownLabel == noTarget) continue; // unused block
if (target == noTarget) continue; // no branch
if (target <= ownLabel) { // This is a backward jump
// Last block is just "RET". So, it cannot be the last block
GBE_ASSERT(blockID < blockNum - 1);
const LabelIndex fallThrough = braTargets[blockID+1].first;
bwdTargets.insert(std::make_pair(target, fallThrough));
}
}
// Stores the current forward targets
set fwdTargets;
// Now retraverse the blocks and figure out all JIPs
for (int32_t blockID = 0; blockID < blockNum; ++blockID) {
const LabelIndex ownLabel = braTargets[blockID].first;
const LabelIndex target = braTargets[blockID].second;
// NOTE(review): the block is looked up before the "unused block" test made
// further down; confirm that braTargets never keeps a noTarget own label here
const BasicBlock &bb = fn.getBlock(ownLabel);
const Instruction *label = bb.getFirstInstruction();
const Instruction *bra = bb.getLastInstruction();
// Expires the branches that point to us (if any)
auto it = fwdTargets.find(ownLabel);
if (it != fwdTargets.end()) fwdTargets.erase(it);
// Insert the fall through of the bwd branches that point to us if any
auto ii = bwdTargets.equal_range(ownLabel);
for (auto it = ii.first; it != ii.second; ++it)
fwdTargets.insert(it->second);
// If there is an outstanding forward branch, compute a JIP for the label
// lower_bound(LabelIndex(0)) picks the first outstanding target that is not
// less than label 0
auto lower = fwdTargets.lower_bound(LabelIndex(0));
GBE_ASSERT(label->isMemberOf() == true);
if (lower != fwdTargets.end())
JIPs.insert(std::make_pair(label, *lower));
// Handle special cases and backward branches first
if (ownLabel == noTarget) continue; // unused block
if (target == noTarget) continue; // no branch at all
GBE_ASSERT(bra->isMemberOf() == true);
if (target <= ownLabel) { // bwd branch: we always jump
JIPs.insert(std::make_pair(bra, LabelIndex(target)));
continue;
}
// This is a forward jump, register it and get the JIP
fwdTargets.insert(target);
auto jip = fwdTargets.lower_bound(LabelIndex(0));
JIPs.insert(std::make_pair(bra, *jip));
}
}
void Context::handleSLM(void) {
const bool useSLM = fn.getUseSLM();
kernel->useSLM = useSLM;
kernel->slmSize = fn.getSLMSize();
}
bool Context::isScalarReg(const ir::Register ®) const {
GBE_ASSERT(fn.getProfile() == ir::Profile::PROFILE_OCL);
if (fn.getArg(reg) != NULL) return true;
if (fn.getPushLocation(reg) != NULL) return true;
if (reg == ir::ocl::groupid0 ||
reg == ir::ocl::groupid1 ||
reg == ir::ocl::groupid2 ||
reg == ir::ocl::barrierid ||
reg == ir::ocl::threadn ||
reg == ir::ocl::numgroup0 ||
reg == ir::ocl::numgroup1 ||
reg == ir::ocl::numgroup2 ||
reg == ir::ocl::lsize0 ||
reg == ir::ocl::lsize1 ||
reg == ir::ocl::lsize2 ||
reg == ir::ocl::gsize0 ||
reg == ir::ocl::gsize1 ||
reg == ir::ocl::gsize2 ||
reg == ir::ocl::goffset0 ||
reg == ir::ocl::goffset1 ||
reg == ir::ocl::goffset2 ||
reg == ir::ocl::workdim)
return true;
return false;
}
} /* namespace gbe */
Release_v0.3/backend/src/backend/context.hpp 0000664 0000000 0000000 00000014070 12231421770 0021147 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
#ifndef __GBE_CONTEXT_HPP__
#define __GBE_CONTEXT_HPP__
#include "ir/instruction.hpp"
#include "backend/program.h"
#include "sys/set.hpp"
#include "sys/map.hpp"
#include "sys/platform.hpp"
#include
namespace gbe {
namespace ir {
class Unit; // Contains the complete program
class Function; // We compile a function into a kernel
class Liveness; // Describes liveness of each ir function register
class FunctionDAG; // Describes the instruction dependencies
} /* namespace ir */
} /* namespace gbe */
namespace gbe
{
class Kernel; // context creates Kernel
class RegisterFilePartitioner; // Partition register file for reg allocation
/*! Context is the helper structure to build the Gen ISA or simulation code
* from GenIR. It is an abstract class: emitCode and allocateKernel have to be
* provided by the derived classes
*/
class Context : public NonCopyable
{
public:
/*! Create a new context. name is the name of the function we want to
* compile
*/
Context(const ir::Unit &unit, const std::string &name);
/*! Release everything needed */
virtual ~Context(void);
/*! Compile the code */
Kernel *compileKernel(void);
/*! Tells if the label is used */
INLINE bool isLabelUsed(ir::LabelIndex index) const {
return usedLabels.contains(index);
}
/*! Get the function graph */
INLINE const ir::FunctionDAG &getFunctionDAG(void) const { return *dag; }
/*! Get the liveness information */
INLINE const ir::Liveness &getLiveness(void) const { return *liveness; }
/*! Tells if the register is used */
bool isRegUsed(const ir::Register ®) const;
/*! Indicate if a register is scalar or not */
bool isScalarReg(const ir::Register ®) const;
/*! Get the kernel we are currently compiling */
INLINE Kernel *getKernel(void) const { return this->kernel; }
/*! Get the function we are currently compiling */
INLINE const ir::Function &getFunction(void) const { return this->fn; }
/*! Get the target label index for the given instruction. The instruction
* must have a JIP (asserted), see hasJIP
*/
INLINE ir::LabelIndex getLabelIndex(const ir::Instruction *insn) const {
GBE_ASSERT(JIPs.find(insn) != JIPs.end());
return JIPs.find(insn)->second;
}
/*! Only GOTO and some LABEL instructions may have JIPs */
INLINE bool hasJIP(const ir::Instruction *insn) const {
return JIPs.find(insn) != JIPs.end();
}
/*! Allocate some memory in the register file */
int16_t allocate(int16_t size, int16_t alignment);
/*! Deallocate previously allocated memory */
void deallocate(int16_t offset);
/*! Split a block into 2 blocks, for registers that are allocated together
* but deallocated separately
*/
void splitBlock(int16_t offset, int16_t subOffset);
/*! Allocate curbe for a constant pointer argument */
int32_t allocConstBuf(uint32_t argID);
/*! Get (search or allocate if fail to find one) image info curbeOffset.*/
uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
/*! Allocate size bytes of scratch memory and return the start address */
uint32_t allocateScratchMem(uint32_t size);
/*! Preallocated curbe register set including special registers. */
map curbeRegs;
protected:
/*! Build the instruction stream. Return false if failed */
virtual bool emitCode(void) = 0;
/*! Allocate a new empty kernel (to be implemented) */
virtual Kernel *allocateKernel(void) = 0;
/*! Look if a stack is needed and allocate it */
void buildStack(void);
/*! Build the curbe patch list for the given kernel */
void buildPatchList(void);
/*! Build the list of arguments to set to launch the kernel */
void buildArgList(void);
/*! Build the sets of used labels */
void buildUsedLabels(void);
/*! Build JIPs for each branch and possibly labels. Can be different from
* the branch target due to unstructured branches
*/
void buildJIPs(void);
/*! Configure SLM use if needed */
void handleSLM(void);
/*! Record in curbeRegs that the given register is associated with
* grfOffset
*/
void insertCurbeReg(ir::Register, uint32_t grfOffset);
/*! Insert a new entry with the given size in the Curbe. Return the offset
* of the entry. An alignment of 0 means that the size is used as alignment
*/
uint32_t newCurbeEntry(gbe_curbe_type value, uint32_t subValue, uint32_t size, uint32_t alignment = 0);
/*! Provide for each branch and label the label index target */
typedef map JIPMap;
const ir::Unit &unit; //!< Unit that contains the kernel
const ir::Function &fn; //!< Function to compile
std::string name; //!< Name of the kernel to compile
Kernel *kernel; //!< Kernel we are building
ir::Liveness *liveness; //!< Liveness info for the variables
ir::FunctionDAG *dag; //!< Graph of values on the function
RegisterFilePartitioner *partitioner; //!< Handle register file partitioning
set usedLabels; //!< Set of all used labels
JIPMap JIPs; //!< Where to jump all labels/branches
uint32_t simdWidth; //!< Number of lanes per HW threads
uint32_t scratchOffset; //!< scratch slot for next scratch memory request
GBE_CLASS(Context); //!< Use custom allocators
};
} /* namespace gbe */
#endif /* __GBE_CONTEXT_HPP__ */
Release_v0.3/backend/src/backend/gen/ 0000775 0000000 0000000 00000000000 12231421770 0017521 5 ustar 00root root 0000000 0000000 Release_v0.3/backend/src/backend/gen/gen_mesa_disasm.c 0000664 0000000 0000000 00000111330 12231421770 0023002 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/*
* Copyright 2008 Keith Packard
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
* the above copyright notice appear in all copies and that both that copyright
* notice and this permission notice appear in supporting documentation, and
* that the name of the copyright holders not be used in advertising or
* publicity pertaining to distribution of the software without specific,
* written prior permission. The copyright holders make no representations
* about the suitability of this software for any purpose. It is provided "as
* is" without express or implied warranty.
*
* THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
* INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
* EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
* CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
* DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
* TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THIS SOFTWARE.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include "backend/gen_defs.hpp"
/* Mnemonic and source / destination operand counts of every known opcode,
 * indexed by the GEN_OPCODE_* value. Entries that are not listed are zero
 * initialized: their NULL name is what print_opcode reports as an invalid
 * opcode. */
static const struct {
const char *name;
int nsrc;
int ndst;
} opcode[128] = {
[GEN_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_FBH] = { .name = "fbh", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_FBL] = { .name = "fbl", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_MAD] = { .name = "mad", .nsrc = 3, .ndst = 1 },
[GEN_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_ADDC] = { .name = "addc", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_SUBB] = { .name = "subb", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
[GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
[GEN_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
[GEN_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 },
[GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
[GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 },
[GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
[GEN_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
[GEN_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
[GEN_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
[GEN_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
[GEN_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
};
/* The string tables below are meant to be printed through control(): a NULL
 * (unlisted) entry is reported as an invalid value and an empty string prints
 * nothing. */

/* Suffix printed for each GEN_CONDITIONAL_* value */
static const char *conditional_modifier[16] = {
[GEN_CONDITIONAL_NONE] = "",
[GEN_CONDITIONAL_Z] = ".e",
[GEN_CONDITIONAL_NZ] = ".ne",
[GEN_CONDITIONAL_G] = ".g",
[GEN_CONDITIONAL_GE] = ".ge",
[GEN_CONDITIONAL_L] = ".l",
[GEN_CONDITIONAL_LE] = ".le",
[GEN_CONDITIONAL_R] = ".r",
[GEN_CONDITIONAL_O] = ".o",
[GEN_CONDITIONAL_U] = ".u",
};
/* Prefix printed in front of a source operand when its negate bit is set */
static const char *negate[2] = {
[0] = "",
[1] = "-",
};
/* Prefix printed in front of a source operand when its abs bit is set */
static const char *_abs[2] = {
[0] = "",
[1] = "(abs)",
};
/* Text printed for each encoded vertical stride of a source region. Encodings
 * 7 to 14 have no entry and are reported as invalid by control() */
static const char *vert_stride[16] = {
[0] = "0",
[1] = "1",
[2] = "2",
[3] = "4",
[4] = "8",
[5] = "16",
[6] = "32",
[15] = "VxH",
};
/* Text printed for each encoded width of a source region (encodings 5 to 7
 * have no entry) */
static const char *width[8] = {
[0] = "1",
[1] = "2",
[2] = "4",
[3] = "8",
[4] = "16",
};
/* Text printed for each encoded horizontal stride */
static const char *horiz_stride[4] = {
[0] = "0",
[1] = "1",
[2] = "2",
[3] = "4"
};
/* Channel letters used to print an align16 swizzle */
static const char *chan_sel[4] = {
[0] = "x",
[1] = "y",
[2] = "z",
[3] = "w",
};
/* Printable names of single instruction fields, indexed by the raw field
 * value. Unlisted entries are NULL. The code that prints these tables is not
 * part of this section of the file. */

/* Debug control */
static const char *debug_ctrl[2] = {
[0] = "",
[1] = ".breakpoint"
};
/* Saturation */
static const char *saturate[2] = {
[0] = "",
[1] = ".sat"
};
/* Accumulator write control */
static const char *accwr[2] = {
[0] = "",
[1] = "AccWrEnable"
};
/* Write enable control */
static const char *wectrl[2] = {
[0] = "WE_normal",
[1] = "WE_all"
};
/* Execution size (encodings 6 and 7 have no entry) */
static const char *exec_size[8] = {
[0] = "1",
[1] = "2",
[2] = "4",
[3] = "8",
[4] = "16",
[5] = "32"
};
/* Predicate inversion */
static const char *pred_inv[2] = {
[0] = "+",
[1] = "-"
};
/* Predicate control names for the align16 flavour (entry 0 is unlisted) */
static const char *pred_ctrl_align16[16] = {
[1] = "",
[2] = ".x",
[3] = ".y",
[4] = ".z",
[5] = ".w",
[6] = ".any4h",
[7] = ".all4h",
};
/* Predicate control names for the align1 flavour (entry 0 is unlisted) */
static const char *pred_ctrl_align1[16] = {
[1] = "",
[2] = ".anyv",
[3] = ".allv",
[4] = ".any2h",
[5] = ".all2h",
[6] = ".any4h",
[7] = ".all4h",
[8] = ".any8h",
[9] = ".all8h",
[10] = ".any16h",
[11] = ".all16h",
};
/* Thread control (only values 0 and 2 are listed) */
static const char *thread_ctrl[4] = {
[0] = "",
[2] = "switch"
};
/* Dependency control */
static const char *dep_ctrl[4] = {
[0] = "",
[1] = "NoDDClr",
[2] = "NoDDChk",
[3] = "NoDDClr,NoDDChk",
};
/* Mask control (only values 0 and 1 are listed) */
static const char *mask_ctrl[4] = {
[0] = "",
[1] = "nomask",
};
/* Access mode */
static const char *access_mode[2] = {
[0] = "align1",
[1] = "align16",
};
/* Type suffix printed after an operand, indexed by the encoded register type */
static const char *reg_encoding[8] = {
[0] = "UD",
[1] = "D",
[2] = "UW",
[3] = "W",
[4] = "UB",
[5] = "B",
[6] = "DF",
[7] = "F"
};
/* Size in bytes of each register type, same indexing as reg_encoding. It is
 * used to turn a sub-register byte offset into an element index when printing.
 * NOTE(review): this table is neither static nor const, unlike its
 * neighbours; confirm whether another file refers to it */
int reg_type_size[8] = {
[0] = 4,
[1] = 4,
[2] = 2,
[3] = 2,
[4] = 1,
[5] = 1,
[6] = 8,
[7] = 4
};
/* Prefix printed in front of a register number, indexed by register file */
static const char *reg_file[4] = {
[0] = "A",
[1] = "g",
[2] = "m",
[3] = "imm",
};
/* Text printed for each 4 bit destination write mask; the full mask (0xf)
 * prints nothing */
static const char *writemask[16] = {
[0x0] = ".",
[0x1] = ".x",
[0x2] = ".y",
[0x3] = ".xy",
[0x4] = ".z",
[0x5] = ".xz",
[0x6] = ".yz",
[0x7] = ".xyz",
[0x8] = ".w",
[0x9] = ".xw",
[0xa] = ".yw",
[0xb] = ".xyw",
[0xc] = ".zw",
[0xd] = ".xzw",
[0xe] = ".yzw",
[0xf] = "",
};
/* End of thread marker */
static const char *end_of_thread[2] = {
[0] = "",
[1] = "EOT"
};
/* Printable name of each shared function, indexed by its SFID value. Unlisted
 * entries are NULL */
static const char *target_function_gen6[16] = {
[GEN_SFID_NULL] = "null",
[GEN_SFID_MATH] = "math",
[GEN_SFID_SAMPLER] = "sampler",
[GEN_SFID_MESSAGE_GATEWAY] = "gateway",
[GEN_SFID_URB] = "urb",
[GEN_SFID_THREAD_SPAWNER] = "thread_spawner",
[GEN6_SFID_DATAPORT_SAMPLER_CACHE] = "sampler",
[GEN6_SFID_DATAPORT_RENDER_CACHE] = "render",
[GEN6_SFID_DATAPORT_CONSTANT_CACHE] = "const",
[GEN_SFID_DATAPORT_DATA_CACHE] = "data"
};
/* Printable name of each message gateway sub function */
static const char *gateway_sub_function[8] = {
[0] = "open gateway",
[1] = "close gateway",
[2] = "forward gateway",
[3] = "get time stamp",
[4] = "barrier",
[5] = "update gateway state",
[6] = "MMIO R/W",
[7] = "reserved"
};
/* Printable name of each math function, indexed by GEN_MATH_FUNCTION_* */
static const char *math_function[16] = {
[GEN_MATH_FUNCTION_INV] = "inv",
[GEN_MATH_FUNCTION_LOG] = "log",
[GEN_MATH_FUNCTION_EXP] = "exp",
[GEN_MATH_FUNCTION_SQRT] = "sqrt",
[GEN_MATH_FUNCTION_RSQ] = "rsq",
[GEN_MATH_FUNCTION_SIN] = "sin",
[GEN_MATH_FUNCTION_COS] = "cos",
[GEN_MATH_FUNCTION_FDIV] = "fdiv",
[GEN_MATH_FUNCTION_POW] = "pow",
[GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod",
[GEN_MATH_FUNCTION_INT_DIV_QUOTIENT] = "intdiv",
[GEN_MATH_FUNCTION_INT_DIV_REMAINDER] = "intmod",
};
/* One bit options printed along with a math message */
static const char *math_saturate[2] = {
[0] = "",
[1] = "sat"
};
static const char *math_signed[2] = {
[0] = "",
[1] = "signed"
};
static const char *math_scalar[2] = {
[0] = "",
[1] = "scalar"
};
static const char *math_precision[2] = {
[0] = "",
[1] = "partial_precision"
};
/* Printable names of the data port (data cache) message fields. The tables
 * declared with [] are sized by their initializers, so they only cover the
 * values listed here; the code that indexes them is not part of this section
 * of the file. */

/* SIMD mode (3 entries) */
static const char *data_port_data_cache_simd_mode[] = {
"SIMD4x2",
"SIMD16",
"SIMD8",
};
/* Message category (2 entries) */
static const char *data_port_data_cache_category[] = {
"legacy",
"scratch",
};
/* Scratch block size (4 entries) */
static const char *data_port_scratch_block_size[] = {
"1 register",
"2 registers",
"Reserve",
"4 registers",
};
/* Scratch invalidate-after-read flag (2 entries) */
static const char *data_port_scratch_invalidate[] = {
"no invalidate",
"invalidate cache line",
};
/* Scratch channel mode (2 entries) */
static const char *data_port_scratch_channel_mode[] = {
"Oword",
"Dword",
};
/* Scratch message type (2 entries) */
static const char *data_port_scratch_msg_type[] = {
"Scratch Read",
"Scratch Write",
};
/* Legacy data cache message type. Value 9 is not listed and is therefore
 * NULL; the table ends at value 13 */
static const char *data_port_data_cache_msg_type[] = {
[0] = "OWord Block Read",
[1] = "Unaligned OWord Block Read",
[2] = "OWord Dual Block Read",
[3] = "DWord Scattered Read",
[4] = "Byte Scattered Read",
[5] = "Untyped Surface Read",
[6] = "Untyped Atomic Operation",
[7] = "Memory Fence",
[8] = "OWord Block Write",
[10] = "OWord Dual Block Write",
[11] = "DWord Scattered Write",
[12] = "Byte Scattered Write",
[13] = "Untyped Surface Write",
};
/* Number of characters written on the current output line. It is advanced by
 * string() and reset by newline() */
static int column;

/* Write `string` to `file` and advance `column` by its length.
 * Always returns 0 */
static int string (FILE *file, const char *string)
{
  const size_t length = strlen (string);

  fputs (string, file);
  column += length;
  return 0;
}
/* printf-like front end of string(): the arguments are formatted into a local
 * 1024 byte buffer, which is then written with string() so that `column`
 * stays up to date. Always returns 0 */
static int format (FILE *f, const char *format, ...)
{
  char text[1024];
  va_list ap;

  va_start (ap, format);
  vsnprintf (text, sizeof (text) - 1, format, ap);
  va_end (ap);
  return string (f, text);
}
/* End the current output line and reset the column counter.
 * Always returns 0 */
static int newline (FILE *f)
{
  fputc ('\n', f);
  column = 0;
  return 0;
}
/* Write spaces until column `c` is reached. At least one space is always
 * written, even when the current column is already at or past `c`.
 * Always returns 0 */
static int pad (FILE *f, int c)
{
  string (f, " ");
  while (column < c)
    string (f, " ");
  return 0;
}
/* Print the ".f<nr>.<subnr>" flag register suffix. Nothing is printed when
 * both numbers are zero. Returns 0 */
static int flag_reg (FILE *file, const int flag_nr, const int flag_sub_reg_nr)
{
  if (flag_nr == 0 && flag_sub_reg_nr == 0)
    return 0;
  return format (file, ".f%d.%d", flag_nr, flag_sub_reg_nr);
}
/* Print the name of field value `id` taken from the string table `ctrl`.
 *
 *   file  - output stream
 *   name  - field name, only used in the error message
 *   ctrl  - string table indexed by `id`; the caller must make sure that `id`
 *           is inside the table
 *   id    - raw value of the field
 *   space - optional (may be NULL) separator state: when it points to a non
 *           zero value a blank is written before the name, and it is set to 1
 *           once a non empty name has been written
 *
 * A NULL table entry is an invalid value: a diagnostic is written and 1 is
 * returned. An empty entry prints nothing. Returns 0 otherwise.
 *
 * The diagnostic goes through format() (like the one of print_opcode) so that
 * `column` keeps tracking what was really written, and `id` is printed as the
 * unsigned value it is. */
static int control (FILE *file, const char *name, const char *ctrl[], uint32_t id, int *space)
{
  if (!ctrl[id]) {
    format (file, "*** invalid %s value %u ", name, id);
    return 1;
  }
  if (ctrl[id][0])
  {
    if (space && *space)
      string (file, " ");
    string (file, ctrl[id]);
    if (space)
      *space = 1;
  }
  return 0;
}
static int print_opcode (FILE *file, int id)
{
if (!opcode[id].name) {
format (file, "*** invalid opcode value %d ", id);
return 1;
}
string (file, opcode[id].name);
return 0;
}
/* Print the name of register `_reg_nr` of register file `_reg_file`.
 *
 * Registers of the architecture register file are named after the high nibble
 * of their number (a, acc, f, mask, ...) followed by the low nibble. "null"
 * and "ip" are printed bare and make the function return -1, which callers
 * use to stop decorating the operand. Any other register file prints the file
 * prefix followed by the register number.
 *
 * Returns -1 for null / ip, 1 when the register file is invalid, 0 otherwise */
static int reg (FILE *file, uint32_t _reg_file, uint32_t _reg_nr)
{
  const uint32_t arf_index = _reg_nr & 0x0f;
  int err = 0;

  if (_reg_file != GEN_ARCHITECTURE_REGISTER_FILE) {
    err |= control (file, "src reg file", reg_file, _reg_file, NULL);
    format (file, "%d", _reg_nr);
    return err;
  }

  switch (_reg_nr & 0xf0) {
    /* Registers that are printed without any index */
    case GEN_ARF_NULL:
      string (file, "null");
      return -1;
    case GEN_ARF_IP:
      string (file, "ip");
      return -1;
    /* Indexed registers */
    case GEN_ARF_ADDRESS:
      format (file, "a%d", arf_index);
      break;
    case GEN_ARF_ACCUMULATOR:
      format (file, "acc%d", arf_index);
      break;
    case GEN_ARF_FLAG:
      format (file, "f%d", arf_index);
      break;
    case GEN_ARF_MASK:
      format (file, "mask%d", arf_index);
      break;
    case GEN_ARF_MASK_STACK:
      format (file, "msd%d", arf_index);
      break;
    case GEN_ARF_STATE:
      format (file, "sr%d", arf_index);
      break;
    case GEN_ARF_CONTROL:
      format (file, "cr%d", arf_index);
      break;
    case GEN_ARF_NOTIFICATION_COUNT:
      format (file, "n%d", arf_index);
      break;
    default:
      format (file, "ARF%d", _reg_nr);
      break;
  }
  return err;
}
/* Print the destination operand of a one or two source instruction.
 *
 * Align1 supports direct ("g12.3<1>F") and register indirect ("g[a0.1 16]<1>F")
 * addressing. Align16 only supports direct addressing and prints a write mask
 * in place of the horizontal stride.
 *
 * Returns 0 on success and a non zero value when one of the fields could not
 * be decoded or when the addressing mode is not supported. The function used
 * to accumulate that status and then return 0 unconditionally, which hid
 * every decoding error from the caller. A "null" or "ip" destination is
 * printed bare and is a success. */
static int dest (FILE *file, const struct GenInstruction *inst)
{
  int err = 0;

  if (inst->header.access_mode == GEN_ALIGN_1)
  {
    if (inst->bits1.da1.dest_address_mode == GEN_ADDRESS_DIRECT)
    {
      err |= reg (file, inst->bits1.da1.dest_reg_file, inst->bits1.da1.dest_reg_nr);
      /* null / ip: nothing else to print */
      if (err == -1)
        return 0;
      /* The sub register is encoded in bytes but printed in elements */
      if (inst->bits1.da1.dest_subreg_nr)
        format (file, ".%d", inst->bits1.da1.dest_subreg_nr /
                             reg_type_size[inst->bits1.da1.dest_reg_type]);
      format (file, "<%s>", horiz_stride[inst->bits1.da1.dest_horiz_stride]);
      err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da1.dest_reg_type, NULL);
    }
    else
    {
      string (file, "g[a0");
      if (inst->bits1.ia1.dest_subreg_nr)
        format (file, ".%d", inst->bits1.ia1.dest_subreg_nr /
                             reg_type_size[inst->bits1.ia1.dest_reg_type]);
      if (inst->bits1.ia1.dest_indirect_offset)
        format (file, " %d", inst->bits1.ia1.dest_indirect_offset);
      string (file, "]");
      format (file, "<%s>", horiz_stride[inst->bits1.ia1.dest_horiz_stride]);
      err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.ia1.dest_reg_type, NULL);
    }
  }
  else
  {
    if (inst->bits1.da16.dest_address_mode == GEN_ADDRESS_DIRECT)
    {
      err |= reg (file, inst->bits1.da16.dest_reg_file, inst->bits1.da16.dest_reg_nr);
      /* null / ip: nothing else to print */
      if (err == -1)
        return 0;
      if (inst->bits1.da16.dest_subreg_nr)
        format (file, ".%d", inst->bits1.da16.dest_subreg_nr /
                             reg_type_size[inst->bits1.da16.dest_reg_type]);
      string (file, "<1>");
      err |= control (file, "writemask", writemask, inst->bits1.da16.dest_writemask, NULL);
      err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da16.dest_reg_type, NULL);
    }
    else
    {
      err = 1;
      string (file, "Indirect align16 address mode not supported");
    }
  }
  return err;
}
/* Print the destination operand of a three source instruction. It always
 * lives in the general register file and is always printed with the float
 * type.
 *
 * Returns 0 on success and a non zero value when a field could not be
 * decoded. The accumulated status used to be dropped (the function returned 0
 * unconditionally). */
static int dest_3src (FILE *file, const struct GenInstruction *inst)
{
  int err = 0;
  /* Not named reg_file, so that the string table of that name stays visible */
  const uint32_t dest_file = GEN_GENERAL_REGISTER_FILE;

  err |= reg (file, dest_file, inst->bits1.da3src.dest_reg_nr);
  if (err == -1)
    return 0;
  if (inst->bits1.da3src.dest_subreg_nr)
    format (file, ".%d", inst->bits1.da3src.dest_subreg_nr);
  string (file, "<1>");
  err |= control (file, "writemask", writemask, inst->bits1.da3src.dest_writemask, NULL);
  err |= control (file, "dest reg encoding", reg_encoding, GEN_TYPE_F, NULL);
  return err;
}
/* Print an align1 source region as "<vstride,width,hstride>". Each field is
 * given as its raw encoding and translated through the matching string table.
 * Returns non zero when one of the encodings has no entry in its table */
static int src_align1_region (FILE *file,
                              uint32_t _vert_stride, uint32_t _width, uint32_t _horiz_stride)
{
  int status;

  string (file, "<");
  status = control (file, "vert stride", vert_stride, _vert_stride, NULL);
  string (file, ",");
  status |= control (file, "width", width, _width, NULL);
  string (file, ",");
  status |= control (file, "horiz_stride", horiz_stride, _horiz_stride, NULL);
  string (file, ">");
  return status;
}
/* Print a directly addressed align1 source operand: optional negate / abs
 * modifiers, register name, optional sub register (in elements), region and
 * type.
 *
 * Returns 0 on success, non zero when a field could not be decoded. A "null"
 * or "ip" register is printed bare and is a success. The status of the region
 * printer is now part of the result; it used to be ignored. */
static int src_da1 (FILE *file, uint32_t type, uint32_t _reg_file,
                    uint32_t _vert_stride, uint32_t _width, uint32_t _horiz_stride,
                    uint32_t reg_num, uint32_t sub_reg_num, uint32_t __abs, uint32_t _negate)
{
  int err = 0;

  err |= control (file, "negate", negate, _negate, NULL);
  err |= control (file, "abs", _abs, __abs, NULL);
  err |= reg (file, _reg_file, reg_num);
  if (err == -1)
    return 0;
  if (sub_reg_num)
    format (file, ".%d", sub_reg_num / reg_type_size[type]); /* use formal style like spec */
  err |= src_align1_region (file, _vert_stride, _width, _horiz_stride);
  err |= control (file, "src reg encoding", reg_encoding, type, NULL);
  return err;
}
/*
 * Print an indirect-addressed align1 source operand in the form
 * "g[a0.<subreg> <imm>]<region><type>", preceded by negate/abs modifiers.
 *
 * The address sub-register and the immediate offset are only printed when
 * they are non-zero. _reg_file and _addr_mode are accepted but not used.
 *
 * Returns the OR of the status codes reported by the helpers.
 */
static int src_ia1 (FILE *file,
                    uint32_t type,
                    uint32_t _reg_file,
                    int32_t _addr_imm,
                    uint32_t _addr_subreg_nr,
                    uint32_t _negate,
                    uint32_t __abs,
                    uint32_t _addr_mode,
                    uint32_t _horiz_stride,
                    uint32_t _width,
                    uint32_t _vert_stride)
{
    int err = 0;

    err |= control (file, "negate", negate, _negate, NULL);
    err |= control (file, "abs", _abs, __abs, NULL);

    string (file, "g[a0");
    if (_addr_subreg_nr)
        format (file, ".%d", _addr_subreg_nr);
    if (_addr_imm)
        format (file, " %d", _addr_imm);
    string (file, "]");

    /* Keep the region status: it used to be silently discarded. */
    err |= src_align1_region (file, _vert_stride, _width, _horiz_stride);
    err |= control (file, "src reg encoding", reg_encoding, type, NULL);
    return err;
}
/*
 * Print a direct-addressed align16 source operand:
 * negate/abs modifiers, register, optional sub-register, "<vert,4,1>"
 * region, optional swizzle and finally the register type.
 *
 * Returns 0 on success (including the early exit taken when reg() yields
 * -1), otherwise the OR of the status codes reported by control().
 */
static int src_da16 (FILE *file,
                     uint32_t _reg_type,
                     uint32_t _reg_file,
                     uint32_t _vert_stride,
                     uint32_t _reg_nr,
                     uint32_t _subreg_nr,
                     uint32_t __abs,
                     uint32_t _negate,
                     uint32_t swz_x,
                     uint32_t swz_y,
                     uint32_t swz_z,
                     uint32_t swz_w)
{
    /* The swizzle is the identity when every slot selects its own channel. */
    const int is_identity = (swz_x == GEN_CHANNEL_X &&
                             swz_y == GEN_CHANNEL_Y &&
                             swz_z == GEN_CHANNEL_Z &&
                             swz_w == GEN_CHANNEL_W);
    /* It is a broadcast when all four slots select the same channel. */
    const int is_broadcast = (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w);
    int status = 0;

    status |= control (file, "negate", negate, _negate, NULL);
    status |= control (file, "abs", _abs, __abs, NULL);
    status |= reg (file, _reg_file, _reg_nr);
    if (status == -1)
        return 0;

    /* bit4 for subreg number byte addressing. Make this same meaning as
       in da1 case, so output looks consistent. */
    if (_subreg_nr)
        format (file, ".%d", 16 / reg_type_size[_reg_type]);

    string (file, "<");
    status |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
    string (file, ",4,1>");

    /*
     * Swizzle display:
     *   identity  - nothing printed
     *   broadcast - "." followed by the single channel
     *   otherwise - "." followed by all four channels
     */
    if (!is_identity) {
        string (file, ".");
        status |= control (file, "channel select", chan_sel, swz_x, NULL);
        if (!is_broadcast) {
            status |= control (file, "channel select", chan_sel, swz_y, NULL);
            status |= control (file, "channel select", chan_sel, swz_z, NULL);
            status |= control (file, "channel select", chan_sel, swz_w, NULL);
        }
    }

    status |= control (file, "src da16 reg type", reg_encoding, _reg_type, NULL);
    return status;
}
static int src0_3src (FILE *file, const struct GenInstruction *inst)
{
int err = 0;
uint32_t swz_x = (inst->bits2.da3src.src0_swizzle >> 0) & 0x3;
uint32_t swz_y = (inst->bits2.da3src.src0_swizzle >> 2) & 0x3;
uint32_t swz_z = (inst->bits2.da3src.src0_swizzle >> 4) & 0x3;
uint32_t swz_w = (inst->bits2.da3src.src0_swizzle >> 6) & 0x3;
err |= control (file, "negate", negate, inst->bits1.da3src.src0_negate, NULL);
err |= control (file, "abs", _abs, inst->bits1.da3src.src0_abs, NULL);
err |= reg (file, GEN_GENERAL_REGISTER_FILE, inst->bits2.da3src.src0_reg_nr);
if (err == -1)
return 0;
if (inst->bits2.da3src.src0_subreg_nr)
format (file, ".%d", inst->bits2.da3src.src0_subreg_nr);
string (file, "<4,1,1>");
err |= control (file, "src da16 reg type", reg_encoding,
GEN_TYPE_F, NULL);
/*
* Three kinds of swizzle display:
* identity - nothing printed
* 1->all - print the single channel
* 1->1 - print the mapping
*/
if (swz_x == GEN_CHANNEL_X &&
swz_y == GEN_CHANNEL_Y &&
swz_z == GEN_CHANNEL_Z &&
swz_w == GEN_CHANNEL_W)
{
;
}
else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
{
string (file, ".");
err |= control (file, "channel select", chan_sel, swz_x, NULL);
}
else
{
string (file, ".");
err |= control (file, "channel select", chan_sel, swz_x, NULL);
err |= control (file, "channel select", chan_sel, swz_y, NULL);
err |= control (file, "channel select", chan_sel, swz_z, NULL);
err |= control (file, "channel select", chan_sel, swz_w, NULL);
}
return err;
}
static int src1_3src (FILE *file, const struct GenInstruction *inst)
{
int err = 0;
uint32_t swz_x = (inst->bits2.da3src.src1_swizzle >> 0) & 0x3;
uint32_t swz_y = (inst->bits2.da3src.src1_swizzle >> 2) & 0x3;
uint32_t swz_z = (inst->bits2.da3src.src1_swizzle >> 4) & 0x3;
uint32_t swz_w = (inst->bits2.da3src.src1_swizzle >> 6) & 0x3;
uint32_t src1_subreg_nr = (inst->bits2.da3src.src1_subreg_nr_low |
(inst->bits3.da3src.src1_subreg_nr_high << 2));
err |= control (file, "negate", negate, inst->bits1.da3src.src1_negate,
NULL);
err |= control (file, "abs", _abs, inst->bits1.da3src.src1_abs, NULL);
err |= reg (file, GEN_GENERAL_REGISTER_FILE,
inst->bits3.da3src.src1_reg_nr);
if (err == -1)
return 0;
if (src1_subreg_nr)
format (file, ".%d", src1_subreg_nr);
string (file, "<4,1,1>");
err |= control (file, "src da16 reg type", reg_encoding,
GEN_TYPE_F, NULL);
/*
* Three kinds of swizzle display:
* identity - nothing printed
* 1->all - print the single channel
* 1->1 - print the mapping
*/
if (swz_x == GEN_CHANNEL_X &&
swz_y == GEN_CHANNEL_Y &&
swz_z == GEN_CHANNEL_Z &&
swz_w == GEN_CHANNEL_W)
{
;
}
else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
{
string (file, ".");
err |= control (file, "channel select", chan_sel, swz_x, NULL);
}
else
{
string (file, ".");
err |= control (file, "channel select", chan_sel, swz_x, NULL);
err |= control (file, "channel select", chan_sel, swz_y, NULL);
err |= control (file, "channel select", chan_sel, swz_z, NULL);
err |= control (file, "channel select", chan_sel, swz_w, NULL);
}
return err;
}
static int src2_3src (FILE *file, const struct GenInstruction *inst)
{
int err = 0;
uint32_t swz_x = (inst->bits3.da3src.src2_swizzle >> 0) & 0x3;
uint32_t swz_y = (inst->bits3.da3src.src2_swizzle >> 2) & 0x3;
uint32_t swz_z = (inst->bits3.da3src.src2_swizzle >> 4) & 0x3;
uint32_t swz_w = (inst->bits3.da3src.src2_swizzle >> 6) & 0x3;
err |= control (file, "negate", negate, inst->bits1.da3src.src2_negate,
NULL);
err |= control (file, "abs", _abs, inst->bits1.da3src.src2_abs, NULL);
err |= reg (file, GEN_GENERAL_REGISTER_FILE,
inst->bits3.da3src.src2_reg_nr);
if (err == -1)
return 0;
if (inst->bits3.da3src.src2_subreg_nr)
format (file, ".%d", inst->bits3.da3src.src2_subreg_nr);
string (file, "<4,1,1>");
err |= control (file, "src da16 reg type", reg_encoding,
GEN_TYPE_F, NULL);
/*
* Three kinds of swizzle display:
* identity - nothing printed
* 1->all - print the single channel
* 1->1 - print the mapping
*/
if (swz_x == GEN_CHANNEL_X &&
swz_y == GEN_CHANNEL_Y &&
swz_z == GEN_CHANNEL_Z &&
swz_w == GEN_CHANNEL_W)
{
;
}
else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
{
string (file, ".");
err |= control (file, "channel select", chan_sel, swz_x, NULL);
}
else
{
string (file, ".");
err |= control (file, "channel select", chan_sel, swz_x, NULL);
err |= control (file, "channel select", chan_sel, swz_y, NULL);
err |= control (file, "channel select", chan_sel, swz_z, NULL);
err |= control (file, "channel select", chan_sel, swz_w, NULL);
}
return err;
}
/*
 * Print the immediate operand stored in inst->bits3, formatted according
 * to the register type `type`.
 *
 * Types without a case below print nothing. Always returns 0.
 */
static int imm (FILE *file, uint32_t type, const struct GenInstruction *inst) {
    switch (type) {
    case GEN_TYPE_UD:
        format (file, "0x%xUD", inst->bits3.ud);
        break;
    case GEN_TYPE_D:
        format (file, "%dD", inst->bits3.d);
        break;
    case GEN_TYPE_UW:
        format (file, "0x%xUW", (uint16_t) inst->bits3.ud);
        break;
    case GEN_TYPE_W:
        format (file, "%dW", (int16_t) inst->bits3.d);
        break;
    case GEN_TYPE_UB:
        /* Truncate through an unsigned type: a signed int8_t is
           sign-extended when promoted to int, so "%x" would print
           0xffffffXX for any byte value of 0x80 or above. */
        format (file, "0x%xUB", (uint8_t) inst->bits3.ud);
        break;
    case GEN_TYPE_VF:
        format (file, "Vector Float");
        break;
    case GEN_TYPE_V:
        format (file, "0x%xV", inst->bits3.ud);
        break;
    case GEN_TYPE_F:
        format (file, "%-gF", inst->bits3.f);
        break;
    default:
        /* No textual form for the remaining types. */
        break;
    }
    return 0;
}
/*
 * Print source operand 0 of a one- or two-source instruction.
 *
 * Dispatches on the operand kind: immediate, align1 direct, align1
 * indirect or align16 direct. Align16 indirect addressing is reported as
 * unsupported and yields 1; every other path returns the helper's status.
 */
static int src0 (FILE *file, const struct GenInstruction *inst)
{
    if (inst->bits1.da1.src0_reg_file == GEN_IMMEDIATE_VALUE)
        return imm (file, inst->bits1.da1.src0_reg_type, inst);

    if (inst->header.access_mode != GEN_ALIGN_1) {
        /* Align16 path. */
        if (inst->bits2.da16.src0_address_mode != GEN_ADDRESS_DIRECT) {
            string (file, "Indirect align16 address mode not supported");
            return 1;
        }
        return src_da16 (file,
                         inst->bits1.da16.src0_reg_type,
                         inst->bits1.da16.src0_reg_file,
                         inst->bits2.da16.src0_vert_stride,
                         inst->bits2.da16.src0_reg_nr,
                         inst->bits2.da16.src0_subreg_nr,
                         inst->bits2.da16.src0_abs,
                         inst->bits2.da16.src0_negate,
                         inst->bits2.da16.src0_swz_x,
                         inst->bits2.da16.src0_swz_y,
                         inst->bits2.da16.src0_swz_z,
                         inst->bits2.da16.src0_swz_w);
    }

    /* Align1 path. */
    if (inst->bits2.da1.src0_address_mode != GEN_ADDRESS_DIRECT)
        return src_ia1 (file,
                        inst->bits1.ia1.src0_reg_type,
                        inst->bits1.ia1.src0_reg_file,
                        inst->bits2.ia1.src0_indirect_offset,
                        inst->bits2.ia1.src0_subreg_nr,
                        inst->bits2.ia1.src0_negate,
                        inst->bits2.ia1.src0_abs,
                        inst->bits2.ia1.src0_address_mode,
                        inst->bits2.ia1.src0_horiz_stride,
                        inst->bits2.ia1.src0_width,
                        inst->bits2.ia1.src0_vert_stride);

    return src_da1 (file,
                    inst->bits1.da1.src0_reg_type,
                    inst->bits1.da1.src0_reg_file,
                    inst->bits2.da1.src0_vert_stride,
                    inst->bits2.da1.src0_width,
                    inst->bits2.da1.src0_horiz_stride,
                    inst->bits2.da1.src0_reg_nr,
                    inst->bits2.da1.src0_subreg_nr,
                    inst->bits2.da1.src0_abs,
                    inst->bits2.da1.src0_negate);
}
/*
 * Print source operand 1 of a two-source instruction.
 *
 * Dispatches on the operand kind: immediate, align1 direct, align1
 * indirect or align16 direct. Align16 indirect addressing is reported as
 * unsupported and yields 1; every other path returns the helper's status.
 */
static int src1 (FILE *file, const struct GenInstruction *inst)
{
    if (inst->bits1.da1.src1_reg_file == GEN_IMMEDIATE_VALUE)
        return imm (file, inst->bits1.da1.src1_reg_type, inst);

    if (inst->header.access_mode != GEN_ALIGN_1) {
        /* Align16 path. */
        if (inst->bits3.da16.src1_address_mode != GEN_ADDRESS_DIRECT) {
            string (file, "Indirect align16 address mode not supported");
            return 1;
        }
        return src_da16 (file,
                         inst->bits1.da16.src1_reg_type,
                         inst->bits1.da16.src1_reg_file,
                         inst->bits3.da16.src1_vert_stride,
                         inst->bits3.da16.src1_reg_nr,
                         inst->bits3.da16.src1_subreg_nr,
                         inst->bits3.da16.src1_abs,
                         inst->bits3.da16.src1_negate,
                         inst->bits3.da16.src1_swz_x,
                         inst->bits3.da16.src1_swz_y,
                         inst->bits3.da16.src1_swz_z,
                         inst->bits3.da16.src1_swz_w);
    }

    /* Align1 path. */
    if (inst->bits3.da1.src1_address_mode != GEN_ADDRESS_DIRECT)
        return src_ia1 (file,
                        inst->bits1.ia1.src1_reg_type,
                        inst->bits1.ia1.src1_reg_file,
                        inst->bits3.ia1.src1_indirect_offset,
                        inst->bits3.ia1.src1_subreg_nr,
                        inst->bits3.ia1.src1_negate,
                        inst->bits3.ia1.src1_abs,
                        inst->bits3.ia1.src1_address_mode,
                        inst->bits3.ia1.src1_horiz_stride,
                        inst->bits3.ia1.src1_width,
                        inst->bits3.ia1.src1_vert_stride);

    return src_da1 (file,
                    inst->bits1.da1.src1_reg_type,
                    inst->bits1.da1.src1_reg_file,
                    inst->bits3.da1.src1_vert_stride,
                    inst->bits3.da1.src1_width,
                    inst->bits3.da1.src1_horiz_stride,
                    inst->bits3.da1.src1_reg_nr,
                    inst->bits3.da1.src1_subreg_nr,
                    inst->bits3.da1.src1_abs,
                    inst->bits3.da1.src1_negate);
}
/* Lookup table indexed by the header.execution_size field (see qtr_ctrl):
   entry i holds 1 << i. */
static const int esize[6] = { 1, 2, 4, 8, 16, 32 };
/*
 * Print the quarter/half control suffix of an instruction.
 *
 * For an execution size of 8 the quarter control selects " 1Q".." 4Q";
 * for an execution size of 16 values below 2 print " 1H" and the others
 * " 2H". Any other execution size prints nothing. Always returns 0.
 */
static int qtr_ctrl(FILE *file, const struct GenInstruction *inst)
{
    const uint32_t esize_idx = inst->header.execution_size;
    const int qtr_ctl = inst->header.quarter_control;
    int exec_size;

    /* esize[] only has 6 entries; an execution_size encoding beyond them
       would read past the table (NOTE(review): the field width is not
       visible here -- presumably 3 bits, so 6 and 7 are reachable). Treat
       such encodings like any other size that has no suffix. */
    if (esize_idx >= sizeof (esize) / sizeof (esize[0]))
        return 0;
    exec_size = esize[esize_idx];

    if (exec_size == 8) {
        switch (qtr_ctl) {
        case 0:
            string (file, " 1Q");
            break;
        case 1:
            string (file, " 2Q");
            break;
        case 2:
            string (file, " 3Q");
            break;
        case 3:
            string (file, " 4Q");
            break;
        }
    } else if (exec_size == 16) {
        if (qtr_ctl < 2)
            string (file, " 1H");
        else
            string (file, " 2H");
    }
    return 0;
}
/*
 * Disassemble one Gen instruction and print it to `file`.
 *
 * `opaque_insn` is reinterpreted as a `struct GenInstruction`. The text is
 * emitted in this order: optional predicate, opcode with its modifiers,
 * execution size, destination and sources (padded to fixed columns), the
 * message descriptor on a second line for SEND/SENDC, and finally the
 * instruction options between braces, followed by ";" and a newline.
 *
 * Returns the OR of the status codes reported by the printing helpers.
 */
int gen_disasm (FILE *file, const void *opaque_insn)
{
const struct GenInstruction *inst = (const struct GenInstruction *) opaque_insn;
int err = 0;
int space = 0;
/* The generation is hard-coded: every `gen < 6` test below is false and
   every `gen >= 6` test is true. */
int gen = 7;
/* Predicate: "(<inverse>f<nr>[.<subnr>]<predicate control>) ". */
if (inst->header.predicate_control) {
string (file, "(");
err |= control (file, "predicate inverse", pred_inv, inst->header.predicate_inverse, NULL);
format (file, "f%d", inst->bits2.da1.flag_reg_nr);
if (inst->bits2.da1.flag_sub_reg_nr)
format (file, ".%d", inst->bits2.da1.flag_sub_reg_nr);
if (inst->header.access_mode == GEN_ALIGN_1)
err |= control (file, "predicate control align1", pred_ctrl_align1,
inst->header.predicate_control, NULL);
else
err |= control (file, "predicate control align16", pred_ctrl_align16,
inst->header.predicate_control, NULL);
string (file, ") ");
}
/* Opcode mnemonic followed by the saturate and debug-control modifiers. */
err |= print_opcode (file, inst->header.opcode);
err |= control (file, "saturate", saturate, inst->header.saturate, NULL);
err |= control (file, "debug control", debug_ctrl, inst->header.debug_control, NULL);
/* The destreg_or_condmod field is shared: it is the math function for MATH,
   is skipped here for SEND/SENDC (where it is used as the message target
   further down), and is the conditional modifier for everything else. */
if (inst->header.opcode == GEN_OPCODE_MATH) {
string (file, " ");
err |= control (file, "function", math_function,
inst->header.destreg_or_condmod, NULL);
} else if (inst->header.opcode != GEN_OPCODE_SEND &&
inst->header.opcode != GEN_OPCODE_SENDC) {
err |= control (file, "conditional modifier", conditional_modifier,
inst->header.destreg_or_condmod, NULL);
/* A non-zero conditional modifier is followed by the flag register. */
if (inst->header.destreg_or_condmod)
err |= flag_reg (file,
inst->bits2.da1.flag_reg_nr,
inst->bits2.da1.flag_sub_reg_nr);
}
/* "(<execution size>)" is printed for everything but NOP. */
if (inst->header.opcode != GEN_OPCODE_NOP) {
string (file, "(");
err |= control (file, "execution size", exec_size, inst->header.execution_size, NULL);
string (file, ")");
}
/* Never taken while gen is fixed to 7. */
if (inst->header.opcode == GEN_OPCODE_SEND && gen < 6)
format (file, " %d", inst->header.destreg_or_condmod);
/* Operands: three-source instructions have their own operand layout. */
if (opcode[inst->header.opcode].nsrc == 3) {
pad (file, 16);
err |= dest_3src (file, inst);
pad (file, 32);
err |= src0_3src (file, inst);
pad (file, 48);
err |= src1_3src (file, inst);
pad (file, 64);
err |= src2_3src (file, inst);
} else {
if (opcode[inst->header.opcode].ndst > 0) {
pad (file, 16);
err |= dest (file, inst);
} else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_IF ||
inst->header.opcode == GEN_OPCODE_ELSE ||
inst->header.opcode == GEN_OPCODE_ENDIF ||
inst->header.opcode == GEN_OPCODE_WHILE)) {
/* Not implemented: reaching this branch trips the assertion. */
// XXX format (file, " %d", inst->bits1.branch_gen6.jump_count);
assert(0);
} else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_BREAK ||
inst->header.opcode == GEN_OPCODE_CONTINUE ||
inst->header.opcode == GEN_OPCODE_HALT)) {
/* Not implemented: reaching this branch trips the assertion. */
// XXX format (file, " %d %d", inst->bits3.break_cont.uip, inst->bits3.break_cont.jip);
assert(0);
} else if (inst->header.opcode == GEN_OPCODE_JMPI) {
/* JMPI prints its signed immediate from bits3. */
format (file, " %d", inst->bits3.d);
}
if (opcode[inst->header.opcode].nsrc > 0) {
pad (file, 32);
err |= src0 (file, inst);
}
if (opcode[inst->header.opcode].nsrc > 1) {
pad (file, 48);
err |= src1 (file, inst);
}
}
/* SEND/SENDC: describe the message on a second line, starting with the
   target function taken from destreg_or_condmod. */
if (inst->header.opcode == GEN_OPCODE_SEND ||
inst->header.opcode == GEN_OPCODE_SENDC) {
enum GenMessageTarget target = inst->header.destreg_or_condmod;
newline (file);
pad (file, 16);
space = 0;
err |= control (file, "target function", target_function_gen6,
target, &space);
/* Per-target descriptor fields decoded from bits3. */
switch (target) {
case GEN_SFID_MATH:
err |= control (file, "math function", math_function,
inst->bits3.math_gen5.function, &space);
err |= control (file, "math saturate", math_saturate,
inst->bits3.math_gen5.saturate, &space);
err |= control (file, "math signed", math_signed,
inst->bits3.math_gen5.int_type, &space);
err |= control (file, "math scalar", math_scalar,
inst->bits3.math_gen5.data_type, &space);
err |= control (file, "math precision", math_precision,
inst->bits3.math_gen5.precision, &space);
break;
case GEN_SFID_SAMPLER:
format (file, " (%d, %d, %d, %d)",
inst->bits3.sampler_gen7.bti,
inst->bits3.sampler_gen7.sampler,
inst->bits3.sampler_gen7.msg_type,
inst->bits3.sampler_gen7.simd_mode);
break;
case GEN_SFID_DATAPORT_DATA_CACHE:
/* category == 0 is decoded as an untyped read/write message, any other
   value as a scratch read/write message. */
if(inst->bits3.gen7_untyped_rw.category == 0) {
format (file, " (bti: %d, rgba: %d, %s, %s, %s)",
inst->bits3.gen7_untyped_rw.bti,
inst->bits3.gen7_untyped_rw.rgba,
data_port_data_cache_simd_mode[inst->bits3.gen7_untyped_rw.simd_mode],
data_port_data_cache_category[inst->bits3.gen7_untyped_rw.category],
data_port_data_cache_msg_type[inst->bits3.gen7_untyped_rw.msg_type]);
} else {
format (file, " (addr: %d, blocks: %s, %s, mode: %s, %s)",
inst->bits3.gen7_scratch_rw.offset,
data_port_scratch_block_size[inst->bits3.gen7_scratch_rw.block_size],
data_port_scratch_invalidate[inst->bits3.gen7_scratch_rw.invalidate_after_read],
data_port_scratch_channel_mode[inst->bits3.gen7_scratch_rw.channel_mode],
data_port_scratch_msg_type[inst->bits3.gen7_scratch_rw.msg_type]);
}
break;
case GEN6_SFID_DATAPORT_CONSTANT_CACHE:
format (file, " (bti: %d, %s)",
inst->bits3.gen7_dword_rw.bti,
data_port_data_cache_msg_type[inst->bits3.gen7_dword_rw.msg_type]);
break;
case GEN_SFID_MESSAGE_GATEWAY:
format (file, " (subfunc: %s, notify: %d, ackreq: %d)",
gateway_sub_function[inst->bits3.gen7_msg_gw.subfunc],
inst->bits3.gen7_msg_gw.notify,
inst->bits3.gen7_msg_gw.ackreq);
break;
default:
format (file, "unsupported target %d", target);
break;
}
/* Message and response lengths are printed for every target. */
if (space)
string (file, " ");
format (file, "mlen %d", inst->bits3.generic_gen5.msg_length);
format (file, " rlen %d", inst->bits3.generic_gen5.response_length);
}
/* Instruction options between braces, for everything but NOP. */
pad (file, 64);
if (inst->header.opcode != GEN_OPCODE_NOP) {
string (file, "{");
space = 1;
err |= control(file, "access mode", access_mode, inst->header.access_mode, &space);
/* With gen fixed to 7 the "write enable control" table is always used. */
if (gen >= 6)
err |= control (file, "write enable control", wectrl, inst->header.mask_control, &space);
else
err |= control (file, "mask control", mask_ctrl, inst->header.mask_control, &space);
err |= control (file, "dependency control", dep_ctrl, inst->header.dependency_control, &space);
err |= qtr_ctrl (file, inst);
err |= control (file, "thread control", thread_ctrl, inst->header.thread_control, &space);
if (gen >= 6)
err |= control (file, "acc write control", accwr, inst->header.acc_wr_control, &space);
/* The end-of-thread bit is only decoded for SEND/SENDC. */
if (inst->header.opcode == GEN_OPCODE_SEND ||
inst->header.opcode == GEN_OPCODE_SENDC)
err |= control (file, "end of thread", end_of_thread,
inst->bits3.generic_gen5.end_of_thread, &space);
if (space)
string (file, " ");
string (file, "}");
}
string (file, ";");
newline (file);
return err;
}
Release_v0.3/backend/src/backend/gen/gen_mesa_disasm.h 0000664 0000000 0000000 00000002351 12231421770 0023011 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file gen_mesa_disasm.h
* \author Benjamin Segovia
*
* To decode and print one Gen ISA instruction. The code is directly taken
* from Mesa
*/
#ifndef __GBE_GEN_MESA_DISASM_H__
#define __GBE_GEN_MESA_DISASM_H__
/* FILE, used by the prototype below, is declared by <stdio.h>. */
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/*
 * Decode the Gen instruction pointed to by `opaque_insn` and print its
 * textual form to `file`. Returns an int status code.
 */
extern int gen_disasm(FILE *file, const void *opaque_insn);
#ifdef __cplusplus
}
#endif /* __cplusplus */
#endif /* __GBE_GEN_MESA_DISASM_H__ */
Release_v0.3/backend/src/backend/gen_context.cpp 0000664 0000000 0000000 00000172764 12231421770 0022012 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file gen_context.cpp
* \author Benjamin Segovia
*/
#include "backend/gen_context.hpp"
#include "backend/gen_program.hpp"
#include "backend/gen_defs.hpp"
#include "backend/gen_encoder.hpp"
#include "backend/gen_insn_selection.hpp"
#include "backend/gen_insn_scheduling.hpp"
#include "backend/gen_reg_allocation.hpp"
#include "backend/gen/gen_mesa_disasm.h"
#include "ir/function.hpp"
#include "sys/cvar.hpp"
#include
namespace gbe
{
///////////////////////////////////////////////////////////////////////////
// GenContext implementation
///////////////////////////////////////////////////////////////////////////
// Build the Gen back-end context for kernel `name` of `unit`: the base
// Context is initialised first, then the encoder, the instruction selector
// and the register allocator are allocated in that order.
GenContext::GenContext(const ir::Unit &unit,
                       const std::string &name,
                       bool limitRegisterPressure) :
  Context(unit, name),
  limitRegisterPressure(limitRegisterPressure)
{
  // XXX handle more than Gen7
  p = GBE_NEW(GenEncoder, simdWidth, 7);
  sel = GBE_NEW(Selection, *this);
  ra = GBE_NEW(GenRegAllocator, *this);
}
// Release the helper objects, register allocator first and encoder last.
GenContext::~GenContext(void)
{
  GBE_DELETE(ra);
  GBE_DELETE(sel);
  GBE_DELETE(p);
}
// Walk every selected instruction of every block and emit the matching Gen
// ISA through the encoder, then pad the stream with NOPs.
void GenContext::emitInstructionStream(void) {
  // Emit Gen ISA
  for (auto &block : *sel->blockList)
    for (auto &insn : block.insnList) {
      const uint32_t opcode = insn.opcode;
      p->push();
      // no more virtual register here in that part of the code generation
      GBE_ASSERT(insn.state.physicalFlag);
      p->curr = insn.state;
      // One case per selection opcode, generated from the .hxx list: each
      // opcode is routed to the emit<FAMILY>() method of its family.
      switch (opcode) {
#define DECL_SELECTION_IR(OPCODE, FAMILY) \
  case SEL_OP_##OPCODE: this->emit##FAMILY(insn); break;
#include "backend/gen_insn_selection.hxx"
// Undefine the macro that was actually defined above (the previous
// "#undef DECL_INSN" named a different macro and left this one defined).
#undef DECL_SELECTION_IR
      }
      p->pop();
    }
  /* per spec, pad the instruction stream with 8 nop to avoid
     instruction prefetcher prefetch into an invalid page */
  for (int i = 0; i < 8; i++)
    p->NOP();
}
void GenContext::patchBranches(void) {
using namespace ir;
for (auto pair : branchPos2) {
const LabelIndex label = pair.first;
const int32_t insnID = pair.second;
const int32_t targetID = labelPos.find(label)->second;
p->patchJMPI(insnID, (targetID-insnID-1) * 2);
}
}
void GenContext::clearFlagRegister(void) {
// when group size not aligned to simdWidth, flag register need clear to
// make prediction(any8/16h) work correctly
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.noMask = 1;
p->curr.execWidth = 1;
p->MOV(GenRegister::retype(GenRegister::flag(0,0), GEN_TYPE_UD), GenRegister::immud(0x0));
p->MOV(GenRegister::retype(GenRegister::flag(1,0), GEN_TYPE_UD), GenRegister::immud(0x0));
p->pop();
}
void GenContext::emitStackPointer(void) {
using namespace ir;
// Only emit stack pointer computation if we use a stack
if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0)
return;
// Check that everything is consistent in the kernel code
const uint32_t perLaneSize = kernel->getStackSize();
const uint32_t perThreadSize = perLaneSize * this->simdWidth;
const int32_t offset = GEN_REG_SIZE + kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
GBE_ASSERT(perLaneSize > 0);
GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
// Use shifts rather than muls which are limited to 32x16 bit sources
const uint32_t perLaneShift = logi2(perLaneSize);
const uint32_t perThreadShift = logi2(perThreadSize);
const GenRegister selStatckPtr = this->simdWidth == 8 ?
GenRegister::ud8grf(ir::ocl::stackptr) :
GenRegister::ud16grf(ir::ocl::stackptr);
const GenRegister stackptr = ra->genReg(selStatckPtr);
const uint32_t nr = offset / GEN_REG_SIZE;
const uint32_t subnr = (offset % GEN_REG_SIZE) / sizeof(uint32_t);
const GenRegister bufferptr = GenRegister::ud1grf(nr, subnr);
// We compute the per-lane stack pointer here
p->push();
p->curr.execWidth = 1;
p->curr.predicate = GEN_PREDICATE_NONE;
p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
p->curr.execWidth = this->simdWidth;
p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));
p->curr.execWidth = 1;
p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift));
p->curr.execWidth = this->simdWidth;
p->ADD(stackptr, stackptr, bufferptr);
p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
p->pop();
}
void GenContext::emitLabelInstruction(const SelectionInstruction &insn) {
const ir::LabelIndex label(insn.index);
this->labelPos.insert(std::make_pair(label, p->store.size()));
}
void GenContext::emitUnaryInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister src = ra->genReg(insn.src(0));
switch (insn.opcode) {
case SEL_OP_MOV: p->MOV(dst, src); break;
case SEL_OP_FBH: p->FBH(dst, src); break;
case SEL_OP_FBL: p->FBL(dst, src); break;
case SEL_OP_NOT: p->NOT(dst, src); break;
case SEL_OP_RNDD: p->RNDD(dst, src); break;
case SEL_OP_RNDU: p->RNDU(dst, src); break;
case SEL_OP_RNDE: p->RNDE(dst, src); break;
case SEL_OP_RNDZ: p->RNDZ(dst, src); break;
case SEL_OP_LOAD_INT64_IMM: p->LOAD_INT64_IMM(dst, src.value.i64); break;
case SEL_OP_CONVI64_TO_I:
{
int execWidth = p->curr.execWidth;
GenRegister xsrc = src.bottom_half(), xdst = dst;
p->push();
p->curr.execWidth = 8;
for(int i = 0; i < execWidth/4; i ++) {
p->curr.chooseNib(i);
p->MOV(xdst, xsrc);
xdst = GenRegister::suboffset(xdst, 4);
xsrc = GenRegister::suboffset(xsrc, 8);
}
p->pop();
break;
}
default: NOT_IMPLEMENTED;
}
}
// Emit a one-source instruction that needs a scratch register.
// dst(0) is the destination, src(0) the source and dst(1) the temporary.
void GenContext::emitUnaryWithTempInstruction(const SelectionInstruction &insn) {
GenRegister dst = ra->genReg(insn.dst(0));
GenRegister src = ra->genReg(insn.src(0));
GenRegister tmp = ra->genReg(insn.dst(1));
switch (insn.opcode) {
case SEL_OP_LOAD_DF_IMM:
p->LOAD_DF_IMM(dst, tmp, src.value.df);
break;
case SEL_OP_MOV_DF:
p->MOV_DF(dst, src, tmp);
break;
case SEL_OP_CONVF_TO_I64:
{
// Top half: truncate(src * 2^-32), computed in tmp as a float and then
// moved into a D-typed view of the same register.
tmp.type = GEN_TYPE_F;
GenRegister d = GenRegister::retype(tmp, GEN_TYPE_D);
float c = (1.f / 65536.f) * (1.f / 65536.f);
p->MUL(tmp, src, GenRegister::immf(c));
p->RNDZ(tmp, tmp);
p->MOV(d, tmp);
storeTopHalf(dst, d);
// Bottom half: |src| moved into the same register viewed as UD.
// NOTE(review): the sign of src is dropped here and the value is not
// reduced modulo 2^32 by this block -- confirm this is the intended result.
d.type = GEN_TYPE_UD;
p->MOV(d, GenRegister::abs(src));
storeBottomHalf(dst, d);
break;
}
case SEL_OP_CONVI_TO_I64: {
// `middle` is the 32-bit staging value: B and D sources are first moved
// into tmp (typed D or UD by signedness), anything else is used in place.
// NOTE(review): only B and D take the staging path; confirm that UB, W and
// UW sources are meant to be used directly.
GenRegister middle;
if (src.type == GEN_TYPE_B || src.type == GEN_TYPE_D) {
middle = tmp;
middle.type = src.is_signed_int() ? GEN_TYPE_D : GEN_TYPE_UD;
p->MOV(middle, src);
} else {
middle = src;
}
int execWidth = p->curr.execWidth;
p->push();
p->curr.execWidth = 8;
// One nibble selection per iteration, execWidth / 4 iterations.
for (int nib = 0; nib < execWidth / 4; nib ++) {
p->curr.chooseNib(nib);
p->MOV(dst.bottom_half(), middle);
// Top half: sign bits (arithmetic shift by 31) for signed values, zero
// otherwise.
if(middle.is_signed_int())
p->ASR(dst.top_half(), middle, GenRegister::immud(31));
else
p->MOV(dst.top_half(), GenRegister::immd(0));
dst = GenRegister::suboffset(dst, 4);
middle = GenRegister::suboffset(middle, 4);
}
p->pop();
break;
}
default:
NOT_IMPLEMENTED;
}
}
// Emit a two-source instruction that needs a scratch register.
// dst(0) is the destination, src(0)/src(1) the sources and dst(1) the
// temporary.
void GenContext::emitBinaryWithTempInstruction(const SelectionInstruction &insn) {
GenRegister dst = ra->genReg(insn.dst(0));
GenRegister src0 = ra->genReg(insn.src(0));
GenRegister src1 = ra->genReg(insn.src(1));
GenRegister tmp = ra->genReg(insn.dst(1));
switch (insn.opcode) {
case SEL_OP_I64ADD: {
// x is tmp viewed as UD; y is the same register offset by execWidth
// elements.
GenRegister x = GenRegister::retype(tmp, GEN_TYPE_UD),
y = GenRegister::suboffset(x, p->curr.execWidth);
// Bottom halves first.
loadBottomHalf(x, src0);
loadBottomHalf(y, src1);
addWithCarry(x, x, y);
storeBottomHalf(dst, x);
// Top halves: y is added before it is reloaded (presumably it holds the
// carry left by addWithCarry -- confirm against addWithCarry), then the
// top half of src1 is added.
loadTopHalf(x, src0);
p->ADD(x, x, y);
loadTopHalf(y, src1);
p->ADD(x, x, y);
storeTopHalf(dst, x);
break;
}
case SEL_OP_I64SUB: {
// Same register layout as I64ADD, using subWithBorrow throughout.
GenRegister x = GenRegister::retype(tmp, GEN_TYPE_UD),
y = GenRegister::suboffset(x, p->curr.execWidth);
loadBottomHalf(x, src0);
loadBottomHalf(y, src1);
subWithBorrow(x, x, y);
storeBottomHalf(dst, x);
// y is subtracted before it is reloaded (presumably the borrow left by
// subWithBorrow -- confirm), then the top half of src1 is subtracted.
loadTopHalf(x, src0);
subWithBorrow(x, x, y);
loadTopHalf(y, src1);
subWithBorrow(x, x, y);
storeTopHalf(dst, x);
break;
}
case SEL_OP_MUL_HI: {
// Processed in groups of 8 lanes, w / 8 iterations.
int w = p->curr.execWidth;
p->push();
p->curr.execWidth = 8;
for (int i = 0; i < w / 8; i ++) {
// Unpredicated MUL into the accumulator (viewed as UD), then MACH into
// tmp with accumulator write enabled.
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), src0, src1);
p->curr.accWrEnable = 1;
p->MACH(tmp, src0, src1);
p->pop();
// The copy into dst is issued under the outer state with quarter i.
p->curr.quarterControl = i;
p->MOV(dst, tmp);
dst = GenRegister::Qn(dst, 1);
src0 = GenRegister::Qn(src0, 1);
src1 = GenRegister::Qn(src1, 1);
}
p->pop();
break;
}
case SEL_OP_HADD: {
// Per group of 8 lanes: dst = (ADDC(src0, src1) >> 1) | (acc << 31).
int w = p->curr.execWidth;
p->push();
p->curr.execWidth = 8;
for (int i = 0; i < w / 8; i ++) {
p->curr.quarterControl = i;
p->ADDC(dst, src0, src1);
p->SHR(dst, dst, GenRegister::immud(1));
p->SHL(tmp, GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
p->OR(dst, dst, tmp);
dst = GenRegister::Qn(dst, 1);
src0 = GenRegister::Qn(src0, 1);
src1 = GenRegister::Qn(src1, 1);
}
p->pop();
break;
}
case SEL_OP_RHADD: {
// Same as HADD with 1 added to the sum before the shift.
int w = p->curr.execWidth;
p->push();
p->curr.execWidth = 8;
for (int i = 0; i < w / 8; i ++) {
p->curr.quarterControl = i;
p->ADDC(dst, src0, src1);
p->ADD(dst, dst, GenRegister::immud(1));
p->SHR(dst, dst, GenRegister::immud(1));
p->SHL(tmp, GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
p->OR(dst, dst, tmp);
dst = GenRegister::Qn(dst, 1);
src0 = GenRegister::Qn(src0, 1);
src1 = GenRegister::Qn(src1, 1);
}
p->pop();
break;
}
default:
NOT_IMPLEMENTED;
}
}
// Emit a two-source instruction. 32-bit opcodes map straight onto the
// encoder method of the same name. The 64-bit variants (SEL_INT64, I64AND,
// I64OR, I64XOR, UPSAMPLE_LONG) retype the operands as UL and apply the
// operation to the bottom and top halves separately, one nibble selection
// per iteration (execWidth / 4 iterations, each advancing by 4).
void GenContext::emitBinaryInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister src0 = ra->genReg(insn.src(0));
const GenRegister src1 = ra->genReg(insn.src(1));
switch (insn.opcode) {
case SEL_OP_SEL: p->SEL(dst, src0, src1); break;
case SEL_OP_SEL_INT64:
{
GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
int execWidth = p->curr.execWidth;
p->push();
p->curr.execWidth = 8;
for (int nib = 0; nib < execWidth / 4; nib ++) {
p->curr.chooseNib(nib);
p->SEL(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
p->SEL(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
xdst = GenRegister::suboffset(xdst, 4);
xsrc0 = GenRegister::suboffset(xsrc0, 4);
xsrc1 = GenRegister::suboffset(xsrc1, 4);
}
p->pop();
}
break;
case SEL_OP_AND: p->AND(dst, src0, src1); break;
case SEL_OP_OR: p->OR (dst, src0, src1); break;
case SEL_OP_XOR: p->XOR(dst, src0, src1); break;
case SEL_OP_I64AND:
{
GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
int execWidth = p->curr.execWidth;
p->push();
p->curr.execWidth = 8;
for (int nib = 0; nib < execWidth / 4; nib ++) {
p->curr.chooseNib(nib);
p->AND(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
p->AND(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
// The three advances are chained with the comma operator; they still run
// left to right.
xdst = GenRegister::suboffset(xdst, 4),
xsrc0 = GenRegister::suboffset(xsrc0, 4),
xsrc1 = GenRegister::suboffset(xsrc1, 4);
}
p->pop();
}
break;
case SEL_OP_I64OR:
{
GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
int execWidth = p->curr.execWidth;
p->push();
p->curr.execWidth = 8;
for (int nib = 0; nib < execWidth / 4; nib ++) {
p->curr.chooseNib(nib);
p->OR(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
p->OR(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
// Comma-chained advances, evaluated left to right.
xdst = GenRegister::suboffset(xdst, 4),
xsrc0 = GenRegister::suboffset(xsrc0, 4),
xsrc1 = GenRegister::suboffset(xsrc1, 4);
}
p->pop();
}
break;
case SEL_OP_I64XOR:
{
GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
int execWidth = p->curr.execWidth;
p->push();
p->curr.execWidth = 8;
for (int nib = 0; nib < execWidth / 4; nib ++) {
p->curr.chooseNib(nib);
p->XOR(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
p->XOR(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
// Comma-chained advances, evaluated left to right.
xdst = GenRegister::suboffset(xdst, 4),
xsrc0 = GenRegister::suboffset(xsrc0, 4),
xsrc1 = GenRegister::suboffset(xsrc1, 4);
}
p->pop();
}
break;
case SEL_OP_SHR: p->SHR(dst, src0, src1); break;
case SEL_OP_SHL: p->SHL(dst, src0, src1); break;
case SEL_OP_RSR: p->RSR(dst, src0, src1); break;
case SEL_OP_RSL: p->RSL(dst, src0, src1); break;
case SEL_OP_ASR: p->ASR(dst, src0, src1); break;
case SEL_OP_ADD: p->ADD(dst, src0, src1); break;
case SEL_OP_MUL: p->MUL(dst, src0, src1); break;
case SEL_OP_MACH: p->MACH(dst, src0, src1); break;
case SEL_OP_UPSAMPLE_SHORT: p->UPSAMPLE_SHORT(dst, src0, src1); break;
case SEL_OP_UPSAMPLE_INT: p->UPSAMPLE_INT(dst, src0, src1); break;
case SEL_OP_UPSAMPLE_LONG:
{
// The bottom half of src0 becomes the top half of dst and the bottom half
// of src1 becomes the bottom half of dst.
GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
int execWidth = p->curr.execWidth;
p->push();
p->curr.execWidth = 8;
for (int nib = 0; nib < execWidth / 4; nib ++) {
p->curr.chooseNib(nib);
p->MOV(xdst.top_half(), xsrc0.bottom_half());
p->MOV(xdst.bottom_half(), xsrc1.bottom_half());
xdst = GenRegister::suboffset(xdst, 4);
xsrc0 = GenRegister::suboffset(xsrc0, 4);
xsrc1 = GenRegister::suboffset(xsrc1, 4);
}
p->pop();
}
break;
default: NOT_IMPLEMENTED;
}
}
// Build the per-lane shift count used by the 64-bit shift emitters: each
// group of four lanes of dest receives (bottom half of src) & 63.
// The work is issued unpredicated, one 4-element step at a time, with the
// encoder's execution width forced to 8 for the duration.
void GenContext::collectShifter(GenRegister dest, GenRegister src) {
  // Read the caller's width before it is overridden below.
  const int callerWidth = p->curr.execWidth;
  const int groupCount = callerWidth / 4;
  const GenRegister mask = GenRegister::immud(63);

  p->push();
    p->curr.predicate = GEN_PREDICATE_NONE;
    p->curr.execWidth = 8;
    int remaining = groupCount;
    while (remaining > 0) {
      p->AND(dest, src.bottom_half(), mask);
      // Step both operands to the next group of four elements.
      dest = GenRegister::suboffset(dest, 4);
      src = GenRegister::suboffset(src, 4);
      --remaining;
    }
  p->pop();
}
// Add the 64-bit pair (high2, low2) into the pair (high1, low1).
// addWithCarry() overwrites its last operand with the accumulator value left
// by ADDC (the carry-out), so both low2 and high2 are clobbered by this call.
void GenContext::I64FullAdd(GenRegister high1, GenRegister low1, GenRegister high2, GenRegister low2) {
// low1 += low2; low2 now holds the carry out of the low words.
addWithCarry(low1, low1, low2);
// high1 += high2; high2 now holds the carry out of this add (not used below).
addWithCarry(high1, high1, high2);
// Fold the low-word carry into the high word. This is a plain ADD, so a
// carry out of it is not captured anywhere.
p->ADD(high1, high1, low2);
}
// Emit a 64x64 multiply of (x_high, x_low) by (y_high, y_low) assembled from
// four 32x32 partial products (see I32FullMult).
// The product words are accumulated in dst1..dst4; callers in this file store
// dst1/dst2 as the upper 64 bits and dst3/dst4 as the lower 64 bits.
// x_low (b) and y_low (d) are reused as scratch, so their registers no longer
// hold the inputs when this returns.
void GenContext::I64FullMult(GenRegister dst1, GenRegister dst2, GenRegister dst3, GenRegister dst4, GenRegister x_high, GenRegister x_low, GenRegister y_high, GenRegister y_low) {
// Short aliases: result words e,f,g,h and operands x = (a,b), y = (c,d).
GenRegister &e = dst1, &f = dst2, &g = dst3, &h = dst4,
&a = x_high, &b = x_low, &c = y_high, &d = y_low;
// (e,h) <- b * d
I32FullMult(e, h, b, d);
// (f,g) <- a * d
I32FullMult(f, g, a, d);
// g += e; e is replaced by the carry out of that add.
addWithCarry(g, g, e);
// f += carry; e is replaced again by the carry out of this add.
addWithCarry(f, f, e);
// (e,d) <- b * c; from here on d no longer holds y_low.
I32FullMult(e, d, b, c);
// (f,g) += (e,d); e and d are clobbered by I64FullAdd.
I64FullAdd(f, g, e, d);
// (b,d) <- a * c; from here on b no longer holds x_low.
I32FullMult(b, d, a, c);
// (e,f) += (b,d)
I64FullAdd(e, f, b, d);
}
// Negate the 64-bit value held in (high, low) in place: invert both words,
// then add one to the low word and propagate the carry into the high word.
// tmp is scratch: it is loaded with 1 and then overwritten by addWithCarry()
// with the carry out of the low-word add.
// The instructions are emitted with whatever predicate/flag state the caller
// has set on p->curr.
void GenContext::I64Neg(GenRegister high, GenRegister low, GenRegister tmp) {
p->NOT(high, high);
p->NOT(low, low);
p->MOV(tmp, GenRegister::immud(1));
// low += 1; tmp <- carry
addWithCarry(low, low, tmp);
// high += carry
p->ADD(high, high, tmp);
}
// Replace the 64-bit value (high, low) by its negation in the lanes whose
// bit 31 of high is set, and leave bit 31 of the original high word in sign.
// tmp is scratch for I64Neg(); flagReg is the flag register used for the
// comparison that drives the predicated negate.
void GenContext::I64ABS(GenRegister sign, GenRegister high, GenRegister low, GenRegister tmp, GenRegister flagReg) {
// sign <- high >> 31
p->SHR(sign, high, GenRegister::immud(31));
p->push();
// The compare itself is unpredicated and writes flagReg.
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NZ, sign, GenRegister::immud(0));
// Everything I64Neg emits is predicated on that flag.
p->curr.predicate = GEN_PREDICATE_NORMAL;
I64Neg(high, low, tmp);
p->pop();
}
void GenContext::emitI64MULHIInstruction(const SelectionInstruction &insn) {
GenRegister dest = ra->genReg(insn.dst(0));
GenRegister x = ra->genReg(insn.src(0));
GenRegister y = ra->genReg(insn.src(1));
GenRegister a = ra->genReg(insn.dst(1));
GenRegister b = ra->genReg(insn.dst(2));
GenRegister c = ra->genReg(insn.dst(3));
GenRegister d = ra->genReg(insn.dst(4));
GenRegister e = ra->genReg(insn.dst(5));
GenRegister f = ra->genReg(insn.dst(6));
GenRegister g = ra->genReg(insn.dst(7));
GenRegister h = ra->genReg(insn.dst(8));
GenRegister i = ra->genReg(insn.dst(9));
GenRegister flagReg = ra->genReg(insn.dst(10));
loadTopHalf(a, x);
loadBottomHalf(b, x);
loadTopHalf(c, y);
loadBottomHalf(d, y);
if(x.type == GEN_TYPE_UL) {
I64FullMult(e, f, g, h, a, b, c, d);
} else {
I64ABS(e, a, b, i, flagReg);
I64ABS(f, c, d, i, flagReg);
p->XOR(i, e, f);
I64FullMult(e, f, g, h, a, b, c, d);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NZ, i, GenRegister::immud(0));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->NOT(e, e);
p->NOT(f, f);
p->NOT(g, g);
p->NOT(h, h);
p->MOV(i, GenRegister::immud(1));
addWithCarry(h, h, i);
addWithCarry(g, g, i);
addWithCarry(f, f, i);
p->ADD(e, e, i);
p->pop();
}
storeTopHalf(dest, e);
storeBottomHalf(dest, f);
}
// Emit a saturating 64-bit multiply-add: dest = saturate(x * y + z) with
// x = src(0), y = src(1), z = src(2).
// dst(0) is the result, dst(1)..dst(9) are 32-bit temporaries (a..i) and
// dst(10) is the flag register used for predication.
// The 128-bit intermediate lives in (e,f,g,h), e being the most significant
// word; the stored result is (g,h).
// NOTE(review): whether the CMPs below are signed or unsigned depends on the
// types of the temporaries chosen by the selection stage - confirm there.
void GenContext::emitI64MADSATInstruction(const SelectionInstruction &insn) {
GenRegister dest = ra->genReg(insn.dst(0));
GenRegister x = ra->genReg(insn.src(0));
GenRegister y = ra->genReg(insn.src(1));
GenRegister z = ra->genReg(insn.src(2));
GenRegister a = ra->genReg(insn.dst(1));
GenRegister b = ra->genReg(insn.dst(2));
GenRegister c = ra->genReg(insn.dst(3));
GenRegister d = ra->genReg(insn.dst(4));
GenRegister e = ra->genReg(insn.dst(5));
GenRegister f = ra->genReg(insn.dst(6));
GenRegister g = ra->genReg(insn.dst(7));
GenRegister h = ra->genReg(insn.dst(8));
GenRegister i = ra->genReg(insn.dst(9));
GenRegister flagReg = ra->genReg(insn.dst(10));
GenRegister zero = GenRegister::immud(0), one = GenRegister::immud(1);
// (a,b) <- x, (c,d) <- y
loadTopHalf(a, x);
loadBottomHalf(b, x);
loadTopHalf(c, y);
loadBottomHalf(d, y);
if(x.type == GEN_TYPE_UL) {
// Unsigned path: (e,f,g,h) <- x * y
I64FullMult(e, f, g, h, a, b, c, d);
// (c,d) <- z
loadTopHalf(c, z);
loadBottomHalf(d, z);
// Add z's low word at word h; d carries the ripple up through g, f and e.
addWithCarry(h, h, d);
addWithCarry(g, g, d);
addWithCarry(f, f, d);
p->ADD(e, e, d);
// Add z's high word at word g; c carries the ripple up through f and e.
addWithCarry(g, g, c);
addWithCarry(f, f, c);
p->ADD(e, e, c);
// Any bit set in the upper 64 bits means the sum does not fit.
p->OR(a, e, f);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NZ, a, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
// Saturate to all ones in the overflowing lanes.
p->MOV(g, GenRegister::immd(-1));
p->MOV(h, GenRegister::immd(-1));
p->pop();
} else {
// Signed path: multiply the magnitudes, then negate the 128-bit product
// in the lanes where the operand signs differ (i != 0).
I64ABS(e, a, b, i, flagReg);
I64ABS(f, c, d, i, flagReg);
p->XOR(i, e, f);
I64FullMult(e, f, g, h, a, b, c, d);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NZ, i, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->NOT(e, e);
p->NOT(f, f);
p->NOT(g, g);
p->NOT(h, h);
p->MOV(i, one);
addWithCarry(h, h, i);
addWithCarry(g, g, i);
addWithCarry(f, f, i);
p->ADD(e, e, i);
p->pop();
// (c,d) <- z
loadTopHalf(c, z);
loadBottomHalf(d, z);
// b <- z's high word arithmetically shifted by 31 (its sign-extension
// word); a keeps a copy because b is clobbered by addWithCarry below.
p->ASR(GenRegister::retype(b, GEN_TYPE_D), GenRegister::retype(c, GEN_TYPE_D), GenRegister::immd(31));
p->MOV(a, b);
// Add z's low word at word h, rippling the carry upwards.
addWithCarry(h, h, d);
addWithCarry(g, g, d);
addWithCarry(f, f, d);
p->ADD(e, e, d);
// Add z's high word at word g, rippling the carry upwards.
addWithCarry(g, g, c);
addWithCarry(f, f, c);
p->ADD(e, e, c);
// Add the sign-extension word at f (b becomes the carry) and at e (copy a).
addWithCarry(f, f, b);
p->ADD(e, e, b);
p->ADD(e, e, a);
// Positive overflow test: b is set when e != 0, f != 0 or
// g > 0x7FFFFFFF, and cleared again when bit 31 of e is set.
p->MOV(b, zero);
p->push();
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->curr.predicate = GEN_PREDICATE_NONE;
p->CMP(GEN_CONDITIONAL_NZ, e, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(b, one);
p->curr.predicate = GEN_PREDICATE_NONE;
p->CMP(GEN_CONDITIONAL_NZ, f, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(b, one);
p->curr.predicate = GEN_PREDICATE_NONE;
p->CMP(GEN_CONDITIONAL_G, g, GenRegister::immud(0x7FFFFFFF));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(b, one);
p->curr.predicate = GEN_PREDICATE_NONE;
// a <- bit 31 of e (sign of the 128-bit sum)
p->SHR(a, e, GenRegister::immud(31));
p->CMP(GEN_CONDITIONAL_NZ, a, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(b, zero);
p->curr.predicate = GEN_PREDICATE_NONE;
p->CMP(GEN_CONDITIONAL_NZ, b, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
// Clamp to 0x7FFFFFFF_FFFFFFFF.
p->MOV(g, GenRegister::immud(0x7FFFFFFF));
p->MOV(h, GenRegister::immud(0xFFFFFFFFu));
// Negative overflow test: b is set when e or f is not all ones or
// g <= 0x7FFFFFFF, and cleared again when bit 31 of e is clear (a == 0).
p->curr.predicate = GEN_PREDICATE_NONE;
p->MOV(b, zero);
p->CMP(GEN_CONDITIONAL_NEQ, e, GenRegister::immud(0xFFFFFFFFu));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(b, one);
p->curr.predicate = GEN_PREDICATE_NONE;
p->CMP(GEN_CONDITIONAL_NEQ, f, GenRegister::immud(0xFFFFFFFFu));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(b, one);
p->curr.predicate = GEN_PREDICATE_NONE;
p->CMP(GEN_CONDITIONAL_LE, g, GenRegister::immud(0x7FFFFFFF));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(b, one);
p->curr.predicate = GEN_PREDICATE_NONE;
p->CMP(GEN_CONDITIONAL_Z, a, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(b, zero);
p->curr.predicate = GEN_PREDICATE_NONE;
p->CMP(GEN_CONDITIONAL_NZ, b, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
// Clamp to 0x80000000_00000000.
p->MOV(g, GenRegister::immud(0x80000000u));
p->MOV(h, zero);
p->pop();
}
// The low 64 bits of the (possibly clamped) sum are the result.
storeTopHalf(dest, g);
storeBottomHalf(dest, h);
}
// Emit a 64-bit halving add: dest = (x + y) >> 1, where the sum is kept as a
// 65-bit value so the carry out of bit 63 is shifted back in as bit 63.
// dst(0) is the result and dst(1)..dst(4) are temporaries, all forced to
// GEN_TYPE_UD so the shifts below are logical.
void GenContext::emitI64HADDInstruction(const SelectionInstruction &insn) {
GenRegister dest = ra->genReg(insn.dst(0));
GenRegister x = ra->genReg(insn.src(0));
GenRegister y = ra->genReg(insn.src(1));
GenRegister a = ra->genReg(insn.dst(1));
GenRegister b = ra->genReg(insn.dst(2));
GenRegister c = ra->genReg(insn.dst(3));
GenRegister d = ra->genReg(insn.dst(4));
a.type = b.type = c.type = d.type = GEN_TYPE_UD;
// a <- low(x), b <- low(y), c <- high(x), d <- high(y)
loadBottomHalf(a, x);
loadBottomHalf(b, y);
loadTopHalf(c, x);
loadTopHalf(d, y);
// a <- low sum; b <- carry out of the low words
addWithCarry(a, a, b);
// c <- high(x) + carry; b <- carry out of that add
addWithCarry(c, c, b);
// c <- c + high(y); d <- carry out of that add
addWithCarry(c, c, d);
// b <- combined carry out of the high word (bit 64 of the sum)
p->ADD(b, b, d);
// Shift the 65-bit value (b, c, a) right by one: each word takes the low
// bit of the word above it as its new bit 31.
p->SHR(a, a, GenRegister::immud(1));
p->SHL(d, c, GenRegister::immud(31));
p->OR(a, a, d);
p->SHR(c, c, GenRegister::immud(1));
p->SHL(d, b, GenRegister::immud(31));
p->OR(c, c, d);
storeBottomHalf(dest, a);
storeTopHalf(dest, c);
}
// Emit a 64-bit rounding halving add: dest = (x + y + 1) >> 1, where the sum
// is kept as a 65-bit value so the carry out of bit 63 is shifted back in.
// dst(0) is the result and dst(1)..dst(4) are temporaries, all forced to
// GEN_TYPE_UD so the shifts below are logical.
void GenContext::emitI64RHADDInstruction(const SelectionInstruction &insn) {
GenRegister dest = ra->genReg(insn.dst(0));
GenRegister x = ra->genReg(insn.src(0));
GenRegister y = ra->genReg(insn.src(1));
GenRegister a = ra->genReg(insn.dst(1));
GenRegister b = ra->genReg(insn.dst(2));
GenRegister c = ra->genReg(insn.dst(3));
GenRegister d = ra->genReg(insn.dst(4));
a.type = b.type = c.type = d.type = GEN_TYPE_UD;
// a <- low(x) + low(y); b <- carry
loadBottomHalf(a, x);
loadBottomHalf(b, y);
addWithCarry(a, a, b);
// a <- a + 1; c <- carry of the increment; b <- total carry of the low word
p->MOV(c, GenRegister::immud(1));
addWithCarry(a, a, c);
p->ADD(b, b, c);
// c <- high(x), d <- high(y)
loadTopHalf(c, x);
loadTopHalf(d, y);
// c <- high(x) + low-word carry; b <- carry out of that add
addWithCarry(c, c, b);
// c <- c + high(y); d <- carry out of that add
addWithCarry(c, c, d);
// b <- combined carry out of the high word (bit 64 of the sum)
p->ADD(b, b, d);
// Shift the 65-bit value (b, c, a) right by one: each word takes the low
// bit of the word above it as its new bit 31.
p->SHR(a, a, GenRegister::immud(1));
p->SHL(d, c, GenRegister::immud(31));
p->OR(a, a, d);
p->SHR(c, c, GenRegister::immud(1));
p->SHL(d, b, GenRegister::immud(31));
p->OR(c, c, d);
storeBottomHalf(dest, a);
storeTopHalf(dest, c);
}
// Emit a 64-bit shift (SEL_OP_I64SHL, SEL_OP_I64SHR or SEL_OP_I64ASR) of
// src(0) by src(1), built from 32-bit operations on the two halves.
//
// dst(0) is the result, dst(1)..dst(6) are 32-bit temporaries (a..f, forced
// to GEN_TYPE_UD) and dst(7) is the flag register driving the selects.
//
// Shared scheme, with a = shift count & 63 (see collectShifter), e = low
// word and f = high word of the source:
//  1. shift both words by a and build the word that receives bits from its
//     neighbour by OR-ing in the neighbour shifted by the negated count;
//  2. where a == 0 keep the plainly shifted word instead of the combined one;
//  3. where bit 5 of a is set (count >= 32) the word that receives bits takes
//     the other word's shifted value and the other word becomes the fill.
//
// The fill is zero for SHL/SHR. For ASR it is the sign word (high word
// arithmetically shifted by 31): the previous code used a constant -1, which
// gave a wrong top half for non-negative inputs shifted by 32 or more.
void GenContext::emitI64ShiftInstruction(const SelectionInstruction &insn) {
  GenRegister dest = ra->genReg(insn.dst(0));
  GenRegister x = ra->genReg(insn.src(0));
  GenRegister y = ra->genReg(insn.src(1));
  GenRegister a = ra->genReg(insn.dst(1));
  GenRegister b = ra->genReg(insn.dst(2));
  GenRegister c = ra->genReg(insn.dst(3));
  GenRegister d = ra->genReg(insn.dst(4));
  GenRegister e = ra->genReg(insn.dst(5));
  GenRegister f = ra->genReg(insn.dst(6));
  a.type = b.type = c.type = d.type = e.type = f.type = GEN_TYPE_UD;
  GenRegister flagReg = ra->genReg(insn.dst(7));
  GenRegister zero = GenRegister::immud(0);
  switch(insn.opcode) {
    case SEL_OP_I64SHL:
      p->push();
      p->curr.predicate = GEN_PREDICATE_NONE;
      collectShifter(a, y);
      loadBottomHalf(e, x);
      loadTopHalf(f, x);
      // b <- bits of the low word that cross into the high word
      p->SHR(b, e, GenRegister::negate(a));
      // c <- low word of the result, d <- high word shifted in place
      p->SHL(c, e, a);
      p->SHL(d, f, a);
      p->OR(e, d, b);
      // Step 2: d stays as is where a == 0, otherwise takes the combined word.
      p->MOV(flagReg, GenRegister::immuw(0xFFFF));
      p->curr.predicate = GEN_PREDICATE_NORMAL;
      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
      p->CMP(GEN_CONDITIONAL_Z, a, zero);
      p->SEL(d, d, e);
      // Step 3: for counts >= 32 the high word is the shifted low word and
      // the low word is zero.
      p->curr.predicate = GEN_PREDICATE_NONE;
      p->AND(a, a, GenRegister::immud(32));
      p->MOV(flagReg, GenRegister::immuw(0xFFFF));
      p->curr.predicate = GEN_PREDICATE_NORMAL;
      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
      p->CMP(GEN_CONDITIONAL_Z, a, zero);
      p->SEL(d, d, c);
      p->SEL(c, c, zero);
      p->pop();
      storeBottomHalf(dest, c);
      storeTopHalf(dest, d);
      break;
    case SEL_OP_I64SHR:
      p->push();
      p->curr.predicate = GEN_PREDICATE_NONE;
      collectShifter(a, y);
      loadBottomHalf(e, x);
      loadTopHalf(f, x);
      // b <- bits of the high word that cross into the low word
      p->SHL(b, f, GenRegister::negate(a));
      // c <- high word of the result, d <- low word shifted in place
      p->SHR(c, f, a);
      p->SHR(d, e, a);
      p->OR(e, d, b);
      // Step 2: d stays as is where a == 0, otherwise takes the combined word.
      p->MOV(flagReg, GenRegister::immuw(0xFFFF));
      p->curr.predicate = GEN_PREDICATE_NORMAL;
      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
      p->CMP(GEN_CONDITIONAL_Z, a, zero);
      p->SEL(d, d, e);
      // Step 3: for counts >= 32 the low word is the shifted high word and
      // the high word is zero.
      p->curr.predicate = GEN_PREDICATE_NONE;
      p->AND(a, a, GenRegister::immud(32));
      p->MOV(flagReg, GenRegister::immuw(0xFFFF));
      p->curr.predicate = GEN_PREDICATE_NORMAL;
      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
      p->CMP(GEN_CONDITIONAL_Z, a, zero);
      p->SEL(d, d, c);
      p->SEL(c, c, zero);
      p->pop();
      storeBottomHalf(dest, d);
      storeTopHalf(dest, c);
      break;
    case SEL_OP_I64ASR:
      // The high word is treated as signed so ASR replicates its sign bit.
      f.type = GEN_TYPE_D;
      p->push();
      p->curr.predicate = GEN_PREDICATE_NONE;
      collectShifter(a, y);
      loadBottomHalf(e, x);
      loadTopHalf(f, x);
      // b <- bits of the high word that cross into the low word
      p->SHL(b, f, GenRegister::negate(a));
      // c <- high word of the result, d <- low word shifted in place
      p->ASR(c, f, a);
      p->SHR(d, e, a);
      p->OR(e, d, b);
      // Step 2: d stays as is where a == 0, otherwise takes the combined word.
      p->MOV(flagReg, GenRegister::immuw(0xFFFF));
      p->curr.predicate = GEN_PREDICATE_NORMAL;
      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
      p->CMP(GEN_CONDITIONAL_Z, a, zero);
      p->SEL(d, d, e);
      p->curr.predicate = GEN_PREDICATE_NONE;
      p->AND(a, a, GenRegister::immud(32));
      // b is no longer needed: reuse it for the sign word (all ones for a
      // negative source, zero otherwise). Computed unpredicated so every
      // lane has it before the selects below.
      p->ASR(b, f, GenRegister::immud(31));
      // Step 3: for counts >= 32 the low word is the shifted high word and
      // the high word is the sign word.
      p->MOV(flagReg, GenRegister::immuw(0xFFFF));
      p->curr.predicate = GEN_PREDICATE_NORMAL;
      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
      p->CMP(GEN_CONDITIONAL_Z, a, zero);
      p->SEL(d, d, c);
      p->SEL(c, c, b);
      p->pop();
      storeBottomHalf(dest, d);
      storeTopHalf(dest, c);
      break;
    default:
      NOT_IMPLEMENTED;
  }
}
// Copy flag register (flag, subFlag) into dest with a single-lane MOV.
void GenContext::saveFlag(GenRegister dest, int flag, int subFlag) {
  const GenRegister flagSource = GenRegister::flag(flag, subFlag);
  p->push();
    p->curr.execWidth = 1;
    p->MOV(dest, flagSource);
  p->pop();
}
// Convert the 64-bit value (high, low) to a float in dst as
// high * 2^32 + low. tmp is scratch; it is written through a float-typed
// view of the same register.
void GenContext::UnsignedI64ToFloat(GenRegister dst, GenRegister high, GenRegister low, GenRegister tmp) {
  const GenRegister twoPow32 = GenRegister::immf(65536.f * 65536.f);
  GenRegister lowAsFloat = tmp;
  lowAsFloat.type = GEN_TYPE_F;

  // dst <- high * 2^32
  p->MOV(dst, high);
  p->MUL(dst, dst, twoPow32);
  // dst <- dst + low
  p->MOV(lowAsFloat, low);
  p->ADD(dst, dst, lowAsFloat);
}
void GenContext::emitI64ToFloatInstruction(const SelectionInstruction &insn) {
GenRegister src = ra->genReg(insn.src(0));
GenRegister dest = ra->genReg(insn.dst(0));
GenRegister high = ra->genReg(insn.dst(1));
GenRegister low = ra->genReg(insn.dst(2));
GenRegister tmp = ra->genReg(insn.dst(3));
GenRegister flagReg = ra->genReg(insn.dst(4));
loadTopHalf(high, src);
loadBottomHalf(low, src);
if(!src.is_signed_int()) {
UnsignedI64ToFloat(dest, high, low, tmp);
} else {
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_GE, high, GenRegister::immud(0x80000000));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->NOT(high, high);
p->NOT(low, low);
p->MOV(tmp, GenRegister::immud(1));
addWithCarry(low, low, tmp);
p->ADD(high, high, tmp);
p->pop();
UnsignedI64ToFloat(dest, high, low, tmp);
p->push();
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
dest.type = GEN_TYPE_UD;
p->OR(dest, dest, GenRegister::immud(0x80000000));
p->pop();
}
}
// Emit a 64-bit compare of src(0) against src(1) using 32-bit compares on the
// two halves. The condition is insn.extra.function; the result is written to
// the flag register selected on entry (p->curr.flag / p->curr.subFlag), AND-ed
// with the value that flag held on entry.
// dst(0) and dst(1) hold the halves being compared; dst(2) is viewed as UW and
// provides four 16-bit slots f1..f4 for saved flag values.
void GenContext::emitI64CompareInstruction(const SelectionInstruction &insn) {
GenRegister src0 = ra->genReg(insn.src(0));
GenRegister src1 = ra->genReg(insn.src(1));
GenRegister tmp0 = ra->genReg(insn.dst(0));
GenRegister tmp1 = ra->genReg(insn.dst(1));
GenRegister tmp2 = ra->genReg(insn.dst(2));
// The top halves are compared as signed when the source is GEN_TYPE_L.
tmp0.type = (src0.type == GEN_TYPE_L) ? GEN_TYPE_D : GEN_TYPE_UD;
tmp1.type = (src1.type == GEN_TYPE_L) ? GEN_TYPE_D : GEN_TYPE_UD;
int flag = p->curr.flag, subFlag = p->curr.subFlag;
GenRegister f1 = GenRegister::retype(tmp2, GEN_TYPE_UW),
f2 = GenRegister::suboffset(f1, 1),
f3 = GenRegister::suboffset(f1, 2),
f4 = GenRegister::suboffset(f1, 3);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
// f4 <- flag value on entry
saveFlag(f4, flag, subFlag);
loadTopHalf(tmp0, src0);
loadTopHalf(tmp1, src1);
switch(insn.extra.function) {
case GEN_CONDITIONAL_L:
case GEN_CONDITIONAL_LE:
case GEN_CONDITIONAL_G:
case GEN_CONDITIONAL_GE:
// Ordered compare: f1 = (top strictly ordered) | (top equal & bottom ordered)
{
// LE/GE use the strict L/G form on the top halves.
int cmpTopHalf = insn.extra.function;
if(insn.extra.function == GEN_CONDITIONAL_LE)
cmpTopHalf = GEN_CONDITIONAL_L;
if(insn.extra.function == GEN_CONDITIONAL_GE)
cmpTopHalf = GEN_CONDITIONAL_G;
p->CMP(cmpTopHalf, tmp0, tmp1);
}
// f1 <- top halves strictly ordered
saveFlag(f1, flag, subFlag);
p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
// f2 <- top halves equal
saveFlag(f2, flag, subFlag);
// The bottom halves are always compared as unsigned.
tmp0.type = tmp1.type = GEN_TYPE_UD;
loadBottomHalf(tmp0, src0);
loadBottomHalf(tmp1, src1);
p->CMP(insn.extra.function, tmp0, tmp1);
// f3 <- bottom halves satisfy the requested condition
saveFlag(f3, flag, subFlag);
p->push();
p->curr.execWidth = 1;
p->AND(f2, f2, f3);
p->OR(f1, f1, f2);
p->pop();
break;
case GEN_CONDITIONAL_EQ:
// Equal: both halves must be equal.
p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
saveFlag(f1, flag, subFlag);
tmp0.type = tmp1.type = GEN_TYPE_UD;
loadBottomHalf(tmp0, src0);
loadBottomHalf(tmp1, src1);
p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
saveFlag(f2, flag, subFlag);
p->push();
p->curr.execWidth = 1;
p->AND(f1, f1, f2);
p->pop();
break;
case GEN_CONDITIONAL_NEQ:
// Not equal: either half may differ.
p->CMP(GEN_CONDITIONAL_NEQ, tmp0, tmp1);
saveFlag(f1, flag, subFlag);
tmp0.type = tmp1.type = GEN_TYPE_UD;
loadBottomHalf(tmp0, src0);
loadBottomHalf(tmp1, src1);
p->CMP(GEN_CONDITIONAL_NEQ, tmp0, tmp1);
saveFlag(f2, flag, subFlag);
p->push();
p->curr.execWidth = 1;
p->OR(f1, f1, f2);
p->pop();
break;
default:
NOT_IMPLEMENTED;
}
// Combine with the flag value saved on entry and write the result back to
// the flag register.
p->curr.execWidth = 1;
p->AND(f1, f1, f4);
p->MOV(GenRegister::flag(flag, subFlag), f1);
p->pop();
}
// Emit a saturating 64-bit add: dst(0) = saturate(src(0) + src(1)).
// dst(1)..dst(5) are 32-bit temporaries (a..e) and dst(6) is the flag register
// used for predication. The sum is built in (a,b) = (high, low).
void GenContext::emitI64SATADDInstruction(const SelectionInstruction &insn) {
GenRegister x = ra->genReg(insn.src(0));
GenRegister y = ra->genReg(insn.src(1));
GenRegister dst = ra->genReg(insn.dst(0));
GenRegister a = ra->genReg(insn.dst(1));
GenRegister b = ra->genReg(insn.dst(2));
GenRegister c = ra->genReg(insn.dst(3));
GenRegister d = ra->genReg(insn.dst(4));
GenRegister e = ra->genReg(insn.dst(5));
GenRegister flagReg = ra->genReg(insn.dst(6));
// (a,b) <- x, (c,d) <- y
loadTopHalf(a, x);
loadBottomHalf(b, x);
loadTopHalf(c, y);
loadBottomHalf(d, y);
// Signed only: e <- bit 31 of x's high word, taken before a is modified.
if(dst.is_signed_int())
p->SHR(e, a, GenRegister::immud(31));
// b += low(y); d <- carry
addWithCarry(b, b, d);
// a += carry; d <- carry out of that add
addWithCarry(a, a, d);
// a += high(y); c <- carry out of that add
addWithCarry(a, a, c);
// c <- combined carry out of the high word
p->ADD(c, c, d);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
if(! dst.is_signed_int()) {
// Unsigned: a carry out of the high word clamps the result to all ones.
p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(a, GenRegister::immud(0xFFFFFFFFu));
p->MOV(b, GenRegister::immud(0xFFFFFFFFu));
} else {
// Signed: where x had its sign bit set (e == 1) and the high word of the sum
// compares below 0x80000000, store 0x80000000_00000000. The second CMP is
// itself predicated on the first one's flag.
// NOTE(review): only the sign of x and of the sum are examined; the sign of
// y is never consulted. Confirm behavior for operands of opposite sign.
p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(1));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->CMP(GEN_CONDITIONAL_L, a, GenRegister::immud(0x80000000u));
p->MOV(a, GenRegister::immud(0x80000000u));
p->MOV(b, GenRegister::immud(0));
// Where x had its sign bit clear (e == 0) and the high word of the sum
// compares >= 0x80000000, store 0x7FFFFFFF_FFFFFFFF.
p->curr.predicate = GEN_PREDICATE_NONE;
p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(0));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->CMP(GEN_CONDITIONAL_GE, a, GenRegister::immud(0x80000000u));
p->MOV(a, GenRegister::immud(0x7FFFFFFFu));
p->MOV(b, GenRegister::immud(0xFFFFFFFFu));
}
p->pop();
storeTopHalf(dst, a);
storeBottomHalf(dst, b);
}
// Emit a saturating 64-bit subtract: dst(0) = saturate(src(0) - src(1)).
// dst(1)..dst(5) are 32-bit temporaries (a..e) and dst(6) is the flag register
// used for predication. The difference is built in (a,b) = (high, low).
void GenContext::emitI64SATSUBInstruction(const SelectionInstruction &insn) {
GenRegister x = ra->genReg(insn.src(0));
GenRegister y = ra->genReg(insn.src(1));
GenRegister dst = ra->genReg(insn.dst(0));
GenRegister a = ra->genReg(insn.dst(1));
GenRegister b = ra->genReg(insn.dst(2));
GenRegister c = ra->genReg(insn.dst(3));
GenRegister d = ra->genReg(insn.dst(4));
GenRegister e = ra->genReg(insn.dst(5));
GenRegister flagReg = ra->genReg(insn.dst(6));
// (a,b) <- x, (c,d) <- y
loadTopHalf(a, x);
loadBottomHalf(b, x);
loadTopHalf(c, y);
loadBottomHalf(d, y);
// Signed only: e <- bit 31 of x's high word, taken before a is modified.
if(dst.is_signed_int())
p->SHR(e, a, GenRegister::immud(31));
// b -= low(y); d <- borrow
subWithBorrow(b, b, d);
// a -= borrow; d <- borrow out of that subtract
subWithBorrow(a, a, d);
// a -= high(y); c <- borrow out of that subtract
subWithBorrow(a, a, c);
// c <- combined borrow out of the high word
p->ADD(c, c, d);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
if(! dst.is_signed_int()) {
// Unsigned: a borrow out of the high word clamps the result to zero.
p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(a, GenRegister::immud(0));
p->MOV(b, GenRegister::immud(0));
} else {
// Signed: where x had its sign bit set (e == 1) and the high word of the
// difference compares below 0x80000000, store 0x80000000_00000000. The
// second CMP is itself predicated on the first one's flag.
// NOTE(review): only the sign of x and of the difference are examined; the
// sign of y is never consulted. Confirm behavior for operands of equal sign.
p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(1));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->CMP(GEN_CONDITIONAL_L, a, GenRegister::immud(0x80000000u));
p->MOV(a, GenRegister::immud(0x80000000u));
p->MOV(b, GenRegister::immud(0));
// Where x had its sign bit clear (e == 0) and the high word of the
// difference compares >= 0x80000000, store 0x7FFFFFFF_FFFFFFFF.
p->curr.predicate = GEN_PREDICATE_NONE;
p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(0));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->CMP(GEN_CONDITIONAL_GE, a, GenRegister::immud(0x80000000u));
p->MOV(a, GenRegister::immud(0x7FFFFFFFu));
p->MOV(b, GenRegister::immud(0xFFFFFFFFu));
}
p->pop();
storeTopHalf(dst, a);
storeBottomHalf(dst, b);
}
// Gather the top halves of the 64-bit elements of src into dest.
// The copy is unpredicated and issued as a series of MOVs (two of them, four
// when the caller's execution width is 16), each advancing dest by 4 elements
// and the top-half view of src by 8 elements.
void GenContext::loadTopHalf(GenRegister dest, GenRegister src) {
  const bool isSimd16 = (p->curr.execWidth == 16);
  const GenRegister topView = src.top_half();

  p->push();
    p->curr.predicate = GEN_PREDICATE_NONE;
    p->curr.execWidth = 8;
    p->MOV(dest, topView);
    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(topView, 8));
    if (isSimd16) {
      for (int step = 2; step < 4; ++step)
        p->MOV(GenRegister::suboffset(dest, 4 * step),
               GenRegister::suboffset(topView, 8 * step));
    }
  p->pop();
}
// Scatter the 32-bit values of src into the top halves of the 64-bit elements
// of dest. Unlike loadTopHalf() the predicate is left as the caller set it;
// nibControl / quarterControl are adjusted before each MOV after the first.
// Each MOV advances src by 4 elements and the top-half view of dest by 8.
void GenContext::storeTopHalf(GenRegister dest, GenRegister src) {
  const bool isSimd16 = (p->curr.execWidth == 16);
  const GenRegister topView = dest.top_half();

  p->push();
    p->curr.execWidth = 8;
    // First group: nib/quarter controls as inherited from the caller.
    p->MOV(topView, src);
    // Second group.
    p->curr.nibControl = 1;
    p->MOV(GenRegister::suboffset(topView, 8), GenRegister::suboffset(src, 4));
    if (isSimd16) {
      // Third and fourth groups live in the second quarter.
      p->curr.quarterControl = 1;
      p->curr.nibControl = 0;
      p->MOV(GenRegister::suboffset(topView, 16), GenRegister::suboffset(src, 8));
      p->curr.nibControl = 1;
      p->MOV(GenRegister::suboffset(topView, 24), GenRegister::suboffset(src, 12));
    }
  p->pop();
}
// Gather the bottom halves of the 64-bit elements of src into dest.
// The copy is unpredicated and issued as a series of MOVs (two of them, four
// when the caller's execution width is 16), each advancing dest by 4 elements
// and the bottom-half view of src by 8 elements.
void GenContext::loadBottomHalf(GenRegister dest, GenRegister src) {
  const bool isSimd16 = (p->curr.execWidth == 16);
  const GenRegister bottomView = src.bottom_half();

  p->push();
    p->curr.predicate = GEN_PREDICATE_NONE;
    p->curr.execWidth = 8;
    p->MOV(dest, bottomView);
    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(bottomView, 8));
    if (isSimd16) {
      for (int step = 2; step < 4; ++step)
        p->MOV(GenRegister::suboffset(dest, 4 * step),
               GenRegister::suboffset(bottomView, 8 * step));
    }
  p->pop();
}
// Scatter the 32-bit values of src into the bottom halves of the 64-bit
// elements of dest. Unlike loadBottomHalf() the predicate is left as the
// caller set it; nibControl / quarterControl are adjusted before each MOV
// after the first. Each MOV advances src by 4 elements and the bottom-half
// view of dest by 8.
void GenContext::storeBottomHalf(GenRegister dest, GenRegister src) {
  const bool isSimd16 = (p->curr.execWidth == 16);
  const GenRegister bottomView = dest.bottom_half();

  p->push();
    p->curr.execWidth = 8;
    // First group: nib/quarter controls as inherited from the caller.
    p->MOV(bottomView, src);
    // Second group.
    p->curr.nibControl = 1;
    p->MOV(GenRegister::suboffset(bottomView, 8), GenRegister::suboffset(src, 4));
    if (isSimd16) {
      // Third and fourth groups live in the second quarter.
      p->curr.quarterControl = 1;
      p->curr.nibControl = 0;
      p->MOV(GenRegister::suboffset(bottomView, 16), GenRegister::suboffset(src, 8));
      p->curr.nibControl = 1;
      p->MOV(GenRegister::suboffset(bottomView, 24), GenRegister::suboffset(src, 12));
    }
  p->pop();
}
// Emit dest = src0 + src1 with ADDC and then copy the accumulator into src1,
// so that on return src1 holds what ADDC left there (the carry-out) instead
// of its input value. Issued 8 lanes at a time; a second pass over the upper
// 8 elements is added when the caller's execution width is 16.
void GenContext::addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1) {
  const bool isSimd16 = (p->curr.execWidth == 16);
  const GenRegister carryAcc = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);

  p->push();
    p->curr.execWidth = 8;
    p->ADDC(dest, src0, src1);
    p->MOV(src1, carryAcc);
    if (isSimd16) {
      const GenRegister destUpper = GenRegister::suboffset(dest, 8);
      const GenRegister src0Upper = GenRegister::suboffset(src0, 8);
      const GenRegister src1Upper = GenRegister::suboffset(src1, 8);
      p->curr.quarterControl = 1;
      p->ADDC(destUpper, src0Upper, src1Upper);
      p->MOV(src1Upper, carryAcc);
    }
  p->pop();
}
// Emit dest = src0 - src1 with SUBB and then copy the accumulator into src1,
// so that on return src1 holds what SUBB left there (the borrow) instead of
// its input value. Issued 8 lanes at a time; a second pass over the upper 8
// elements is added when the caller's execution width is 16.
void GenContext::subWithBorrow(GenRegister dest, GenRegister src0, GenRegister src1) {
  const bool isSimd16 = (p->curr.execWidth == 16);
  const GenRegister borrowAcc = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);

  p->push();
    p->curr.execWidth = 8;
    p->SUBB(dest, src0, src1);
    p->MOV(src1, borrowAcc);
    if (isSimd16) {
      const GenRegister destUpper = GenRegister::suboffset(dest, 8);
      const GenRegister src0Upper = GenRegister::suboffset(src0, 8);
      const GenRegister src1Upper = GenRegister::suboffset(src1, 8);
      p->curr.quarterControl = 1;
      p->SUBB(destUpper, src0Upper, src1Upper);
      p->MOV(src1Upper, borrowAcc);
    }
  p->pop();
}
// Emit a 32x32 multiply of src0 by src1 producing two result words: high is
// written by MACH and low is copied from the accumulator afterwards.
// The sequence is issued 8 lanes at a time, once per 8 lanes of the caller's
// execution width, stepping all four registers by 8 elements between passes.
void GenContext::I32FullMult(GenRegister high, GenRegister low, GenRegister src0, GenRegister src1) {
  const GenRegister accUD = GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD);
  const int callerWidth = p->curr.execWidth;

  p->push();
    p->curr.execWidth = 8;
    for (int lanesLeft = callerWidth; lanesLeft > 0; lanesLeft -= 8) {
      p->MUL(accUD, src0, src1);
      // MACH is emitted with accumulator write enabled; the flag is cleared
      // again before the accumulator is read back.
      p->curr.accWrEnable = 1;
      p->MACH(high, src0, src1);
      p->curr.accWrEnable = 0;
      p->MOV(low, accUD);

      src0 = GenRegister::suboffset(src0, 8);
      src1 = GenRegister::suboffset(src1, 8);
      high = GenRegister::suboffset(high, 8);
      low = GenRegister::suboffset(low, 8);
    }
  p->pop();
}
void GenContext::emitI64MULInstruction(const SelectionInstruction &insn) {
GenRegister dest = ra->genReg(insn.dst(0));
GenRegister x = ra->genReg(insn.src(0));
GenRegister y = ra->genReg(insn.src(1));
GenRegister a = ra->genReg(insn.dst(1));
GenRegister b = ra->genReg(insn.dst(2));
GenRegister c = ra->genReg(insn.dst(3));
GenRegister d = ra->genReg(insn.dst(4));
GenRegister e = ra->genReg(insn.dst(5));
GenRegister f = ra->genReg(insn.dst(6));
a.type = b.type = c.type = d.type = e.type = f.type = GEN_TYPE_UD;
loadTopHalf(a, x);
loadBottomHalf(b, x);
loadTopHalf(c, y);
loadBottomHalf(d, y);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
I32FullMult(GenRegister::null(), e, b, c);
I32FullMult(GenRegister::null(), f, a, d);
p->ADD(e, e, f);
I32FullMult(f, a, b, d);
p->ADD(e, e, f);
p->pop();
storeTopHalf(dest, e);
storeBottomHalf(dest, a);
}
// Emit a 64-bit integer divide (SEL_OP_I64DIV) or remainder (SEL_OP_I64REM)
// of src(0) by src(1) as a 64-iteration shift-and-subtract loop.
// dst(0) is the result, dst(1)..dst(13) are 32-bit temporaries (a..m) and
// dst(14) is the flag register. Register roles:
//   (a,b)      dividend on entry, remainder on exit
//   (c,d,e,f)  128-bit divisor, starting as y in the upper 64 bits and
//              halved at the top of every iteration
//   (g,h)      quotient bit for the current iteration, starting at 2**63
//   (i,j)      quotient
//   k          non-zero when the result must be negated (signed only)
//   l          scratch
//   m          iteration counter
void GenContext::emitI64DIVREMInstruction(const SelectionInstruction &insn) {
GenRegister dest = ra->genReg(insn.dst(0));
GenRegister x = ra->genReg(insn.src(0));
GenRegister y = ra->genReg(insn.src(1));
GenRegister a = ra->genReg(insn.dst(1));
GenRegister b = ra->genReg(insn.dst(2));
GenRegister c = ra->genReg(insn.dst(3));
GenRegister d = ra->genReg(insn.dst(4));
GenRegister e = ra->genReg(insn.dst(5));
GenRegister f = ra->genReg(insn.dst(6));
GenRegister g = ra->genReg(insn.dst(7));
GenRegister h = ra->genReg(insn.dst(8));
GenRegister i = ra->genReg(insn.dst(9));
GenRegister j = ra->genReg(insn.dst(10));
GenRegister k = ra->genReg(insn.dst(11));
GenRegister l = ra->genReg(insn.dst(12));
GenRegister m = ra->genReg(insn.dst(13));
GenRegister flagReg = ra->genReg(insn.dst(14));
GenRegister zero = GenRegister::immud(0),
one = GenRegister::immud(1),
imm31 = GenRegister::immud(31);
// (a,b) <- x
loadTopHalf(a, x);
loadBottomHalf(b, x);
// (c,d) <- y
loadTopHalf(c, y);
loadBottomHalf(d, y);
// k <- sign_of_result
// Signed operands are replaced by their magnitudes. For DIV k becomes
// sign(x) ^ sign(y); for REM it stays sign(x).
if(x.is_signed_int()) {
GBE_ASSERT(y.is_signed_int());
GBE_ASSERT(dest.is_signed_int());
I64ABS(k, a, b, e, flagReg);
I64ABS(l, c, d, e, flagReg);
if(insn.opcode == SEL_OP_I64DIV)
p->XOR(k, k, l);
}
// (e,f) <- 0
p->MOV(e, zero);
p->MOV(f, zero);
// (g,h) <- 2**63
p->MOV(g, GenRegister::immud(0x80000000));
p->MOV(h, zero);
// (i,j) <- 0
p->MOV(i, zero);
p->MOV(j, zero);
// m <- 0
p->MOV(m, zero);
{
// Index of the first instruction of the loop body, used to compute the
// backward jump distance below.
uint32_t loop_start = p->n_instruction();
// (c,d,e,f) <- (c,d,e,f) / 2
p->SHR(f, f, one);
p->SHL(l, e, imm31);
p->OR(f, f, l);
p->SHR(e, e, one);
p->SHL(l, d, imm31);
p->OR(e, e, l);
p->SHR(d, d, one);
p->SHL(l, c, imm31);
p->OR(d, d, l);
p->SHR(c, c, one);
// condition <- (c,d)==0 && (a,b)>=(e,f)
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
// l <- 1 where (a,b) >= (e,f): either a == e and b >= f, or a > e.
p->MOV(l, zero);
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_EQ, a, e);
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->CMP(GEN_CONDITIONAL_GE, b, f);
p->MOV(l, one);
p->curr.predicate = GEN_PREDICATE_NONE;
p->CMP(GEN_CONDITIONAL_G, a, e);
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(l, one);
// Narrow the flag further with predicated compares on c and d.
p->curr.predicate = GEN_PREDICATE_NONE;
p->CMP(GEN_CONDITIONAL_NEQ, l, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->CMP(GEN_CONDITIONAL_EQ, c, zero);
p->CMP(GEN_CONDITIONAL_EQ, d, zero);
// under condition, (a,b) <- (a,b) - (e,f)
// l is used as a disposable copy because subWithBorrow overwrites its
// last operand with the borrow.
p->MOV(l, f);
subWithBorrow(b, b, l);
subWithBorrow(a, a, l);
p->MOV(l, e);
subWithBorrow(a, a, l);
// under condition, (i,j) <- (i,j) | (g,h)
p->OR(i, i, g);
p->OR(j, j, h);
p->pop();
// (g,h) /= 2
p->SHR(h, h, one);
p->SHL(l, g, imm31);
p->OR(h, h, l);
p->SHR(g, g, one);
// condition: m < 64
p->ADD(m, m, one);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_L, m, GenRegister::immud(64));
p->curr.predicate = GEN_PREDICATE_NORMAL;
// under condition, jump back to start point
// The jump is a single-lane, no-mask instruction predicated on "any lane
// still has the flag set" for the kernel's SIMD width.
if (simdWidth == 8)
p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
else if (simdWidth == 16)
p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
else
NOT_IMPLEMENTED;
p->curr.execWidth = 1;
p->curr.noMask = 1;
// Backward distance from the JMPI about to be emitted to loop_start.
// NOTE(review): the "+ 1" and "* 2" presumably reflect how JMPI offsets are
// counted by patchJMPI - confirm against the encoder.
int jip = -(int)(p->n_instruction() - loop_start + 1) * 2;
p->JMPI(zero);
p->patchJMPI(p->n_instruction()-1, jip);
p->pop();
// end of loop
}
// adjust sign of result
if(x.is_signed_int()) {
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NEQ, k, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
// Negate the quotient for DIV, the remainder for REM.
if(insn.opcode == SEL_OP_I64DIV)
I64Neg(i, j, l);
else
I64Neg(a, b, l);
p->pop();
}
// write dest
if(insn.opcode == SEL_OP_I64DIV) {
storeTopHalf(dest, i);
storeBottomHalf(dest, j);
} else {
GBE_ASSERT(insn.opcode == SEL_OP_I64REM);
storeTopHalf(dest, a);
storeBottomHalf(dest, b);
}
}
void GenContext::emitTernaryInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister src0 = ra->genReg(insn.src(0));
const GenRegister src1 = ra->genReg(insn.src(1));
const GenRegister src2 = ra->genReg(insn.src(2));
switch (insn.opcode) {
case SEL_OP_MAD: p->MAD(dst, src0, src1, src2); break;
default: NOT_IMPLEMENTED;
}
}
// No-op selection instructions are not supported by this emitter: reaching
// this function triggers NOT_IMPLEMENTED.
void GenContext::emitNoOpInstruction(const SelectionInstruction &insn) {
NOT_IMPLEMENTED;
}
// Emit a WAIT through the encoder; the selection instruction carries no
// operands that are used here.
void GenContext::emitWaitInstruction(const SelectionInstruction &insn) {
p->WAIT();
}
void GenContext::emitBarrierInstruction(const SelectionInstruction &insn) {
const GenRegister src = ra->genReg(insn.src(0));
p->BARRIER(src);
}
void GenContext::emitFenceInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
p->FENCE(dst);
p->MOV(dst, dst);
}
void GenContext::emitMathInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister src0 = ra->genReg(insn.src(0));
const uint32_t function = insn.extra.function;
if (insn.srcNum == 2) {
const GenRegister src1 = ra->genReg(insn.src(1));
p->MATH(dst, function, src0, src1);
} else
p->MATH(dst, function, src0);
}
void GenContext::emitCompareInstruction(const SelectionInstruction &insn) {
const GenRegister src0 = ra->genReg(insn.src(0));
const GenRegister src1 = ra->genReg(insn.src(1));
if (insn.opcode == SEL_OP_CMP)
p->CMP(insn.extra.function, src0, src1);
else {
GBE_ASSERT(insn.opcode == SEL_OP_SEL_CMP);
const GenRegister dst = ra->genReg(insn.dst(0));
p->SEL_CMP(insn.extra.function, dst, src0, src1);
}
}
void GenContext::emitAtomicInstruction(const SelectionInstruction &insn) {
const GenRegister src = ra->genReg(insn.src(0));
const GenRegister dst = ra->genReg(insn.dst(0));
const uint32_t function = insn.extra.function;
const uint32_t bti = insn.extra.elem;
p->ATOMIC(dst, function, src, bti, insn.srcNum);
}
void GenContext::emitIndirectMoveInstruction(const SelectionInstruction &insn) {
GenRegister src = ra->genReg(insn.src(0));
if(isScalarReg(src.reg()))
src = GenRegister::retype(src, GEN_TYPE_UW);
else
src = GenRegister::unpacked_uw(src.nr, src.subnr / typeSize(GEN_TYPE_UW));
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister a0 = GenRegister::addr8(0);
uint32_t simdWidth = p->curr.execWidth;
p->push();
p->curr.execWidth = 8;
p->curr.quarterControl = GEN_COMPRESSION_Q1;
p->MOV(a0, src);
p->MOV(dst, GenRegister::indirect(dst.type, 0, GEN_WIDTH_8));
p->pop();
if (simdWidth == 16) {
p->push();
p->curr.execWidth = 8;
p->curr.quarterControl = GEN_COMPRESSION_Q2;
const GenRegister nextDst = GenRegister::Qn(dst, 1);
const GenRegister nextSrc = GenRegister::Qn(src, 1);
p->MOV(a0, nextSrc);
p->MOV(nextDst, GenRegister::indirect(dst.type, 0, GEN_WIDTH_8));
p->pop();
}
}
void GenContext::emitJumpInstruction(const SelectionInstruction &insn) {
const ir::LabelIndex label(insn.index);
const GenRegister src = ra->genReg(insn.src(0));
this->branchPos2.push_back(std::make_pair(label, p->store.size()));
p->JMPI(src);
}
void GenContext::emitEotInstruction(const SelectionInstruction &insn) {
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.noMask = 1;
p->MOV(GenRegister::ud8grf(112, 0), GenRegister::ud8grf(0, 0));
p->curr.execWidth = 8;
p->EOT(112);
p->pop();
}
void GenContext::emitSpillRegInstruction(const SelectionInstruction &insn) {
uint32_t simdWidth = p->curr.execWidth;
uint32_t scratchOffset = insn.extra.scratchOffset;
const uint32_t header = insn.extra.scratchMsgHeader;
p->push();
const GenRegister msg = GenRegister::ud8grf(header, 0);
const GenRegister src = ra->genReg(insn.src(0));
GenRegister payload = src;
payload.nr = header + 1;
payload.subnr = 0;
p->MOV(payload, src);
uint32_t regType = insn.src(0).type;
uint32_t size = typeSize(regType);
assert(size <= 4);
uint32_t regNum = (stride(src.hstride)*size*simdWidth) > 32 ? 2 : 1;
this->scratchWrite(msg, scratchOffset, regNum, regType, GEN_SCRATCH_CHANNEL_MODE_DWORD);
p->pop();
}
void GenContext::emitUnSpillRegInstruction(const SelectionInstruction &insn) {
uint32_t scratchOffset = insn.extra.scratchOffset;
const GenRegister dst = insn.dst(0);
uint32_t regType = dst.type;
uint32_t simdWidth = p->curr.execWidth;
const uint32_t header = insn.extra.scratchMsgHeader;
uint32_t size = typeSize(regType);
assert(size <= 4);
uint32_t regNum = (stride(dst.hstride)*size*simdWidth) > 32 ? 2 : 1;
const GenRegister msg = GenRegister::ud8grf(header, 0);
this->scratchRead(GenRegister::retype(dst, GEN_TYPE_UD), msg, scratchOffset, regNum, regType, GEN_SCRATCH_CHANNEL_MODE_DWORD);
}
// For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and
// then follow the real destination registers.
// For SIMD16, we allocate elemNum temporary registers from dst(0).
void GenContext::emitRead64Instruction(const SelectionInstruction &insn) {
const uint32_t elemNum = insn.extra.elem;
const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
const GenRegister tempAddr = ra->genReg(insn.dst(0));
const GenRegister dst = ra->genReg(insn.dst(tmpRegSize + 1));
const GenRegister tmp = ra->genReg(insn.dst(1));
const GenRegister src = ra->genReg(insn.src(0));
const uint32_t bti = insn.extra.function;
p->READ64(dst, tmp, tempAddr, src, bti, elemNum);
}
void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister src = ra->genReg(insn.src(0));
const uint32_t bti = insn.extra.function;
const uint32_t elemNum = insn.extra.elem;
p->UNTYPED_READ(dst, src, bti, elemNum);
}
// For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and
// then follow the real destination registers.
// For SIMD16, we allocate elemNum temporary registers from dst(0).
void GenContext::emitWrite64Instruction(const SelectionInstruction &insn) {
const GenRegister src = ra->genReg(insn.dst(0));
const uint32_t elemNum = insn.extra.elem;
const GenRegister addr = ra->genReg(insn.src(0)); //tmpRegSize + 1));
const GenRegister data = ra->genReg(insn.src(1));
const uint32_t bti = insn.extra.function;
p->MOV(src, addr);
p->WRITE64(src, data, bti, elemNum, isScalarReg(data.reg()));
}
void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
const GenRegister src = ra->genReg(insn.src(0));
const uint32_t bti = insn.extra.function;
const uint32_t elemNum = insn.extra.elem;
p->UNTYPED_WRITE(src, bti, elemNum);
}
void GenContext::emitByteGatherInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister src = ra->genReg(insn.src(0));
const uint32_t bti = insn.extra.function;
const uint32_t elemSize = insn.extra.elem;
p->BYTE_GATHER(dst, src, bti, elemSize);
}
void GenContext::emitByteScatterInstruction(const SelectionInstruction &insn) {
const GenRegister src = ra->genReg(insn.src(0));
const uint32_t bti = insn.extra.function;
const uint32_t elemSize = insn.extra.elem;
p->BYTE_SCATTER(src, bti, elemSize);
}
void GenContext::emitDWordGatherInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister src = ra->genReg(insn.src(0));
const uint32_t bti = insn.extra.function;
p->DWORD_GATHER(dst, src, bti);
}
void GenContext::emitSampleInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister msgPayload = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_F);
const unsigned char bti = insn.extra.function;
const unsigned char sampler = insn.extra.elem;
const GenRegister ucoord = ra->genReg(insn.src(4));
const GenRegister vcoord = ra->genReg(insn.src(5));
const GenRegister wcoord = ra->genReg(insn.src(6));
uint32_t simdWidth = p->curr.execWidth;
uint32_t coord_cnt = 2;
p->push();
const uint32_t nr = msgPayload.nr;
// prepare mesg desc and move to a0.0.
// desc = bti | (sampler << 8) | (0 << 12) | (2 << 16) | (0 << 18) | (0 << 19) | (4 << 20) | (1 << 25) | (0 < 29) | (0 << 31)
/* Prepare message payload. */
p->MOV(GenRegister::f8grf(nr , 0), ucoord);
p->MOV(GenRegister::f8grf(nr + (simdWidth/8), 0), vcoord);
if (insn.src(6).reg() != 0) {
p->MOV(GenRegister::f8grf(nr + (simdWidth/4), 0), wcoord);
coord_cnt++;
}
p->SAMPLE(dst, msgPayload, false, bti, sampler, coord_cnt, simdWidth, -1, 0);
p->pop();
}
/*! Emit a scratch space write.
 *  \param header register used as message header; it is overwritten with r0
 *  \param offset scratch offset; it is divided by 32 before being encoded
 *  \param reg_num number of registers, forwarded to the encoder
 *  \param reg_type register type; with the SIMD width it gives the size
 *         forwarded to the encoder
 *  \param channel_mode channel mode, forwarded to the encoder
 */
void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {
  // Build the header with a non predicated, non masked SIMD8 copy of r0
  p->push();
    const uint32_t execWidth = p->curr.execWidth;
    p->curr.execWidth = 8;
    p->curr.noMask = 1;
    p->curr.predicate = GEN_PREDICATE_NONE;
    p->MOV(header, GenRegister::ud8grf(0,0));
  p->pop();

  // The message itself is sent with the caller's execution state
  const int byteNum = typeSize(reg_type)*execWidth;
  p->push();
    p->SCRATCH_WRITE(header, offset/32, byteNum, reg_num, channel_mode);
  p->pop();
}
/*! Emit a scratch space read.
 *  \param dst register receiving the data
 *  \param header register used as message header; it is overwritten with r0
 *  \param offset scratch offset; it is divided by 32 before being encoded
 *  \param reg_num number of registers, forwarded to the encoder
 *  \param reg_type register type; with the SIMD width it gives the size
 *         forwarded to the encoder
 *  \param channel_mode channel mode, forwarded to the encoder
 */
void GenContext::scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {
  // Build the header with a non predicated, non masked SIMD8 copy of r0
  p->push();
    const uint32_t execWidth = p->curr.execWidth;
    p->curr.execWidth = 8;
    p->curr.noMask = 1;
    p->curr.predicate = GEN_PREDICATE_NONE;
    p->MOV(header, GenRegister::ud8grf(0,0));
  p->pop();

  // The message itself is sent with the caller's execution state
  const int byteNum = typeSize(reg_type)*execWidth;
  p->push();
    p->SCRATCH_READ(dst, header, offset/32, byteNum, reg_num, channel_mode);
  p->pop();
}
/*! Emit a typed surface write. With e = insn.extra.elem, the sources are
 *  read as follows: src(0) is the message header, src(e) .. src(e + 2) are
 *  the u, v, w coordinates and src(e + 3) .. src(e + 6) the R, G, B, A
 *  values. insn.extra.function is the binding table index.
 *
 *  The send message is SIMD8 only, so a SIMD16 instruction is emitted as two
 *  messages, one per quarter. The payload is rebuilt in place for each
 *  quarter: header at nr, U at nr + 1, V at nr + 2, W at nr + 3 (only when a W
 *  register is provided), LOD at nr + 4 and R, G, B, A at nr + 5 .. nr + 8.
 */
void GenContext::emitTypedWriteInstruction(const SelectionInstruction &insn) {
  const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
  const GenRegister ucoord = ra->genReg(insn.src(insn.extra.elem));
  const GenRegister vcoord = ra->genReg(insn.src(1 + insn.extra.elem));
  const GenRegister wcoord = ra->genReg(insn.src(2 + insn.extra.elem));
  const GenRegister R = ra->genReg(insn.src(3 + insn.extra.elem));
  const GenRegister G = ra->genReg(insn.src(4 + insn.extra.elem));
  const GenRegister B = ra->genReg(insn.src(5 + insn.extra.elem));
  const GenRegister A = ra->genReg(insn.src(6 + insn.extra.elem));
  const unsigned char bti = insn.extra.function;

  p->push();
    uint32_t simdWidth = p->curr.execWidth;
    const uint32_t nr = header.nr;
    p->curr.predicate = GEN_PREDICATE_NONE;
    p->curr.noMask = 1;
    p->MOV(header, GenRegister::immud(0x0));
    p->curr.execWidth = 1;
    // prepare mesg desc and move to a0.0.
    // desc = bti | (msg_type << 14) | (header_present << 19))
    // prepare header, we need to enable all the 8 planes.
    p->MOV(GenRegister::ud8grf(nr, 7), GenRegister::immud(0xffff));
    p->curr.execWidth = 8;
    // Typed write only support SIMD8.
    // Prepare message payload U + V + R(ignored) + LOD(0) + RGBA.
    // Currently, we don't support non-zero lod, so we clear all lod to
    // zero for both quarters thus save one instruction here.
    // Thus we must put this instruction in noMask and no predication state.
    p->MOV(GenRegister::ud8grf(nr + 4, 0), GenRegister::immud(0)); //LOD
  p->pop();

  p->push();
    p->curr.execWidth = 8;
    // TYPED WRITE send instruction only support SIMD8, if we are SIMD16, we
    // need to call it twice.
    uint32_t quarterNum = (simdWidth == 8) ? 1 : 2;

    // Both helpers copy the current quarter of src into payload register
    // dst_nr. QUARTER_MOV0 writes the destination as UD, QUARTER_MOV1 retypes
    // the destination to the type of the source. They rely on the loop
    // variable named "quarter".
#define QUARTER_MOV0(dst_nr, src) p->MOV(GenRegister::ud8grf(dst_nr, 0), \
    GenRegister::retype(GenRegister::QnPhysical(src, quarter), src.type))
#define QUARTER_MOV1(dst_nr, src) p->MOV(GenRegister::retype(GenRegister::ud8grf(dst_nr, 0), src.type), \
    GenRegister::retype(GenRegister::QnPhysical(src,quarter), src.type))
    for( uint32_t quarter = 0; quarter < quarterNum; quarter++)
    {
      if (quarter == 1)
        p->curr.quarterControl = GEN_COMPRESSION_Q2;
      QUARTER_MOV0(nr + 1, ucoord);
      QUARTER_MOV0(nr + 2, vcoord);
      if (insn.src(2 + insn.extra.elem).reg() != 0)
        QUARTER_MOV0(nr + 3, wcoord);
      QUARTER_MOV1(nr + 5, R);
      QUARTER_MOV1(nr + 6, G);
      QUARTER_MOV1(nr + 7, B);
      QUARTER_MOV1(nr + 8, A);
      p->TYPED_WRITE(header, true, bti);
    }
    // Undefine the two helpers actually defined above so that they do not
    // leak into the rest of the translation unit
#undef QUARTER_MOV1
#undef QUARTER_MOV0
  p->pop();
}
/*! Emit the moves that load image information into the destination
 *  registers. The values are read from scalar UD registers located at the
 *  offset returned by getImageInfoCurbeOffset plus one register; consecutive
 *  values are 32 bytes apart. insn.extra.function is the binding table index
 *  and insn.extra.elem the kind of information requested.
 */
void GenContext::emitGetImageInfoInstruction(const SelectionInstruction &insn) {
  const unsigned char bti = insn.extra.function;
  const unsigned char type = insn.extra.elem;
  const uint32_t dstNum = ir::GetImageInfoInstruction::getDstNum4Type(type);

  ir::ImageInfoKey key;
  key.index = bti;
  key.type = type;

  const uint32_t base = this->getImageInfoCurbeOffset(key, dstNum * 4) + GEN_REG_SIZE;
  for (uint32_t dstID = 0; dstID != dstNum; ++dstID) {
    // Split the byte offset into a register number and a DWORD sub-register
    const uint32_t byteOffset = base + dstID * 32;
    const uint32_t nr = byteOffset / GEN_REG_SIZE;
    const uint32_t subnr = (byteOffset % GEN_REG_SIZE) / sizeof(uint32_t);
    p->MOV(ra->genReg(insn.dst(dstID)), GenRegister::ud1grf(nr, subnr));
  }
}
// Debug switches tested in emitCode below. BVAR presumably declares a boolean
// tunable whose default is the second argument -- TODO confirm in the sys
// headers.
// When set, the register allocation is printed after it has been computed
BVAR(OCL_OUTPUT_REG_ALLOC, false);
// When set, every generated instruction is disassembled to stdout
BVAR(OCL_OUTPUT_ASM, false);
/*! Run the whole backend pipeline for the current function: instruction
 *  selection, scheduling before register allocation, register allocation,
 *  scheduling after register allocation, then emission of the final
 *  instruction stream which is copied into the kernel.
 *  \return false if the register allocation failed, true otherwise
 */
bool GenContext::emitCode(void) {
// NOTE(review): the template argument of this static_cast appears to have been
// lost (GenKernel* is expected given the variable type) -- confirm against the
// original file.
GenKernel *genKernel = static_cast(this->kernel);
sel->select();
schedulePreRegAllocation(*this, *this->sel);
// Give up as soon as the allocator reports a failure
if (UNLIKELY(ra->allocate(*this->sel) == false))
return false;
schedulePostRegAllocation(*this, *this->sel);
if (OCL_OUTPUT_REG_ALLOC)
ra->outputAllocation();
this->clearFlagRegister();
this->emitStackPointer();
this->emitInstructionStream();
// Branches are patched once the complete stream has been emitted
this->patchBranches();
// Hand a private copy of the encoded instructions over to the kernel
genKernel->insnNum = p->store.size();
genKernel->insns = GBE_NEW_ARRAY_NO_ARG(GenInstruction, genKernel->insnNum);
std::memcpy(genKernel->insns, &p->store[0], genKernel->insnNum * sizeof(GenInstruction));
if (OCL_OUTPUT_ASM)
for (uint32_t insnID = 0; insnID < genKernel->insnNum; ++insnID)
gen_disasm(stdout, &p->store[insnID]);
return true;
}
/*! Allocate the kernel object filled by this context: a GenKernel created
 *  with the name of the function being compiled.
 */
Kernel *GenContext::allocateKernel(void) {
  Kernel *created = GBE_NEW(GenKernel, name);
  return created;
}
} /* namespace gbe */
Release_v0.3/backend/src/backend/gen_context.hpp 0000664 0000000 0000000 00000017000 12231421770 0021774 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file gen_context.hpp
* \author Benjamin Segovia
*/
#ifndef __GBE_GEN_CONTEXT_HPP__
#define __GBE_GEN_CONTEXT_HPP__
#include "backend/context.hpp"
#include "backend/program.h"
#include "backend/gen_register.hpp"
#include "ir/function.hpp"
#include "ir/liveness.hpp"
#include "sys/map.hpp"
#include
namespace gbe
{
class Kernel; // We build this structure
class GenEncoder; // Helps emitting Gen ISA
class GenRegAllocator; // Handle the register allocation
class Selection; // Performs instruction selection
class SelectionInstruction; // Pre-RA Gen instruction
class SelectionReg; // Pre-RA Gen register
class GenRegister;
/*! Context is the helper structure to build the Gen ISA or simulation code
* from GenIR
*/
class GenContext : public Context
{
public:
/*! Create a new context. name is the name of the function we want to
* compile
*/
GenContext(const ir::Unit &unit, const std::string &name, bool limitRegisterPressure = false);
/*! Release everything needed */
~GenContext(void);
/*! Implements base class */
virtual bool emitCode(void);
/*! Function we emit code for */
INLINE const ir::Function &getFunction(void) const { return fn; }
/*! Simd width chosen for the current function */
INLINE uint32_t getSimdWidth(void) const { return simdWidth; }
/*! Called by emitCode right before the stack pointer is emitted.
 *  NOTE(review): presumably emits code that resets the flag register(s) --
 *  confirm in gen_context.cpp
 */
void clearFlagRegister(void);
/*! Emit the per-lane stack pointer computation */
void emitStackPointer(void);
/*! Emit the instructions */
void emitInstructionStream(void);
/*! Set the correct target values for the branches */
void patchBranches(void);
/*! Forward ir::Function isSpecialReg method */
INLINE bool isSpecialReg(ir::Register reg) const {
return fn.isSpecialReg(reg);
}
/*! Get the liveOut information for the given block */
INLINE const ir::Liveness::LiveOut &getLiveOut(const ir::BasicBlock *bb) const {
return this->liveness->getLiveOut(bb);
}
/*! Helpers used to build the 64-bit integer sequences.
 *  NOTE(review): their exact semantics are not documented here and their
 *  definitions live in gen_context.cpp -- read them before relying on the
 *  names alone
 */
void collectShifter(GenRegister dest, GenRegister src);
void loadTopHalf(GenRegister dest, GenRegister src);
void storeTopHalf(GenRegister dest, GenRegister src);
void loadBottomHalf(GenRegister dest, GenRegister src);
void storeBottomHalf(GenRegister dest, GenRegister src);
void addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1);
void subWithBorrow(GenRegister dest, GenRegister src0, GenRegister src1);
void I64Neg(GenRegister high, GenRegister low, GenRegister tmp);
void I64ABS(GenRegister sign, GenRegister high, GenRegister low, GenRegister tmp, GenRegister flagReg);
void I64FullAdd(GenRegister high1, GenRegister low1, GenRegister high2, GenRegister low2);
void I32FullMult(GenRegister high, GenRegister low, GenRegister src0, GenRegister src1);
void I64FullMult(GenRegister dst1, GenRegister dst2, GenRegister dst3, GenRegister dst4, GenRegister x_high, GenRegister x_low, GenRegister y_high, GenRegister y_low);
void saveFlag(GenRegister dest, int flag, int subFlag);
void UnsignedI64ToFloat(GenRegister dst, GenRegister high, GenRegister low, GenRegister tmp);
/*! Final Gen ISA emission helper functions */
void emitLabelInstruction(const SelectionInstruction &insn);
void emitUnaryInstruction(const SelectionInstruction &insn);
void emitUnaryWithTempInstruction(const SelectionInstruction &insn);
void emitBinaryInstruction(const SelectionInstruction &insn);
void emitBinaryWithTempInstruction(const SelectionInstruction &insn);
void emitTernaryInstruction(const SelectionInstruction &insn);
void emitI64MULHIInstruction(const SelectionInstruction &insn);
void emitI64MADSATInstruction(const SelectionInstruction &insn);
void emitI64HADDInstruction(const SelectionInstruction &insn);
void emitI64RHADDInstruction(const SelectionInstruction &insn);
void emitI64ShiftInstruction(const SelectionInstruction &insn);
void emitI64CompareInstruction(const SelectionInstruction &insn);
void emitI64SATADDInstruction(const SelectionInstruction &insn);
void emitI64SATSUBInstruction(const SelectionInstruction &insn);
void emitI64ToFloatInstruction(const SelectionInstruction &insn);
void emitCompareInstruction(const SelectionInstruction &insn);
void emitJumpInstruction(const SelectionInstruction &insn);
void emitIndirectMoveInstruction(const SelectionInstruction &insn);
void emitEotInstruction(const SelectionInstruction &insn);
void emitNoOpInstruction(const SelectionInstruction &insn);
void emitWaitInstruction(const SelectionInstruction &insn);
void emitBarrierInstruction(const SelectionInstruction &insn);
void emitFenceInstruction(const SelectionInstruction &insn);
void emitMathInstruction(const SelectionInstruction &insn);
void emitRead64Instruction(const SelectionInstruction &insn);
void emitWrite64Instruction(const SelectionInstruction &insn);
void emitUntypedReadInstruction(const SelectionInstruction &insn);
void emitUntypedWriteInstruction(const SelectionInstruction &insn);
void emitAtomicInstruction(const SelectionInstruction &insn);
void emitByteGatherInstruction(const SelectionInstruction &insn);
void emitByteScatterInstruction(const SelectionInstruction &insn);
void emitDWordGatherInstruction(const SelectionInstruction &insn);
void emitSampleInstruction(const SelectionInstruction &insn);
void emitTypedWriteInstruction(const SelectionInstruction &insn);
void emitSpillRegInstruction(const SelectionInstruction &insn);
void emitUnSpillRegInstruction(const SelectionInstruction &insn);
void emitGetImageInfoInstruction(const SelectionInstruction &insn);
void emitI64MULInstruction(const SelectionInstruction &insn);
void emitI64DIVREMInstruction(const SelectionInstruction &insn);
/*! Emit a scratch space write: header is first overwritten with r0 and then
 *  used as the message header
 */
void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
/*! Emit a scratch space read into dst: header is first overwritten with r0
 *  and then used as the message header
 */
void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
/*! Implements base class */
virtual Kernel *allocateKernel(void);
/*! Store the position of each label instruction in the Gen ISA stream */
// NOTE(review): the template arguments of the two containers below appear to
// have been lost -- restore them from the original header
map labelPos;
/*! Store the Gen instructions to patch */
vector> branchPos2;
/*! Encode Gen ISA */
GenEncoder *p;
/*! Instruction selection on Gen ISA (pre-register allocation) */
Selection *sel;
/*! Perform the register allocation */
GenRegAllocator *ra;
/*! Indicate if we need to tackle a register pressure issue when
* regenerating the code
*/
bool limitRegisterPressure;
};
} /* namespace gbe */
#endif /* __GBE_GEN_CONTEXT_HPP__ */
Release_v0.3/backend/src/backend/gen_defs.hpp 0000664 0000000 0000000 00000065207 12231421770 0021245 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell
*/
#ifndef __GEN_DEFS_HPP__
#define __GEN_DEFS_HPP__
#include
/////////////////////////////////////////////////////////////////////////////
// Gen EU defines
/////////////////////////////////////////////////////////////////////////////
/* Execution Unit (EU) defines */
/* The values below are numeric encodings; they are grouped by name prefix */
/* Access (align) modes */
#define GEN_ALIGN_1 0
#define GEN_ALIGN_16 1
/* Size of one general register in bytes */
#define GEN_REG_SIZE 32
/* Addressing modes */
#define GEN_ADDRESS_DIRECT 0
#define GEN_ADDRESS_REGISTER_INDIRECT_REGISTER 1
/* Channel indices */
#define GEN_CHANNEL_X 0
#define GEN_CHANNEL_Y 1
#define GEN_CHANNEL_Z 2
#define GEN_CHANNEL_W 3
/* Quarter (Q1..Q4) and half (H1, H2) selectors; they share the same numeric
 * range
 */
#define GEN_COMPRESSION_Q1 0
#define GEN_COMPRESSION_Q2 1
#define GEN_COMPRESSION_Q3 2
#define GEN_COMPRESSION_Q4 3
#define GEN_COMPRESSION_H1 0
#define GEN_COMPRESSION_H2 2
/* Conditional modifiers; EQ and NEQ are aliases of Z and NZ */
#define GEN_CONDITIONAL_NONE 0
#define GEN_CONDITIONAL_Z 1
#define GEN_CONDITIONAL_NZ 2
#define GEN_CONDITIONAL_EQ 1 /* Z */
#define GEN_CONDITIONAL_NEQ 2 /* NZ */
#define GEN_CONDITIONAL_G 3
#define GEN_CONDITIONAL_GE 4
#define GEN_CONDITIONAL_L 5
#define GEN_CONDITIONAL_LE 6
#define GEN_CONDITIONAL_R 7
#define GEN_CONDITIONAL_O 8
#define GEN_CONDITIONAL_U 9
/* Debug control */
#define GEN_DEBUG_NONE 0
#define GEN_DEBUG_BREAKPOINT 1
/* Dependency control */
#define GEN_DEPENDENCY_NORMAL 0
#define GEN_DEPENDENCY_NOTCLEARED 1
#define GEN_DEPENDENCY_NOTCHECKED 2
#define GEN_DEPENDENCY_DISABLE 3
/* Horizontal stride encodings: the value is a code, not the stride itself
 * (a stride of 4 is encoded as 3)
 */
#define GEN_HORIZONTAL_STRIDE_0 0
#define GEN_HORIZONTAL_STRIDE_1 1
#define GEN_HORIZONTAL_STRIDE_2 2
#define GEN_HORIZONTAL_STRIDE_4 3
/* Saturation */
#define GEN_INSTRUCTION_NORMAL 0
#define GEN_INSTRUCTION_SATURATE 1
/* Mask control */
#define GEN_MASK_ENABLE 0
#define GEN_MASK_DISABLE 1
/*! Gen opcode */
/* Numeric opcode of each Gen instruction known to the backend. The values are
 * explicit and not contiguous; the largest one (126) fits in the 7-bit opcode
 * field of GenInstruction::header.
 */
enum opcode {
GEN_OPCODE_MOV = 1,
GEN_OPCODE_SEL = 2,
GEN_OPCODE_NOT = 4,
GEN_OPCODE_AND = 5,
GEN_OPCODE_OR = 6,
GEN_OPCODE_XOR = 7,
GEN_OPCODE_SHR = 8,
GEN_OPCODE_SHL = 9,
GEN_OPCODE_RSR = 10,
GEN_OPCODE_RSL = 11,
GEN_OPCODE_ASR = 12,
GEN_OPCODE_CMP = 16,
GEN_OPCODE_CMPN = 17,
GEN_OPCODE_JMPI = 32,
GEN_OPCODE_IF = 34,
GEN_OPCODE_IFF = 35,
GEN_OPCODE_ELSE = 36,
GEN_OPCODE_ENDIF = 37,
GEN_OPCODE_DO = 38,
GEN_OPCODE_WHILE = 39,
GEN_OPCODE_BREAK = 40,
GEN_OPCODE_CONTINUE = 41,
GEN_OPCODE_HALT = 42,
GEN_OPCODE_MSAVE = 44,
GEN_OPCODE_MRESTORE = 45,
GEN_OPCODE_PUSH = 46,
GEN_OPCODE_POP = 47,
GEN_OPCODE_WAIT = 48,
GEN_OPCODE_SEND = 49,
GEN_OPCODE_SENDC = 50,
GEN_OPCODE_MATH = 56,
GEN_OPCODE_ADD = 64,
GEN_OPCODE_MUL = 65,
GEN_OPCODE_AVG = 66,
GEN_OPCODE_FRC = 67,
GEN_OPCODE_RNDU = 68,
GEN_OPCODE_RNDD = 69,
GEN_OPCODE_RNDE = 70,
GEN_OPCODE_RNDZ = 71,
GEN_OPCODE_MAC = 72,
GEN_OPCODE_MACH = 73,
GEN_OPCODE_LZD = 74,
GEN_OPCODE_FBH = 75,
GEN_OPCODE_FBL = 76,
GEN_OPCODE_ADDC = 78,
GEN_OPCODE_SUBB = 79,
GEN_OPCODE_SAD2 = 80,
GEN_OPCODE_SADA2 = 81,
GEN_OPCODE_DP4 = 84,
GEN_OPCODE_DPH = 85,
GEN_OPCODE_DP3 = 86,
GEN_OPCODE_DP2 = 87,
GEN_OPCODE_DPA2 = 88,
GEN_OPCODE_LINE = 89,
GEN_OPCODE_PLN = 90,
GEN_OPCODE_MAD = 91,
GEN_OPCODE_NOP = 126,
};
/* SIMD mode selectors for atomic messages */
#define GEN_ATOMIC_SIMD16 0
#define GEN_ATOMIC_SIMD8 1
/*! Atomic operation codes. The values are explicit and contiguous from 0 to
 *  15.
 */
enum GenAtomicOpCode {
GEN_ATOMIC_OP_CMPWR8B = 0,
GEN_ATOMIC_OP_AND = 1,
GEN_ATOMIC_OP_OR = 2,
GEN_ATOMIC_OP_XOR = 3,
GEN_ATOMIC_OP_MOV = 4,
GEN_ATOMIC_OP_INC = 5,
GEN_ATOMIC_OP_DEC = 6,
GEN_ATOMIC_OP_ADD = 7,
GEN_ATOMIC_OP_SUB = 8,
GEN_ATOMIC_OP_REVSUB = 9,
GEN_ATOMIC_OP_IMAX = 10,
GEN_ATOMIC_OP_IMIN = 11,
GEN_ATOMIC_OP_UMAX = 12,
GEN_ATOMIC_OP_UMIN = 13,
GEN_ATOMIC_OP_CMPWR = 14,
GEN_ATOMIC_OP_PREDEC = 15
};
/*! Gen SFID */
/* Identifiers of the message targets (shared functions). Note that the GEN6
 * sampler cache and render cache entries reuse the values 4 and 5 of the
 * generic data port read and write entries.
 */
enum GenMessageTarget {
GEN_SFID_NULL = 0,
GEN_SFID_MATH = 1,
GEN_SFID_SAMPLER = 2,
GEN_SFID_MESSAGE_GATEWAY = 3,
GEN_SFID_DATAPORT_READ = 4,
GEN_SFID_DATAPORT_WRITE = 5,
GEN_SFID_URB = 6,
GEN_SFID_THREAD_SPAWNER = 7,
GEN6_SFID_DATAPORT_SAMPLER_CACHE = 4,
GEN6_SFID_DATAPORT_RENDER_CACHE = 5,
GEN6_SFID_DATAPORT_CONSTANT_CACHE = 9,
GEN_SFID_DATAPORT_DATA_CACHE = 10,
};
/* Predicate control values. The ALIGN1 and ALIGN16 families overlap
 * numerically (both start at 2).
 */
#define GEN_PREDICATE_NONE 0
#define GEN_PREDICATE_NORMAL 1
#define GEN_PREDICATE_ALIGN1_ANYV 2
#define GEN_PREDICATE_ALIGN1_ALLV 3
#define GEN_PREDICATE_ALIGN1_ANY2H 4
#define GEN_PREDICATE_ALIGN1_ALL2H 5
#define GEN_PREDICATE_ALIGN1_ANY4H 6
#define GEN_PREDICATE_ALIGN1_ALL4H 7
#define GEN_PREDICATE_ALIGN1_ANY8H 8
#define GEN_PREDICATE_ALIGN1_ALL8H 9
#define GEN_PREDICATE_ALIGN1_ANY16H 10
#define GEN_PREDICATE_ALIGN1_ALL16H 11
#define GEN_PREDICATE_ALIGN16_REPLICATE_X 2
#define GEN_PREDICATE_ALIGN16_REPLICATE_Y 3
#define GEN_PREDICATE_ALIGN16_REPLICATE_Z 4
#define GEN_PREDICATE_ALIGN16_REPLICATE_W 5
#define GEN_PREDICATE_ALIGN16_ANY4H 6
#define GEN_PREDICATE_ALIGN16_ALL4H 7
/* Register files */
#define GEN_ARCHITECTURE_REGISTER_FILE 0
#define GEN_GENERAL_REGISTER_FILE 1
#define GEN_IMMEDIATE_VALUE 3
/* Register / immediate types. Several names share one value (B and VF are
 * both 5; HF, V and DF are all 6).
 */
#define GEN_TYPE_UD 0
#define GEN_TYPE_D 1
#define GEN_TYPE_UW 2
#define GEN_TYPE_W 3
#define GEN_TYPE_UB 4
#define GEN_TYPE_B 5
#define GEN_TYPE_VF 5 /* packed float vector, immediates only? */
#define GEN_TYPE_HF 6
#define GEN_TYPE_V 6 /* packed int vector, immediates only, uword dest only */
#define GEN_TYPE_DF 6
#define GEN_TYPE_F 7
#define GEN_TYPE_UL 8
#define GEN_TYPE_L 9
/* Architecture register file (ARF) register numbers */
#define GEN_ARF_NULL 0x00
#define GEN_ARF_ADDRESS 0x10
#define GEN_ARF_ACCUMULATOR 0x20
#define GEN_ARF_FLAG 0x30
#define GEN_ARF_MASK 0x40
#define GEN_ARF_MASK_STACK 0x50
#define GEN_ARF_MASK_STACK_DEPTH 0x60
#define GEN_ARF_STATE 0x70
#define GEN_ARF_CONTROL 0x80
#define GEN_ARF_NOTIFICATION_COUNT 0x90
#define GEN_ARF_IP 0xA0
#define GEN_MRF_COMPR4 (1 << 7)
/* Mask selectors */
#define GEN_AMASK 0
#define GEN_IMASK 1
#define GEN_LMASK 2
#define GEN_CMASK 3
/* Thread control */
#define GEN_THREAD_NORMAL 0
#define GEN_THREAD_ATOMIC 1
#define GEN_THREAD_SWITCH 2
/* Vertical stride encodings: the value is a code, not the stride itself */
#define GEN_VERTICAL_STRIDE_0 0
#define GEN_VERTICAL_STRIDE_1 1
#define GEN_VERTICAL_STRIDE_2 2
#define GEN_VERTICAL_STRIDE_4 3
#define GEN_VERTICAL_STRIDE_8 4
#define GEN_VERTICAL_STRIDE_16 5
#define GEN_VERTICAL_STRIDE_32 6
#define GEN_VERTICAL_STRIDE_64 7
#define GEN_VERTICAL_STRIDE_128 8
#define GEN_VERTICAL_STRIDE_256 9
#define GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL 0xF
/* Execution width */
/* The value is a code, not the width itself (a width of 8 is encoded as 3) */
#define GEN_WIDTH_1 0
#define GEN_WIDTH_2 1
#define GEN_WIDTH_4 2
#define GEN_WIDTH_8 3
#define GEN_WIDTH_16 4
#define GEN_WIDTH_32 5
/* Channels to enable for the untyped reads and writes */
/* One bit per channel, so the values can be or'ed together */
#define GEN_UNTYPED_RED (1 << 0)
#define GEN_UNTYPED_GREEN (1 << 1)
#define GEN_UNTYPED_BLUE (1 << 2)
#define GEN_UNTYPED_ALPHA (1 << 3)
/* SIMD mode for untyped reads and writes */
#define GEN_UNTYPED_SIMD4x2 0
#define GEN_UNTYPED_SIMD16 1
#define GEN_UNTYPED_SIMD8 2
/* SIMD mode for byte scatters / gathers */
#define GEN_BYTE_SCATTER_SIMD8 0
#define GEN_BYTE_SCATTER_SIMD16 1
/* Data port message type*/
/* The trailing comment of each line gives the 4-bit pattern of the value */
#define GEN_OBLOCK_READ 0 //0000: OWord Block Read
#define GEN_UNALIGNED_OBLOCK_READ 1 //0001: Unaligned OWord Block Read
#define GEN_ODBLOCK_READ 2 //0010: OWord Dual Block Read
#define GEN_DWORD_GATHER 3 //0011: DWord Scattered Read
#define GEN_BYTE_GATHER 4 //0100: Byte Scattered Read
#define GEN_UNTYPED_READ 5 //0101: Untyped Surface Read
#define GEN_UNTYPED_ATOMIC_READ 6 //0110: Untyped Atomic Operation
#define GEN_MEMORY_FENCE 7 //0111: Memory Fence
#define GEN_OBLOCK_WRITE 8 //1000: OWord Block Write
#define GEN_ODBLOCK_WRITE 10//1010: OWord Dual Block Write
#define GEN_DWORD_SCATTER 11//1011: DWord Scattered Write
#define GEN_BYTE_SCATTER 12//1100: Byte Scattered Write
#define GEN_UNTYPED_WRITE 13//1101: Untyped Surface Write
/* Data port data cache scratch messages*/
#define GEN_SCRATCH_READ 0
#define GEN_SCRATCH_WRITE 1
/* Channel mode of a scratch message; the spill / unspill code in
 * gen_context.cpp uses the DWORD mode
 */
#define GEN_SCRATCH_CHANNEL_MODE_OWORD 0
#define GEN_SCRATCH_CHANNEL_MODE_DWORD 1
/* Block size encodings: the value is a code, not the size itself (a block
 * size of 4 is encoded as 3)
 */
#define GEN_SCRATCH_BLOCK_SIZE_1 0
#define GEN_SCRATCH_BLOCK_SIZE_2 1
#define GEN_SCRATCH_BLOCK_SIZE_4 3
/* Data port render cache Message Type*/
#define GEN_MBLOCK_READ 4 //0100: Media Block Read
#define GEN_TYPED_READ 5 //0101: Typed Surface Read
#define GEN_TYPED_ATOMIC 6 //0110: Typed Atomic Operation
#define GEN_MEM_FENCE 7 //0111: Memory Fence
#define GEN_MBLOCK_WRITE 10 //1010: Media Block Write
#define GEN_RENDER_WRITE 12 //1100: Render Target Write
#define GEN_TYPED_WRITE 13 //1101: Typed Surface Write
/* For byte scatters and gathers, the element to write */
#define GEN_BYTE_SCATTER_BYTE 0
#define GEN_BYTE_SCATTER_WORD 1
#define GEN_BYTE_SCATTER_DWORD 2
#define GEN_BYTE_SCATTER_QWORD 3
/* dword scattered rw */
#define GEN_DWORD_SCATTER_8_DWORDS 2
#define GEN_DWORD_SCATTER_16_DWORDS 3
/* Sampler return formats */
#define GEN_SAMPLER_RETURN_FORMAT_FLOAT32 0
#define GEN_SAMPLER_RETURN_FORMAT_UINT32 2
#define GEN_SAMPLER_RETURN_FORMAT_SINT32 3
/* Sampler message types. Many names share one value, so a value is only
 * meaningful together with the SIMD mode named in the identifier.
 */
#define GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE 0
#define GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE 0
#define GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS 0
#define GEN_SAMPLER_MESSAGE_SIMD8_KILLPIX 1
#define GEN_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD 1
#define GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD 1
#define GEN_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS 2
#define GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS 2
#define GEN_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE 0
#define GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE 2
#define GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE 0
#define GEN_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE 1
#define GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE 1
#define GEN_SAMPLER_MESSAGE_SIMD4X2_RESINFO 2
#define GEN_SAMPLER_MESSAGE_SIMD16_RESINFO 2
#define GEN_SAMPLER_MESSAGE_SIMD4X2_LD 3
#define GEN_SAMPLER_MESSAGE_SIMD8_LD 3
#define GEN_SAMPLER_MESSAGE_SIMD16_LD 3
/* GEN5 sampler message types; these values are distinct from one another */
#define GEN5_SAMPLER_MESSAGE_SAMPLE 0
#define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS 1
#define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD 2
#define GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE 3
#define GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS 4
#define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE 5
#define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE 6
#define GEN5_SAMPLER_MESSAGE_SAMPLE_LD 7
#define GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO 10
/* for GEN5 only */
#define GEN_SAMPLER_SIMD_MODE_SIMD4X2 0
#define GEN_SAMPLER_SIMD_MODE_SIMD8 1
#define GEN_SAMPLER_SIMD_MODE_SIMD16 2
#define GEN_SAMPLER_SIMD_MODE_SIMD32_64 3
/* Math function selectors; the value 8 is not assigned in this list */
#define GEN_MATH_FUNCTION_INV 1
#define GEN_MATH_FUNCTION_LOG 2
#define GEN_MATH_FUNCTION_EXP 3
#define GEN_MATH_FUNCTION_SQRT 4
#define GEN_MATH_FUNCTION_RSQ 5
#define GEN_MATH_FUNCTION_SIN 6 /* was 7 */
#define GEN_MATH_FUNCTION_COS 7 /* was 8 */
#define GEN_MATH_FUNCTION_FDIV 9 /* gen6+ */
#define GEN_MATH_FUNCTION_POW 10
#define GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER 11
#define GEN_MATH_FUNCTION_INT_DIV_QUOTIENT 12
#define GEN_MATH_FUNCTION_INT_DIV_REMAINDER 13
/* Math message modifiers */
#define GEN_MATH_INTEGER_UNSIGNED 0
#define GEN_MATH_INTEGER_SIGNED 1
#define GEN_MATH_PRECISION_FULL 0
#define GEN_MATH_PRECISION_PARTIAL 1
#define GEN_MATH_SATURATE_NONE 0
#define GEN_MATH_SATURATE_SATURATE 1
#define GEN_MATH_DATA_VECTOR 0
#define GEN_MATH_DATA_SCALAR 1
/* URB dereference control */
#define GEN_DEREFERENCE_URB 0
#define GEN_DO_NOT_DEREFERENCE_URB 1
#define GEN_MAX_NUM_BUFFER_ENTRIES (1 << 27)
/* Message gateway */
/* Sub-function identifiers of the message gateway. They are written as plain
 * integer constants because the 0b binary literal prefix is a compiler
 * extension before C++14; the 3-bit pattern of each value is kept in the
 * trailing comment. They fit the 3-bit sub_function_id field of
 * GenInstruction's msg_gateway descriptor.
 */
#define GEN_OPEN_GATEWAY 0 /* 000 */
#define GEN_CLOSE_GATEWAY 1 /* 001 */
#define GEN_FORWARD_MSG 2 /* 010 */
#define GEN_GET_TIME_STAMP 3 /* 011 */
#define GEN_BARRIER_MSG 4 /* 100 */
#define GEN_UPDATE_GATEWAT_STATE 5 /* 101 */
#define GEN_MMIO_READ_WRITE 6 /* 110 */
/////////////////////////////////////////////////////////////////////////////
// Gen EU structures
/////////////////////////////////////////////////////////////////////////////
/** Number of general purpose registers (VS, WM, etc) */
/* With GEN_REG_SIZE bytes per register this amounts to 128 * 32 = 4096 bytes */
#define GEN_MAX_GRF 128
/* Instruction format for the execution units */
struct GenInstruction
{
struct {
uint32_t opcode:7;
uint32_t pad:1;
uint32_t access_mode:1;
uint32_t mask_control:1;
uint32_t dependency_control:2;
uint32_t quarter_control:2;
uint32_t thread_control:2;
uint32_t predicate_control:4;
uint32_t predicate_inverse:1;
uint32_t execution_size:3;
uint32_t destreg_or_condmod:4;
uint32_t acc_wr_control:1;
uint32_t cmpt_control:1;
uint32_t debug_control:1;
uint32_t saturate:1;
} header;
union {
struct {
uint32_t dest_reg_file:2;
uint32_t dest_reg_type:3;
uint32_t src0_reg_file:2;
uint32_t src0_reg_type:3;
uint32_t src1_reg_file:2;
uint32_t src1_reg_type:3;
uint32_t nib_ctrl:1;
uint32_t dest_subreg_nr:5;
uint32_t dest_reg_nr:8;
uint32_t dest_horiz_stride:2;
uint32_t dest_address_mode:1;
} da1;
struct {
uint32_t dest_reg_file:2;
uint32_t dest_reg_type:3;
uint32_t src0_reg_file:2;
uint32_t src0_reg_type:3;
uint32_t src1_reg_file:2; /* 0x00000c00 */
uint32_t src1_reg_type:3; /* 0x00007000 */
uint32_t nib_ctrl:1;
int dest_indirect_offset:10; /* offset against the deref'd address reg */
uint32_t dest_subreg_nr:3; /* subnr for the address reg a0.x */
uint32_t dest_horiz_stride:2;
uint32_t dest_address_mode:1;
} ia1;
struct {
uint32_t dest_reg_file:2;
uint32_t dest_reg_type:3;
uint32_t src0_reg_file:2;
uint32_t src0_reg_type:3;
uint32_t src1_reg_file:2;
uint32_t src1_reg_type:3;
uint32_t nib_ctrl:1;
uint32_t dest_writemask:4;
uint32_t dest_subreg_nr:1;
uint32_t dest_reg_nr:8;
uint32_t dest_horiz_stride:2;
uint32_t dest_address_mode:1;
} da16;
struct {
uint32_t dest_reg_file:2;
uint32_t dest_reg_type:3;
uint32_t src0_reg_file:2;
uint32_t src0_reg_type:3;
uint32_t nib_ctrl:1;
uint32_t dest_writemask:4;
int dest_indirect_offset:6;
uint32_t dest_subreg_nr:3;
uint32_t dest_horiz_stride:2;
uint32_t dest_address_mode:1;
} ia16;
struct {
uint32_t dest_reg_file:2;
uint32_t dest_reg_type:3;
uint32_t src0_reg_file:2;
uint32_t src0_reg_type:3;
uint32_t src1_reg_file:2;
uint32_t src1_reg_type:3;
uint32_t pad:1;
int jump_count:16;
} branch_gen6;
struct {
uint32_t dest_reg_file:1;
uint32_t flag_subreg_num:1;
uint32_t pad0:2;
uint32_t src0_abs:1;
uint32_t src0_negate:1;
uint32_t src1_abs:1;
uint32_t src1_negate:1;
uint32_t src2_abs:1;
uint32_t src2_negate:1;
uint32_t pad1:7;
uint32_t dest_writemask:4;
uint32_t dest_subreg_nr:3;
uint32_t dest_reg_nr:8;
} da3src;
} bits1;
union {
struct {
uint32_t src0_subreg_nr:5;
uint32_t src0_reg_nr:8;
uint32_t src0_abs:1;
uint32_t src0_negate:1;
uint32_t src0_address_mode:1;
uint32_t src0_horiz_stride:2;
uint32_t src0_width:3;
uint32_t src0_vert_stride:4;
uint32_t flag_sub_reg_nr:1;
uint32_t flag_reg_nr:1;
uint32_t pad:5;
} da1;
struct {
int src0_indirect_offset:10;
uint32_t src0_subreg_nr:3;
uint32_t src0_abs:1;
uint32_t src0_negate:1;
uint32_t src0_address_mode:1;
uint32_t src0_horiz_stride:2;
uint32_t src0_width:3;
uint32_t src0_vert_stride:4;
uint32_t flag_sub_reg_nr:1;
uint32_t flag_reg_nr:1;
uint32_t pad:5;
} ia1;
struct {
uint32_t src0_swz_x:2;
uint32_t src0_swz_y:2;
uint32_t src0_subreg_nr:1;
uint32_t src0_reg_nr:8;
uint32_t src0_abs:1;
uint32_t src0_negate:1;
uint32_t src0_address_mode:1;
uint32_t src0_swz_z:2;
uint32_t src0_swz_w:2;
uint32_t pad0:1;
uint32_t src0_vert_stride:4;
uint32_t flag_sub_reg_nr:1;
uint32_t flag_reg_nr:1;
uint32_t pad:5;
} da16;
struct {
uint32_t src0_swz_x:2;
uint32_t src0_swz_y:2;
int src0_indirect_offset:6;
uint32_t src0_subreg_nr:3;
uint32_t src0_abs:1;
uint32_t src0_negate:1;
uint32_t src0_address_mode:1;
uint32_t src0_swz_z:2;
uint32_t src0_swz_w:2;
uint32_t pad0:1;
uint32_t src0_vert_stride:4;
uint32_t flag_sub_reg_nr:1;
uint32_t flag_reg_nr:1;
uint32_t pad:5;
} ia16;
struct {
uint32_t src0_rep_ctrl:1;
uint32_t src0_swizzle:8;
uint32_t src0_subreg_nr:3;
uint32_t src0_reg_nr:8;
uint32_t pad0:1;
uint32_t src1_rep_ctrl:1;
uint32_t src1_swizzle:8;
uint32_t src1_subreg_nr_low:2;
} da3src;
} bits2;
union {
struct {
uint32_t src1_subreg_nr:5;
uint32_t src1_reg_nr:8;
uint32_t src1_abs:1;
uint32_t src1_negate:1;
uint32_t src1_address_mode:1;
uint32_t src1_horiz_stride:2;
uint32_t src1_width:3;
uint32_t src1_vert_stride:4;
uint32_t pad0:7;
} da1;
struct {
uint32_t src1_swz_x:2;
uint32_t src1_swz_y:2;
uint32_t src1_subreg_nr:1;
uint32_t src1_reg_nr:8;
uint32_t src1_abs:1;
uint32_t src1_negate:1;
uint32_t src1_address_mode:1;
uint32_t src1_swz_z:2;
uint32_t src1_swz_w:2;
uint32_t pad1:1;
uint32_t src1_vert_stride:4;
uint32_t pad2:7;
} da16;
struct {
int src1_indirect_offset:10;
uint32_t src1_subreg_nr:3;
uint32_t src1_abs:1;
uint32_t src1_negate:1;
uint32_t src1_address_mode:1;
uint32_t src1_horiz_stride:2;
uint32_t src1_width:3;
uint32_t src1_vert_stride:4;
uint32_t pad1:7;
} ia1;
struct {
uint32_t src1_swz_x:2;
uint32_t src1_swz_y:2;
int src1_indirect_offset:6;
uint32_t src1_subreg_nr:3;
uint32_t src1_abs:1;
uint32_t src1_negate:1;
uint32_t pad0:1;
uint32_t src1_swz_z:2;
uint32_t src1_swz_w:2;
uint32_t pad1:1;
uint32_t src1_vert_stride:4;
uint32_t pad2:7;
} ia16;
struct {
uint32_t function_control:19;
uint32_t header_present:1;
uint32_t response_length:5;
uint32_t msg_length:4;
uint32_t pad1:2;
uint32_t end_of_thread:1;
} generic_gen5;
struct {
uint32_t sub_function_id:3;
uint32_t pad0:11;
uint32_t ack_req:1;
uint32_t notify:2;
uint32_t pad1:2;
uint32_t header:1;
uint32_t response_length:5;
uint32_t msg_length:4;
uint32_t pad2:2;
uint32_t end_of_thread:1;
} msg_gateway;
struct {
uint32_t opcode:1;
uint32_t request:1;
uint32_t pad0:2;
uint32_t resource:1;
uint32_t pad1:14;
uint32_t header:1;
uint32_t response_length:5;
uint32_t msg_length:4;
uint32_t pad2:2;
uint32_t end_of_thread:1;
} spawner_gen5;
/** Ironlake PRM, Volume 4 Part 1, Section 6.1.1.1 */
struct {
uint32_t function:4;
uint32_t int_type:1;
uint32_t precision:1;
uint32_t saturate:1;
uint32_t data_type:1;
uint32_t snapshot:1;
uint32_t pad0:10;
uint32_t header_present:1;
uint32_t response_length:5;
uint32_t msg_length:4;
uint32_t pad1:2;
uint32_t end_of_thread:1;
} math_gen5;
struct {
uint32_t bti:8;
uint32_t sampler:4;
uint32_t msg_type:5;
uint32_t simd_mode:2;
uint32_t header_present:1;
uint32_t response_length:5;
uint32_t msg_length:4;
uint32_t pad1:2;
uint32_t end_of_thread:1;
} sampler_gen7;
/**
* Message for the Sandybridge Sampler Cache or Constant Cache Data Port.
*
* See the Sandybridge PRM, Volume 4 Part 1, Section 3.9.2.1.1.
**/
struct {
uint32_t bti:8;
uint32_t msg_control:5;
uint32_t msg_type:3;
uint32_t pad0:3;
uint32_t header_present:1;
uint32_t response_length:5;
uint32_t msg_length:4;
uint32_t pad1:2;
uint32_t end_of_thread:1;
} gen6_dp_sampler_const_cache;
/*! Data port untyped read / write messages */
struct {
uint32_t bti:8;
uint32_t rgba:4;
uint32_t simd_mode:2;
uint32_t msg_type:4;
uint32_t category:1;
uint32_t header_present:1;
uint32_t response_length:5;
uint32_t msg_length:4;
uint32_t pad2:2;
uint32_t end_of_thread:1;
} gen7_untyped_rw;
/*! Data port byte scatter / gather */
struct {
uint32_t bti:8;
uint32_t simd_mode:1;
uint32_t ignored0:1;
uint32_t data_size:2;
uint32_t ignored1:2;
uint32_t msg_type:4;
uint32_t category:1;
uint32_t header_present:1;
uint32_t response_length:5;
uint32_t msg_length:4;
uint32_t pad2:2;
uint32_t end_of_thread:1;
} gen7_byte_rw;
/*! Data port Scratch Read/ write */
struct {
uint32_t offset:12;
uint32_t block_size:2;
uint32_t ignored0:1;
uint32_t invalidate_after_read:1;
uint32_t channel_mode:1;
uint32_t msg_type:1;
uint32_t category:1;
uint32_t header_present:1;
uint32_t response_length:5;
uint32_t msg_length:4;
uint32_t pad2:2;
uint32_t end_of_thread:1;
} gen7_scratch_rw;
/*! Data port OBlock read / write */
struct {
uint32_t bti:8;
uint32_t block_size:3;
uint32_t ignored:2;
uint32_t invalidate_after_read:1;
uint32_t msg_type:4;
uint32_t category:1;
uint32_t header_present:1;
uint32_t response_length:5;
uint32_t msg_length:4;
uint32_t pad2:2;
uint32_t end_of_thread:1;
} gen7_oblock_rw;
/*! Data port dword scatter / gather */
struct {
uint32_t bti:8;
uint32_t block_size:2;
uint32_t ignored0:3;
uint32_t invalidate_after_read:1;
uint32_t msg_type:4;
uint32_t ignored1:1;
uint32_t header_present:1;
uint32_t response_length:5;
uint32_t msg_length:4;
uint32_t pad2:2;
uint32_t end_of_thread:1;
} gen7_dword_rw;
/*! Data port typed read / write messages */
struct {
uint32_t bti:8;
uint32_t chan_mask:4;
uint32_t pad:1;
uint32_t slot:1;
uint32_t msg_type:4;
uint32_t pad2:1;
uint32_t header_present:1;
uint32_t response_length:5;
uint32_t msg_length:4;
uint32_t pad3:2;
uint32_t end_of_thread:1;
} gen7_typed_rw;
/*! Memory fence */
struct {
uint32_t bti:8;
uint32_t pad:5;
uint32_t commit_enable:1;
uint32_t msg_type:4;
uint32_t pad2:1;
uint32_t header_present:1;
uint32_t response_length:5;
uint32_t msg_length:4;
uint32_t pad3:2;
uint32_t end_of_thread:1;
} gen7_memory_fence;
/*! atomic messages */
struct {
uint32_t bti:8;
uint32_t aop_type:4;
uint32_t simd_mode:1;
uint32_t return_data:1;
uint32_t msg_type:4;
uint32_t category:1;
uint32_t header_present:1;
uint32_t response_length:5;
uint32_t msg_length:4;
uint32_t pad3:2;
uint32_t end_of_thread:1;
} gen7_atomic_op;
struct {
uint32_t src1_subreg_nr_high:1;
uint32_t src1_reg_nr:8;
uint32_t pad0:1;
uint32_t src2_rep_ctrl:1;
uint32_t src2_swizzle:8;
uint32_t src2_subreg_nr:3;
uint32_t src2_reg_nr:8;
uint32_t pad1:2;
} da3src;
/*! Message gateway */
struct {
uint32_t subfunc:3;
uint32_t pad:11;
uint32_t ackreq:1;
uint32_t notify:2;
uint32_t pad2:2;
uint32_t header_present:1;
uint32_t response_length:5;
uint32_t msg_length:4;
uint32_t pad3:2;
uint32_t end_of_thread:1;
} gen7_msg_gw;
int d;
uint32_t ud;
float f;
} bits3;
};
#endif /* __GEN_DEFS_HPP__ */
Release_v0.3/backend/src/backend/gen_encoder.cpp 0000664 0000000 0000000 00000136416 12231421770 0021737 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell
*/
#include "backend/gen_encoder.hpp"
#include
namespace gbe
{
//////////////////////////////////////////////////////////////////////////
// Some helper functions to encode
//////////////////////////////////////////////////////////////////////////
/*! True when the register holds byte elements (GEN_TYPE_UB or GEN_TYPE_B)
 *  and its horizontal stride is not GEN_HORIZONTAL_STRIDE_0.
 */
INLINE bool isVectorOfBytes(GenRegister reg) {
  const bool byteTyped = (reg.type == GEN_TYPE_UB) || (reg.type == GEN_TYPE_B);
  const bool strided = (reg.hstride != GEN_HORIZONTAL_STRIDE_0);
  return strided && byteTyped;
}
/*! Tell whether a one-source ALU instruction must be emitted as two halves:
 *  only when the current execution width is 16 and either the destination
 *  or the source is a vector of bytes.
 */
INLINE bool needToSplitAlu1(GenEncoder *p, GenRegister dst, GenRegister src) {
  const bool simd16 = (p->curr.execWidth == 16);
  return simd16 && (isVectorOfBytes(dst) || isVectorOfBytes(src));
}
/*! Tell whether a two-source ALU instruction must be emitted as two halves:
 *  only when the current execution width is 16 and any of the three
 *  operands is a vector of bytes.
 */
INLINE bool needToSplitAlu2(GenEncoder *p, GenRegister dst, GenRegister src0, GenRegister src1) {
  const bool simd16 = (p->curr.execWidth == 16);
  return simd16 &&
         (isVectorOfBytes(dst) || isVectorOfBytes(src0) || isVectorOfBytes(src1));
}
/*! Tell whether a compare must be emitted as two halves. This is the case
 *  at execution width 16 when either source is a vector of bytes or has a
 *  D, UD or F type.
 */
INLINE bool needToSplitCmp(GenEncoder *p, GenRegister src0, GenRegister src1) {
  if (p->curr.execWidth != 16)
    return false;
  const bool anyByteVector = isVectorOfBytes(src0) || isVectorOfBytes(src1);
  const bool src0Is32Bit =
    src0.type == GEN_TYPE_D || src0.type == GEN_TYPE_UD || src0.type == GEN_TYPE_F;
  const bool src1Is32Bit =
    src1.type == GEN_TYPE_D || src1.type == GEN_TYPE_UD || src1.type == GEN_TYPE_F;
  return anyByteVector || src0Is32Bit || src1Is32Bit;
}
/*! Write the descriptor fields common to all SEND messages.
 *  \param p encoder used to program src1
 *  \param inst instruction being filled
 *  \param sfid message target, stored in header.destreg_or_condmod
 *  \param msg_length value for the descriptor msg_length field
 *  \param response_length value for the descriptor response_length field
 *  \param header_present value for the descriptor header_present bit
 *  \param end_of_thread value for the descriptor end_of_thread bit
 */
static void setMessageDescriptor(GenEncoder *p,
                                 GenInstruction *inst,
                                 enum GenMessageTarget sfid,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 bool header_present = false,
                                 bool end_of_thread = false)
{
  // Has to run first: programming an immediate src1 rewrites all of bits3
  p->setSrc1(inst, GenRegister::immd(0));
  inst->header.destreg_or_condmod = sfid;
  inst->bits3.generic_gen5.msg_length = msg_length;
  inst->bits3.generic_gen5.response_length = response_length;
  inst->bits3.generic_gen5.header_present = header_present;
  inst->bits3.generic_gen5.end_of_thread = end_of_thread;
}
/*! Build the descriptor of a data port untyped read / write message.
 *  The target is the data cache data port; the SIMD mode is derived from
 *  the encoder's current execution width (8 or 16 only).
 */
static void setDPUntypedRW(GenEncoder *p,
                           GenInstruction *insn,
                           uint32_t bti,
                           uint32_t rgba,
                           uint32_t msg_type,
                           uint32_t msg_length,
                           uint32_t response_length)
{
  setMessageDescriptor(p, insn, GEN_SFID_DATAPORT_DATA_CACHE, msg_length, response_length);
  insn->bits3.gen7_untyped_rw.bti = bti;
  insn->bits3.gen7_untyped_rw.rgba = rgba;
  insn->bits3.gen7_untyped_rw.msg_type = msg_type;
  switch (p->curr.execWidth) {
    case 8:
      insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD8;
      break;
    case 16:
      insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD16;
      break;
    default:
      NOT_SUPPORTED;
  }
}
/*! Build the descriptor of a data port byte scatter / gather message.
 *  The target is the data cache data port; elem_size goes into the
 *  data_size field and the SIMD mode follows the current execution width
 *  (8 or 16 only).
 */
static void setDPByteScatterGather(GenEncoder *p,
                                   GenInstruction *insn,
                                   uint32_t bti,
                                   uint32_t elem_size,
                                   uint32_t msg_type,
                                   uint32_t msg_length,
                                   uint32_t response_length)
{
  setMessageDescriptor(p, insn, GEN_SFID_DATAPORT_DATA_CACHE, msg_length, response_length);
  insn->bits3.gen7_byte_rw.bti = bti;
  insn->bits3.gen7_byte_rw.data_size = elem_size;
  insn->bits3.gen7_byte_rw.msg_type = msg_type;
  switch (p->curr.execWidth) {
    case 8:
      insn->bits3.gen7_byte_rw.simd_mode = GEN_BYTE_SCATTER_SIMD8;
      break;
    case 16:
      insn->bits3.gen7_byte_rw.simd_mode = GEN_BYTE_SCATTER_SIMD16;
      break;
    default:
      NOT_SUPPORTED;
  }
}
// Disabled (compiled out): descriptor setup for data port OBlock read / write
// messages. Accepts a size of 2 or 4, encoded as block_size 2 or 3, and always
// sets header_present.
#if 0
static void setOBlockRW(GenEncoder *p,
GenInstruction *insn,
uint32_t bti,
uint32_t size,
uint32_t msg_type,
uint32_t msg_length,
uint32_t response_length)
{
const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
setMessageDescriptor(p, insn, sfid, msg_length, response_length);
assert(size == 2 || size == 4);
insn->bits3.gen7_oblock_rw.msg_type = msg_type;
insn->bits3.gen7_oblock_rw.bti = bti;
insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : 3;
insn->bits3.gen7_oblock_rw.header_present = 1;
}
#endif
/*! Build the descriptor of a sampler message.
 *  \param p encoder used to program src1
 *  \param insn SEND instruction being filled
 *  \param bti binding table index of the surface
 *  \param sampler sampler index
 *  \param msg_type sampler message type
 *  \param response_length value for the descriptor response_length field
 *  \param msg_length value for the descriptor msg_length field
 *  \param header_present forwarded to the descriptor header_present bit
 *  \param simd_mode sampler SIMD mode
 *  \param return_format currently unused; no descriptor field is written
 *         from it
 */
static void setSamplerMessage(GenEncoder *p,
                              GenInstruction *insn,
                              unsigned char bti,
                              unsigned char sampler,
                              uint32_t msg_type,
                              uint32_t response_length,
                              uint32_t msg_length,
                              bool header_present,
                              uint32_t simd_mode,
                              uint32_t return_format)
{
  const GenMessageTarget sfid = GEN_SFID_SAMPLER;
  // header_present used to be dropped here, which left the descriptor bit
  // at 0 even when the caller supplied a message header.
  setMessageDescriptor(p, insn, sfid, msg_length, response_length, header_present);
  insn->bits3.sampler_gen7.bti = bti;
  insn->bits3.sampler_gen7.sampler = sampler;
  insn->bits3.sampler_gen7.msg_type = msg_type;
  insn->bits3.sampler_gen7.simd_mode = simd_mode;
}
/*! Build the descriptor of a typed write message sent to the render cache
 *  data port. The response length is always 0.
 */
static void setTypedWriteMessage(GenEncoder *p,
                                 GenInstruction *insn,
                                 unsigned char bti,
                                 unsigned char msg_type,
                                 uint32_t msg_length,
                                 bool header_present)
{
  const uint32_t noResponse = 0;
  setMessageDescriptor(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                       msg_length, noResponse, header_present);
  insn->bits3.gen7_typed_rw.msg_type = msg_type;
  insn->bits3.gen7_typed_rw.bti = bti;
}
/*! Build the descriptor of a data port dword scatter / gather message sent
 *  to the constant cache data port. invalidate_after_read is always
 *  cleared.
 */
static void setDWordScatterMessgae(GenEncoder *p,
                                   GenInstruction *insn,
                                   uint32_t bti,
                                   uint32_t block_size,
                                   uint32_t msg_type,
                                   uint32_t msg_length,
                                   uint32_t response_length)
{
  setMessageDescriptor(p, insn, GEN6_SFID_DATAPORT_CONSTANT_CACHE, msg_length, response_length);
  insn->bits3.gen7_dword_rw.bti = bti;
  insn->bits3.gen7_dword_rw.block_size = block_size;
  insn->bits3.gen7_dword_rw.invalidate_after_read = 0;
  insn->bits3.gen7_dword_rw.msg_type = msg_type;
}
//////////////////////////////////////////////////////////////////////////
// Gen Emitter encoding class
//////////////////////////////////////////////////////////////////////////
/*! Create an encoder with an empty state stack. The current state starts
 *  with the given SIMD width, quarter Q1, masking enabled (noMask = 0),
 *  flag 0.0 and normal, non-inverted predication.
 */
GenEncoder::GenEncoder(uint32_t simdWidth, uint32_t gen) :
  stateNum(0), gen(gen)
{
  // Execution shape
  this->curr.execWidth = simdWidth;
  this->curr.quarterControl = GEN_COMPRESSION_Q1;
  this->curr.noMask = 0;
  // Predication
  this->curr.predicate = GEN_PREDICATE_NORMAL;
  this->curr.inversePredicate = 0;
  // Flag register selection
  this->curr.flag = 0;
  this->curr.subFlag = 0;
}
/*! Save the current encoder state on top of the state stack */
void GenEncoder::push(void) {
  assert(stateNum < MAX_STATE_NUM);
  stack[stateNum] = curr;
  ++stateNum;
}
/*! Restore the encoder state most recently saved by push() */
void GenEncoder::pop(void) {
  assert(stateNum > 0);
  --stateNum;
  curr = stack[stateNum];
}
/*! Copy the current encoder state into the instruction: execution size,
 *  accumulator write control, quarter / nibble control, mask control, flag
 *  register selection, predication and saturation. Only execution widths
 *  1, 8 and 16 are handled.
 */
void GenEncoder::setHeader(GenInstruction *insn) {
  switch (this->curr.execWidth) {
    case 8:
      insn->header.execution_size = GEN_WIDTH_8;
      break;
    case 16:
      insn->header.execution_size = GEN_WIDTH_16;
      break;
    case 1:
      insn->header.execution_size = GEN_WIDTH_1;
      break;
    default:
      NOT_IMPLEMENTED;
  }
  insn->header.acc_wr_control = this->curr.accWrEnable;
  insn->header.quarter_control = this->curr.quarterControl;
  insn->bits1.ia1.nib_ctrl = this->curr.nibControl;
  insn->header.mask_control = this->curr.noMask;
  insn->bits2.ia1.flag_reg_nr = this->curr.flag;
  insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
  // The predicate fields are left untouched when predication is disabled
  const bool predicated = (this->curr.predicate != GEN_PREDICATE_NONE);
  if (predicated) {
    insn->header.predicate_control = this->curr.predicate;
    insn->header.predicate_inverse = this->curr.inversePredicate;
  }
  insn->header.saturate = this->curr.saturate;
}
/*! Encode the destination operand using the direct, align1 layout.
 *  A horizontal stride of 0 is encoded as a stride of 1.
 */
void GenEncoder::setDst(GenInstruction *insn, GenRegister dest) {
  // Register numbers outside the architecture file must be below 128
  assert(dest.file == GEN_ARCHITECTURE_REGISTER_FILE || dest.nr < 128);
  insn->bits1.da1.dest_reg_file = dest.file;
  insn->bits1.da1.dest_reg_type = dest.type;
  insn->bits1.da1.dest_address_mode = dest.address_mode;
  insn->bits1.da1.dest_reg_nr = dest.nr;
  insn->bits1.da1.dest_subreg_nr = dest.subnr;
  const bool zeroStride = (dest.hstride == GEN_HORIZONTAL_STRIDE_0);
  if (zeroStride)
    insn->bits1.da1.dest_horiz_stride = GEN_HORIZONTAL_STRIDE_1;
  else
    insn->bits1.da1.dest_horiz_stride = dest.hstride;
}
/*! Encode the first source operand.
 *  - Direct addressing: writes file / type / abs / negate, then either the
 *    immediate value (into bits3) or the register number, sub-register and
 *    region description.
 *  - Any other addressing mode: encodes a GRF source with zero indirect
 *    offset, no modifiers and a fixed <VxH; 1, 0>-style region.
 *  Reads header.access_mode and header.execution_size, so the header must
 *  already be programmed.
 */
void GenEncoder::setSrc0(GenInstruction *insn, GenRegister reg) {
if (reg.file != GEN_ARCHITECTURE_REGISTER_FILE)
assert(reg.nr < 128);
if (reg.address_mode == GEN_ADDRESS_DIRECT) {
insn->bits1.da1.src0_reg_file = reg.file;
insn->bits1.da1.src0_reg_type = reg.type;
insn->bits2.da1.src0_abs = reg.absolute;
insn->bits2.da1.src0_negate = reg.negation;
insn->bits2.da1.src0_address_mode = reg.address_mode;
if (reg.file == GEN_IMMEDIATE_VALUE) {
// The immediate occupies the whole of bits3
insn->bits3.ud = reg.value.ud;
/* Required to set some fields in src1 as well: */
insn->bits1.da1.src1_reg_file = 0; /* arf */
insn->bits1.da1.src1_reg_type = reg.type;
}
else {
if (insn->header.access_mode == GEN_ALIGN_1) {
insn->bits2.da1.src0_subreg_nr = reg.subnr;
insn->bits2.da1.src0_reg_nr = reg.nr;
} else {
// Align16 layout: the sub-register field is a single bit (subnr / 16)
insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
insn->bits2.da16.src0_reg_nr = reg.nr;
}
// A width-1 source in a width-1 instruction is forced to a scalar region
if (reg.width == GEN_WIDTH_1 &&
insn->header.execution_size == GEN_WIDTH_1) {
insn->bits2.da1.src0_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
insn->bits2.da1.src0_width = GEN_WIDTH_1;
insn->bits2.da1.src0_vert_stride = GEN_VERTICAL_STRIDE_0;
}
else {
insn->bits2.da1.src0_horiz_stride = reg.hstride;
insn->bits2.da1.src0_width = reg.width;
insn->bits2.da1.src0_vert_stride = reg.vstride;
}
}
} else {
// Non-direct addressing: reg.nr / reg.subnr / modifiers are not used here
insn->bits1.ia1.src0_reg_file = GEN_GENERAL_REGISTER_FILE;
insn->bits1.ia1.src0_reg_type = reg.type;
insn->bits2.ia1.src0_subreg_nr = 0;
insn->bits2.ia1.src0_indirect_offset = 0;
insn->bits2.ia1.src0_abs = 0;
insn->bits2.ia1.src0_negate = 0;
insn->bits2.ia1.src0_address_mode = reg.address_mode;
insn->bits2.ia1.src0_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
insn->bits2.ia1.src0_width = GEN_WIDTH_1;
insn->bits2.ia1.src0_vert_stride = GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL;
}
}
/*! Encode the second source operand.
 *  Src0 must not be an immediate, and src1 must be either an immediate or a
 *  directly addressed register. An immediate value overwrites the whole of
 *  bits3. Reads header.access_mode and header.execution_size, so the header
 *  must already be programmed.
 */
void GenEncoder::setSrc1(GenInstruction *insn, GenRegister reg) {
assert(reg.nr < 128);
// Only register 0 of the architecture register file is accepted
assert(reg.file != GEN_ARCHITECTURE_REGISTER_FILE || reg.nr == 0);
insn->bits1.da1.src1_reg_file = reg.file;
insn->bits1.da1.src1_reg_type = reg.type;
insn->bits3.da1.src1_abs = reg.absolute;
insn->bits3.da1.src1_negate = reg.negation;
assert(insn->bits1.da1.src0_reg_file != GEN_IMMEDIATE_VALUE);
if (reg.file == GEN_IMMEDIATE_VALUE)
// Overwrites the abs / negate bits written just above
insn->bits3.ud = reg.value.ud;
else {
assert (reg.address_mode == GEN_ADDRESS_DIRECT);
if (insn->header.access_mode == GEN_ALIGN_1) {
insn->bits3.da1.src1_subreg_nr = reg.subnr;
insn->bits3.da1.src1_reg_nr = reg.nr;
} else {
// Align16 layout: the sub-register field is a single bit (subnr / 16)
insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
insn->bits3.da16.src1_reg_nr = reg.nr;
}
// A width-1 source in a width-1 instruction is forced to a scalar region
if (reg.width == GEN_WIDTH_1 &&
insn->header.execution_size == GEN_WIDTH_1) {
insn->bits3.da1.src1_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
insn->bits3.da1.src1_width = GEN_WIDTH_1;
insn->bits3.da1.src1_vert_stride = GEN_VERTICAL_STRIDE_0;
} else {
insn->bits3.da1.src1_horiz_stride = reg.hstride;
insn->bits3.da1.src1_width = reg.width;
insn->bits3.da1.src1_vert_stride = reg.vstride;
}
}
}
// Channel mask for untyped read / write messages, indexed by the number of
// elements (0 to 4). Entry n leaves the first n channels (red first) out of
// the mask.
// NOTE(review): presumably a set bit disables the channel - confirm against
// the data port message documentation.
static const uint32_t untypedRWMask[] = {
GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE|GEN_UNTYPED_GREEN|GEN_UNTYPED_RED,
GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE|GEN_UNTYPED_GREEN,
GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE,
GEN_UNTYPED_ALPHA,
0
};
/*! Emit the instruction sequence that reads 64-bit values with 32-bit
 *  untyped reads. For each group of 8 channels: interleave "src" and
 *  "src + 4" addresses into addr, issue one SIMD16 UNTYPED_READ into tmp,
 *  then move tmp into dst as DF with the caller's predicate and mask.
 *  The encoder state is saved with push() and restored with pop().
 */
void GenEncoder::READ64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum) {
GenRegister dst32 = GenRegister::retype(dst, GEN_TYPE_UD);
src = GenRegister::retype(src, GEN_TYPE_UD);
addr = GenRegister::retype(addr, GEN_TYPE_UD);
tmp = GenRegister::retype(tmp, GEN_TYPE_UD);
// Remember the caller's settings; they are re-applied for the final MOV
uint32_t originSimdWidth = curr.execWidth;
uint32_t originPredicate = curr.predicate;
uint32_t originMask = curr.noMask;
push();
// One iteration per group of 8 channels (one for SIMD8, two for SIMD16)
for ( uint32_t channels = 0, currQuarter = GEN_COMPRESSION_Q1;
channels < originSimdWidth; channels += 8, currQuarter++) {
// Address computation and the read itself run unpredicated and unmasked
curr.predicate = GEN_PREDICATE_NONE;
curr.noMask = GEN_MASK_DISABLE;
curr.execWidth = 8;
/* XXX The following instruction is illegal, but it works as SIMD 1*4 mode
which is what we want here. */
// Even dwords of addr receive the address, odd dwords the address + 4
MOV(GenRegister::h2(addr), GenRegister::suboffset(src, channels));
ADD(GenRegister::h2(GenRegister::suboffset(addr, 1)), GenRegister::suboffset(src, channels), GenRegister::immd(4));
MOV(GenRegister::h2(GenRegister::suboffset(addr, 8)), GenRegister::suboffset(src, channels + 4));
ADD(GenRegister::h2(GenRegister::suboffset(addr, 9)), GenRegister::suboffset(src, channels + 4), GenRegister::immd(4));
// Let's use SIMD16 to read all bytes for 8 doubles data at one time.
curr.execWidth = 16;
this->UNTYPED_READ(tmp, addr, bti, elemNum);
// The quarter is only selected when the caller was in SIMD16
if (originSimdWidth == 16)
curr.quarterControl = currQuarter;
curr.predicate = originPredicate;
curr.noMask = originMask;
// Back to simd8 for correct predication flag.
curr.execWidth = 8;
MOV(GenRegister::retype(GenRegister::suboffset(dst32, channels * 2), GEN_TYPE_DF), GenRegister::retype(tmp, GEN_TYPE_DF));
}
pop();
}
/*! Emit the instruction sequence that writes 64-bit values with two
 *  32-bit untyped writes, one per half. For each half the matching dwords
 *  of data are copied into the message payload at offset originSimdWidth;
 *  before the second write, 4 is added to the values at the start of msg.
 *  The encoder state is saved with push() and restored with pop().
 */
void GenEncoder::WRITE64(GenRegister msg, GenRegister data, uint32_t bti, uint32_t elemNum, bool is_scalar) {
GenRegister data32 = GenRegister::retype(data, GEN_TYPE_UD);
GenRegister unpacked;
msg = GenRegister::retype(msg, GEN_TYPE_UD);
// Remember the caller's settings; they are re-applied before each write
int originSimdWidth = curr.execWidth;
int originPredicate = curr.predicate;
int originMask = curr.noMask;
push();
for (uint32_t half = 0; half < 2; half++) {
// Payload construction runs unpredicated and unmasked in SIMD8
curr.predicate = GEN_PREDICATE_NONE;
curr.noMask = GEN_MASK_DISABLE;
curr.execWidth = 8;
if (is_scalar) {
// Scalar data: step subnr by 4 to reach the second dword
unpacked = data32;
unpacked.subnr += half * 4;
} else
unpacked = GenRegister::unpacked_ud(data32.nr, data32.subnr + half);
MOV(GenRegister::suboffset(msg, originSimdWidth), unpacked);
if (originSimdWidth == 16) {
// Second group of 8 channels; vector data is taken two registers further
if (is_scalar) {
unpacked = data32;
unpacked.subnr += half * 4;
} else
unpacked = GenRegister::unpacked_ud(data32.nr + 2, data32.subnr + half);
MOV(GenRegister::suboffset(msg, originSimdWidth + 8), unpacked);
curr.execWidth = 16;
}
if (half == 1)
ADD(GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::immd(4));
curr.predicate = originPredicate;
curr.noMask = originMask;
this->UNTYPED_WRITE(msg, bti, elemNum);
}
pop();
}
/*! Emit a SEND performing an untyped surface read.
 *  \param dst register receiving the response (only dst.nr is used)
 *  \param src register holding the message payload (only src.nr is used)
 *  \param bti binding table index of the surface
 *  \param elemNum number of elements to read per channel, from 1 to 4
 *  Message and response lengths are derived from the current execution
 *  width, which must be 8 or 16.
 */
void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
  GenInstruction *insn = this->next(GEN_OPCODE_SEND);
  // Was "||", which is always true and never caught an out-of-range index
  // into untypedRWMask below.
  assert(elemNum >= 1 && elemNum <= 4);
  uint32_t msg_length = 0;
  uint32_t response_length = 0;
  if (this->curr.execWidth == 8) {
    msg_length = 1;
    response_length = elemNum;
  } else if (this->curr.execWidth == 16) {
    msg_length = 2;
    response_length = 2*elemNum;
  } else
    NOT_IMPLEMENTED;

  this->setHeader(insn);
  this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
  this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
  this->setSrc1(insn, GenRegister::immud(0));
  setDPUntypedRW(this,
                 insn,
                 bti,
                 untypedRWMask[elemNum],
                 GEN_UNTYPED_READ,
                 msg_length,
                 response_length);
}
/*! Emit a SEND performing an untyped surface write.
 *  \param msg register holding the message payload (only msg.nr is used)
 *  \param bti binding table index of the surface
 *  \param elemNum number of elements to write per channel, from 1 to 4
 *  The destination is the null register and the response length is 0. The
 *  message length is derived from the current execution width, which must
 *  be 8 or 16.
 */
void GenEncoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
  GenInstruction *insn = this->next(GEN_OPCODE_SEND);
  // Was "||", which is always true and never caught an out-of-range index
  // into untypedRWMask below.
  assert(elemNum >= 1 && elemNum <= 4);
  uint32_t msg_length = 0;
  uint32_t response_length = 0;
  this->setHeader(insn);
  if (this->curr.execWidth == 8) {
    this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
    msg_length = 1+elemNum;
  } else if (this->curr.execWidth == 16) {
    this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
    msg_length = 2*(1+elemNum);
  }
  else
    NOT_IMPLEMENTED;
  this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
  this->setSrc1(insn, GenRegister::immud(0));
  setDPUntypedRW(this,
                 insn,
                 bti,
                 untypedRWMask[elemNum],
                 GEN_UNTYPED_WRITE,
                 msg_length,
                 response_length);
}
/*! Emit a SEND performing a byte gather. Message and response lengths are
 *  1 register each in SIMD8 and 2 registers each in SIMD16; other widths
 *  are not implemented.
 */
void GenEncoder::BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize) {
  GenInstruction *insn = this->next(GEN_OPCODE_SEND);
  uint32_t msg_length = 0;
  uint32_t response_length = 0;
  switch (this->curr.execWidth) {
    case 8:
      msg_length = 1;
      response_length = 1;
      break;
    case 16:
      msg_length = 2;
      response_length = 2;
      break;
    default:
      NOT_IMPLEMENTED;
  }
  this->setHeader(insn);
  this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
  this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
  this->setSrc1(insn, GenRegister::immud(0));
  setDPByteScatterGather(this, insn, bti, elemSize, GEN_BYTE_GATHER,
                         msg_length, response_length);
}
/*! Emit a SEND performing a byte scatter. The destination is the null
 *  register, there is no response, and the message is 2 registers long in
 *  SIMD8 and 4 in SIMD16; other widths are not implemented.
 */
void GenEncoder::BYTE_SCATTER(GenRegister msg, uint32_t bti, uint32_t elemSize) {
  GenInstruction *insn = this->next(GEN_OPCODE_SEND);
  const uint32_t response_length = 0;
  uint32_t msg_length = 0;
  this->setHeader(insn);
  switch (this->curr.execWidth) {
    case 8:
      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
      msg_length = 2;
      break;
    case 16:
      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
      msg_length = 4;
      break;
    default:
      NOT_IMPLEMENTED;
  }
  this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
  this->setSrc1(insn, GenRegister::immud(0));
  setDPByteScatterGather(this, insn, bti, elemSize, GEN_BYTE_SCATTER,
                         msg_length, response_length);
}
/*! Emit a SEND performing a dword gather. dst and src are encoded as given.
 *  Lengths and block size follow the current execution width (8 or 16);
 *  other widths are not implemented.
 */
void GenEncoder::DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti) {
  GenInstruction *insn = this->next(GEN_OPCODE_SEND);
  uint32_t msg_length = 0;
  uint32_t response_length = 0;
  uint32_t block_size = 0;
  switch (this->curr.execWidth) {
    case 8:
      msg_length = 1;
      response_length = 1;
      block_size = GEN_DWORD_SCATTER_8_DWORDS;
      break;
    case 16:
      msg_length = 2;
      response_length = 2;
      block_size = GEN_DWORD_SCATTER_16_DWORDS;
      break;
    default:
      NOT_IMPLEMENTED;
  }
  this->setHeader(insn);
  this->setDst(insn, dst);
  this->setSrc0(insn, src);
  this->setSrc1(insn, GenRegister::immud(0));
  setDWordScatterMessgae(this, insn, bti, block_size, GEN_DWORD_GATHER,
                         msg_length, response_length);
}
/*! Emit a SEND performing an untyped atomic operation with returned data.
 *  \param dst register receiving the response (only dst.nr is used)
 *  \param function atomic operation, stored in the aop_type field
 *  \param src register holding the message payload (only src.nr is used)
 *  \param bti binding table index of the surface
 *  \param srcNum number of payload registers per group of 8 channels
 */
void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
  GenInstruction *insn = this->next(GEN_OPCODE_SEND);
  uint32_t msg_length = 0;
  uint32_t response_length = 0;
  switch (this->curr.execWidth) {
    case 8:
      msg_length = srcNum;
      response_length = 1;
      break;
    case 16:
      msg_length = 2*srcNum;
      response_length = 2;
      break;
    default:
      NOT_IMPLEMENTED;
  }
  this->setHeader(insn);
  this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
  this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
  this->setSrc1(insn, GenRegister::immud(0));
  setMessageDescriptor(this, insn, GEN_SFID_DATAPORT_DATA_CACHE, msg_length, response_length);
  insn->bits3.gen7_atomic_op.bti = bti;
  insn->bits3.gen7_atomic_op.aop_type = function;
  insn->bits3.gen7_atomic_op.return_data = 1;
  insn->bits3.gen7_atomic_op.msg_type = GEN_UNTYPED_ATOMIC_READ;
  switch (this->curr.execWidth) {
    case 8:
      insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD8;
      break;
    case 16:
      insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
      break;
    default:
      NOT_SUPPORTED;
  }
}
/*! Append a zero-filled instruction carrying the given opcode to the
 *  instruction store and return a pointer to the stored copy.
 */
GenInstruction *GenEncoder::next(uint32_t opcode) {
  GenInstruction blank;
  std::memset(&blank, 0, sizeof(blank));
  blank.header.opcode = opcode;
  this->store.push_back(blank);
  GenInstruction *stored = &this->store.back();
  return stored;
}
/*! Emit a double-precision operation as two instructions. The first uses
 *  the operands as given with nibControl = 0; the second uses every operand
 *  advanced by w / 2 elements (w being the execution width on entry) and,
 *  when w is 8, nibControl = 1. src1 is only encoded when it is not the
 *  null register. The encoder state is saved and restored around the
 *  sequence.
 */
INLINE void _handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
GenRegister src0, GenRegister src1 = GenRegister::null()) {
int w = p->curr.execWidth;
p->push();
p->curr.nibControl = 0;
// First half
GenInstruction *insn = p->next(opcode);
p->setHeader(insn);
p->setDst(insn, dst);
p->setSrc0(insn, src0);
if (!GenRegister::isNull(src1))
p->setSrc1(insn, src1);
if (w == 8)
p->curr.nibControl = 1; // second 1/8 mask
// Second half
insn = p->next(opcode);
p->setHeader(insn);
p->setDst(insn, GenRegister::suboffset(dst, w / 2));
p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
if (!GenRegister::isNull(src1))
p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
p->pop();
}
// Double register accessing is a little special.
// Per the Gen spec, the only supported mode is SIMD8, and it only
// handles four doubles each time.
// We need to lower SIMD16 down to two SIMD8 and lower SIMD8 down
// to two SIMD1x4.
// Execution widths other than 8 and 16 emit nothing.
INLINE void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
GenRegister src0, GenRegister src1 = GenRegister::null()) {
if (p->curr.execWidth == 8)
_handleDouble(p, opcode, dst, src0, src1);
else if (p->curr.execWidth == 16) {
p->push();
p->curr.execWidth = 8;
// First quarter: operands as given
p->curr.quarterControl = GEN_COMPRESSION_Q1;
_handleDouble(p, opcode, dst, src0, src1);
// Second quarter: every operand is advanced with offset(..., 2)
p->curr.quarterControl = GEN_COMPRESSION_Q2;
if (!GenRegister::isNull(src1))
src1 = GenRegister::offset(src1, 2);
_handleDouble(p, opcode, GenRegister::offset(dst, 2), GenRegister::offset(src0, 2), src1);
p->pop();
}
}
/*! Emit a one-source ALU instruction. Four cases:
 *  - both operands are DF: delegated to handleDouble
 *  - both operands are int64: emitted as MOVs of the bottom and top halves,
 *    four elements at a time
 *  - no split needed: a single instruction
 *  - split needed (see needToSplitAlu1): two SIMD8 instructions, one per
 *    quarter
 */
INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) {
if (dst.isdf() && src.isdf()) {
handleDouble(p, opcode, dst, src);
} else if (dst.isint64() && src.isint64()) { // handle int64
// NOTE(review): this branch always emits MOV and ignores opcode - confirm
// that only MOV reaches here with int64 operands.
int execWidth = p->curr.execWidth;
p->push();
p->curr.execWidth = 8;
for (int nib = 0; nib < execWidth / 4; nib ++) {
p->curr.chooseNib(nib);
p->MOV(dst.bottom_half(), src.bottom_half());
p->MOV(dst.top_half(), src.top_half());
dst = GenRegister::suboffset(dst, 4);
src = GenRegister::suboffset(src, 4);
}
p->pop();
} else if (needToSplitAlu1(p, dst, src) == false) {
GenInstruction *insn = p->next(opcode);
p->setHeader(insn);
p->setDst(insn, dst);
p->setSrc0(insn, src);
} else {
GenInstruction *insnQ1, *insnQ2;
// Instruction for the first quarter
insnQ1 = p->next(opcode);
p->setHeader(insnQ1);
// Header overrides must precede setDst / setSrc0, which read the header
insnQ1->header.quarter_control = GEN_COMPRESSION_Q1;
insnQ1->header.execution_size = GEN_WIDTH_8;
p->setDst(insnQ1, dst);
p->setSrc0(insnQ1, src);
// Instruction for the second quarter
insnQ2 = p->next(opcode);
p->setHeader(insnQ2);
insnQ2->header.quarter_control = GEN_COMPRESSION_Q2;
insnQ2->header.execution_size = GEN_WIDTH_8;
p->setDst(insnQ2, GenRegister::Qn(dst, 1));
p->setSrc0(insnQ2, GenRegister::Qn(src, 1));
}
}
/*! Emit a two-source ALU instruction. Three cases:
 *  - all operands are DF: delegated to handleDouble
 *  - no split needed: a single instruction
 *  - split needed (see needToSplitAlu2): two SIMD8 instructions, one per
 *    quarter, the second one using Qn(..., 1) of every operand
 */
INLINE void alu2(GenEncoder *p,
uint32_t opcode,
GenRegister dst,
GenRegister src0,
GenRegister src1)
{
if (dst.isdf() && src0.isdf() && src1.isdf()) {
handleDouble(p, opcode, dst, src0, src1);
} else if (needToSplitAlu2(p, dst, src0, src1) == false) {
GenInstruction *insn = p->next(opcode);
p->setHeader(insn);
p->setDst(insn, dst);
p->setSrc0(insn, src0);
p->setSrc1(insn, src1);
} else {
GenInstruction *insnQ1, *insnQ2;
// Instruction for the first quarter
insnQ1 = p->next(opcode);
p->setHeader(insnQ1);
// Header overrides must precede setDst / setSrc*, which read the header
insnQ1->header.quarter_control = GEN_COMPRESSION_Q1;
insnQ1->header.execution_size = GEN_WIDTH_8;
p->setDst(insnQ1, dst);
p->setSrc0(insnQ1, src0);
p->setSrc1(insnQ1, src1);
// Instruction for the second quarter
insnQ2 = p->next(opcode);
p->setHeader(insnQ2);
insnQ2->header.quarter_control = GEN_COMPRESSION_Q2;
insnQ2->header.execution_size = GEN_WIDTH_8;
p->setDst(insnQ2, GenRegister::Qn(dst, 1));
p->setSrc0(insnQ2, GenRegister::Qn(src0, 1));
p->setSrc1(insnQ2, GenRegister::Qn(src1, 1));
}
}
// Identity swizzle: x, y, z, w select channels 0, 1, 2, 3
#define NO_SWIZZLE ((0<<0) | (1<<2) | (2<<4) | (3<<6))
/*! Emit a three-source ALU instruction using the align16 3-src encoding.
 *  All four operands must be directly addressed float GRF registers with a
 *  register number below 128. The instruction is always encoded with an
 *  execution size of 8; when the current execution width is 16 a second
 *  copy is emitted for quarter Q2 with the destination register, and every
 *  source without replicate control, advanced by one register.
 *  \return the last instruction emitted (the Q2 one in SIMD16)
 */
static GenInstruction *alu3(GenEncoder *p,
                            uint32_t opcode,
                            GenRegister dest,
                            GenRegister src0,
                            GenRegister src1,
                            GenRegister src2)
{
  GenInstruction *insn = p->next(opcode);

  assert(dest.file == GEN_GENERAL_REGISTER_FILE);
  assert(dest.nr < 128);
  assert(dest.address_mode == GEN_ADDRESS_DIRECT);
  // Was "dest.type = GEN_TYPE_F": an assignment that always evaluated to
  // true instead of checking the destination type.
  assert(dest.type == GEN_TYPE_F);
  insn->bits1.da3src.dest_reg_file = 0;
  insn->bits1.da3src.dest_reg_nr = dest.nr;
  insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
  insn->bits1.da3src.dest_writemask = 0xf;
  p->setHeader(insn);
  // Override what setHeader wrote: 3-src instructions are align16, SIMD8
  insn->header.access_mode = GEN_ALIGN_16;
  insn->header.execution_size = GEN_WIDTH_8;

  assert(src0.file == GEN_GENERAL_REGISTER_FILE);
  assert(src0.address_mode == GEN_ADDRESS_DIRECT);
  assert(src0.nr < 128);
  assert(src0.type == GEN_TYPE_F);
  insn->bits2.da3src.src0_swizzle = NO_SWIZZLE;
  insn->bits2.da3src.src0_subreg_nr = src0.subnr / 4 ;
  insn->bits2.da3src.src0_reg_nr = src0.nr;
  insn->bits1.da3src.src0_abs = src0.absolute;
  insn->bits1.da3src.src0_negate = src0.negation;
  // A zero vertical stride is encoded with replicate control
  insn->bits2.da3src.src0_rep_ctrl = src0.vstride == GEN_VERTICAL_STRIDE_0;

  assert(src1.file == GEN_GENERAL_REGISTER_FILE);
  assert(src1.address_mode == GEN_ADDRESS_DIRECT);
  assert(src1.nr < 128);
  assert(src1.type == GEN_TYPE_F);
  insn->bits2.da3src.src1_swizzle = NO_SWIZZLE;
  // The src1 sub-register number straddles bits2 (low 2 bits) and bits3
  insn->bits2.da3src.src1_subreg_nr_low = (src1.subnr / 4) & 0x3;
  insn->bits3.da3src.src1_subreg_nr_high = (src1.subnr / 4) >> 2;
  insn->bits2.da3src.src1_rep_ctrl = src1.vstride == GEN_VERTICAL_STRIDE_0;
  insn->bits3.da3src.src1_reg_nr = src1.nr;
  insn->bits1.da3src.src1_abs = src1.absolute;
  insn->bits1.da3src.src1_negate = src1.negation;

  assert(src2.file == GEN_GENERAL_REGISTER_FILE);
  assert(src2.address_mode == GEN_ADDRESS_DIRECT);
  assert(src2.nr < 128);
  assert(src2.type == GEN_TYPE_F);
  insn->bits3.da3src.src2_swizzle = NO_SWIZZLE;
  insn->bits3.da3src.src2_subreg_nr = src2.subnr / 4;
  insn->bits3.da3src.src2_rep_ctrl = src2.vstride == GEN_VERTICAL_STRIDE_0;
  insn->bits3.da3src.src2_reg_nr = src2.nr;
  insn->bits1.da3src.src2_abs = src2.absolute;
  insn->bits1.da3src.src2_negate = src2.negation;

  // Emit second half of the instruction
  if (p->curr.execWidth == 16) {
    // Copy by value first: next() may grow the store and move *insn
    GenInstruction q1Insn = *insn;
    insn = p->next(opcode);
    *insn = q1Insn;
    insn->header.quarter_control = GEN_COMPRESSION_Q2;
    insn->bits1.da3src.dest_reg_nr++;
    if (insn->bits2.da3src.src0_rep_ctrl == 0)
      insn->bits2.da3src.src0_reg_nr++;
    if (insn->bits2.da3src.src1_rep_ctrl == 0)
      insn->bits3.da3src.src1_reg_nr++;
    if (insn->bits3.da3src.src2_rep_ctrl == 0)
      insn->bits3.da3src.src2_reg_nr++;
  }
  return insn;
}
#undef NO_SWIZZLE
// Defines GenEncoder::OP(dest, src0) as a thin wrapper that forwards to
// alu1() with the opcode GEN_OPCODE_##OP.
#define ALU1(OP) \
void GenEncoder::OP(GenRegister dest, GenRegister src0) { \
alu1(this, GEN_OPCODE_##OP, dest, src0); \
}
// Defines GenEncoder::OP(dest, src0, src1) as a thin wrapper that forwards to
// alu2() with the opcode GEN_OPCODE_##OP.
#define ALU2(OP) \
void GenEncoder::OP(GenRegister dest, GenRegister src0, GenRegister src1) { \
alu2(this, GEN_OPCODE_##OP, dest, src0, src1); \
}
// Defines GenEncoder::OP(dest, src0, src1, src2) as a thin wrapper that
// forwards to alu3() with the opcode GEN_OPCODE_##OP.
#define ALU3(OP) \
void GenEncoder::OP(GenRegister dest, GenRegister src0, GenRegister src1, GenRegister src2) { \
alu3(this, GEN_OPCODE_##OP, dest, src0, src1, src2); \
}
/*! Load a double immediate into dest. The value is split into two 32-bit
 *  words that are written into tmp (viewed as UD) with two scalar MOVs; tmp is
 *  then re-read as a DF scalar region and copied to dest (and to dest + 2
 *  registers when the current execution width is 16).
 */
void GenEncoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
  // Reinterpret the double as two 32-bit words
  union { double d; unsigned u[2]; } bits;
  bits.d = value;
  GenRegister scalar = GenRegister::retype(tmp, GEN_TYPE_UD);

  // Two unpredicated SIMD1 moves: u[1] lands at tmp, u[0] one UD further
  this->push();
    this->curr.predicate = GEN_PREDICATE_NONE;
    this->curr.execWidth = 1;
    this->MOV(scalar, GenRegister::immud(bits.u[1]));
    this->MOV(GenRegister::suboffset(scalar, 1), GenRegister::immud(bits.u[0]));
  this->pop();

  // Re-describe tmp as a <0;1,0> DF region so the same value is read for
  // every channel
  scalar.type = GEN_TYPE_DF;
  scalar.vstride = GEN_VERTICAL_STRIDE_0;
  scalar.width = GEN_WIDTH_1;
  scalar.hstride = GEN_HORIZONTAL_STRIDE_0;

  this->push();
    // Remember the caller's width before forcing SIMD8
    const uint32_t callerWidth = this->curr.execWidth;
    this->curr.execWidth = 8;
    this->curr.predicate = GEN_PREDICATE_NONE;
    this->curr.noMask = 1;
    this->curr.quarterControl = GEN_COMPRESSION_Q1;
    this->MOV(dest, scalar);
    if (callerWidth == 16) {
      // Second quarter goes two registers after the first one
      this->curr.quarterControl = GEN_COMPRESSION_Q2;
      this->MOV(GenRegister::offset(dest, 2), scalar);
    }
  this->pop();
}
/*! Emit two byte MOVs, all operands retyped to B with a horizontal stride of
 *  2: src1 is written at dest's sub-register, then src0 one byte after it.
 */
void GenEncoder::UPSAMPLE_SHORT(GenRegister dest, GenRegister src0, GenRegister src1) {
  // View every operand as bytes ...
  src1.type = GEN_TYPE_B;
  src0.type = GEN_TYPE_B;
  dest.type = GEN_TYPE_B;
  // ... touching every other byte only
  src1.hstride = GEN_HORIZONTAL_STRIDE_2;
  src0.hstride = GEN_HORIZONTAL_STRIDE_2;
  dest.hstride = GEN_HORIZONTAL_STRIDE_2;

  this->MOV(dest, src1);
  dest.subnr++;            // advance by one byte
  this->MOV(dest, src0);
}
/*! Emit two word MOVs, all operands retyped to W with a horizontal stride of
 *  2: src1 is written at dest's sub-register, then src0 two bytes after it.
 */
void GenEncoder::UPSAMPLE_INT(GenRegister dest, GenRegister src0, GenRegister src1) {
  // View every operand as words ...
  src1.type = GEN_TYPE_W;
  src0.type = GEN_TYPE_W;
  dest.type = GEN_TYPE_W;
  // ... touching every other word only
  src1.hstride = GEN_HORIZONTAL_STRIDE_2;
  src0.hstride = GEN_HORIZONTAL_STRIDE_2;
  dest.hstride = GEN_HORIZONTAL_STRIDE_2;

  this->MOV(dest, src1);
  dest.subnr += 2;         // advance by two bytes
  this->MOV(dest, src0);
}
/*! Load a 64-bit integer immediate into dest. For each group of four channels
 *  (one "nib"), the upper 32 bits are moved to dest.top_half() and the lower
 *  32 bits to dest.bottom_half(); dest then advances by a sub-offset of 4.
 */
void GenEncoder::LOAD_INT64_IMM(GenRegister dest, int64_t value) {
  GenRegister lowWord = GenRegister::immd((int)value);
  GenRegister highWord = GenRegister::immd(value >> 32);
  // Sample the width before push() / execWidth override below
  int callerWidth = this->curr.execWidth;
  this->push();
    this->curr.execWidth = 8;
    int nib = 0;
    while (nib < callerWidth/4) {
      this->curr.chooseNib(nib);
      this->MOV(dest.top_half(), highWord);
      this->MOV(dest.bottom_half(), lowWord);
      dest = GenRegister::suboffset(dest, 4);
      nib++;
    }
  this->pop();
}
/*! Move a non-double source into a double destination, going through the
 *  temporary register r. Asserts if src0 is already a double. Channels are
 *  processed in groups of 8: the source is first staged into r (through its
 *  h2() view) with unpredicated MOVs, then copied to dest with predicated
 *  MOVs, one nibble at a time. A second group is emitted when the execution
 *  width is 16.
 */
void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
// Sampled before any push(): decides whether the second group is needed
int w = curr.execWidth;
if (src0.isdf()) {
GBE_ASSERT(0); // MOV DF is called from convert instruction,
// We should never convert a df to a df.
} else {
GenRegister r0 = GenRegister::h2(r);
push();
curr.execWidth = 8;
// Staging into the temporary is done without predication
curr.predicate = GEN_PREDICATE_NONE;
MOV(r0, src0);
MOV(GenRegister::suboffset(r0, 8), GenRegister::suboffset(src0, 4));
// The copy to the real destination is predicated again
curr.predicate = GEN_PREDICATE_NORMAL;
curr.quarterControl = 0;
curr.nibControl = 0;
MOV(dest, r);
curr.nibControl = 1;
MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(r, 8));
pop();
if (w == 16) {
// Same sequence for source elements 8..15, written under quarter 1
push();
curr.execWidth = 8;
curr.predicate = GEN_PREDICATE_NONE;
MOV(r0, GenRegister::suboffset(src0, 8));
MOV(GenRegister::suboffset(r0, 8), GenRegister::suboffset(src0, 12));
curr.predicate = GEN_PREDICATE_NORMAL;
curr.quarterControl = 1;
curr.nibControl = 0;
MOV(GenRegister::suboffset(dest, 8), r);
curr.nibControl = 1;
MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(r, 8));
pop();
}
}
}
// Instantiate the plain GenEncoder wrappers: each line expands to a member
// function that forwards to alu1 / alu2 / alu3 with the matching opcode.
// Instructions needing extra checks or state (ADD, MUL, SUBB, ADDC, ...) are
// written out by hand below instead.
ALU1(MOV)
ALU1(RNDZ)
ALU1(RNDE)
ALU1(RNDD)
ALU1(RNDU)
ALU1(FBH)
ALU1(FBL)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(FRC)
ALU2(MAC)
ALU1(LZD)
ALU2(LINE)
ALU2(PLN)
ALU2(MACH)
ALU3(MAD)
/*! SUBB is emitted as a regular two-source instruction with the accumulator
 *  write enable bit set for that instruction only.
 */
void GenEncoder::SUBB(GenRegister dest, GenRegister src0, GenRegister src1) {
  this->push();
    this->curr.accWrEnable = 1;
    alu2(this, GEN_OPCODE_SUBB, dest, src0, src1);
  this->pop();
}
/*! ADDC is emitted as a regular two-source instruction with the accumulator
 *  write enable bit set for that instruction only.
 */
void GenEncoder::ADDC(GenRegister dest, GenRegister src0, GenRegister src1) {
  this->push();
    this->curr.accWrEnable = 1;
    alu2(this, GEN_OPCODE_ADDC, dest, src0, src1);
  this->pop();
}
/*! ADD with sanity checks: when one source is floating point (type F, or a VF
 *  immediate), the other source must be neither UD nor D.
 */
void GenEncoder::ADD(GenRegister dest, GenRegister src0, GenRegister src1) {
  const bool src0IsVF = src0.file == GEN_IMMEDIATE_VALUE && src0.type == GEN_TYPE_VF;
  const bool src1IsVF = src1.file == GEN_IMMEDIATE_VALUE && src1.type == GEN_TYPE_VF;

  if (src0.type == GEN_TYPE_F || src0IsVF) {
    assert(src1.type != GEN_TYPE_UD);
    assert(src1.type != GEN_TYPE_D);
  }

  if (src1.type == GEN_TYPE_F || src1IsVF) {
    assert(src0.type != GEN_TYPE_UD);
    assert(src0.type != GEN_TYPE_D);
  }

  alu2(this, GEN_OPCODE_ADD, dest, src0, src1);
}
/*! MUL with sanity checks:
 *  - a D/UD source forbids a float destination
 *  - a float source (type F, or a VF immediate) forbids a D/UD other source
 *  - neither source may be the accumulator
 */
void GenEncoder::MUL(GenRegister dest, GenRegister src0, GenRegister src1) {
  const bool src0IsDword = src0.type == GEN_TYPE_D || src0.type == GEN_TYPE_UD;
  const bool src1IsDword = src1.type == GEN_TYPE_D || src1.type == GEN_TYPE_UD;
  const bool src0IsVF = src0.file == GEN_IMMEDIATE_VALUE && src0.type == GEN_TYPE_VF;
  const bool src1IsVF = src1.file == GEN_IMMEDIATE_VALUE && src1.type == GEN_TYPE_VF;

  if (src0IsDword || src1IsDword) {
    assert(dest.type != GEN_TYPE_F);
  }

  if (src0.type == GEN_TYPE_F || src0IsVF) {
    assert(src1.type != GEN_TYPE_UD);
    assert(src1.type != GEN_TYPE_D);
  }

  if (src1.type == GEN_TYPE_F || src1IsVF) {
    assert(src0.type != GEN_TYPE_UD);
    assert(src0.type != GEN_TYPE_D);
  }

  // The accumulator is not accepted as an explicit source
  assert(src0.file != GEN_ARCHITECTURE_REGISTER_FILE ||
         src0.nr != GEN_ARF_ACCUMULATOR);
  assert(src1.file != GEN_ARCHITECTURE_REGISTER_FILE ||
         src1.nr != GEN_ARF_ACCUMULATOR);

  alu2(this, GEN_OPCODE_MUL, dest, src0, src1);
}
void GenEncoder::NOP(void) {
GenInstruction *insn = this->next(GEN_OPCODE_NOP);
this->setDst(insn, GenRegister::retype(GenRegister::f4grf(0,0), GEN_TYPE_UD));
this->setSrc0(insn, GenRegister::retype(GenRegister::f4grf(0,0), GEN_TYPE_UD));
this->setSrc1(insn, GenRegister::immud(0x0));
}
/*! Emit the SEND that posts a barrier message to the message gateway. The
 *  payload is src, the destination is null, the message is one register long
 *  with no response, and the notify bit is set.
 */
void GenEncoder::BARRIER(GenRegister src) {
  GenInstruction *send = next(GEN_OPCODE_SEND);
  setHeader(send);
  setDst(send, GenRegister::null());
  setSrc0(send, src);
  setMessageDescriptor(this, send, GEN_SFID_MESSAGE_GATEWAY, 1, 0);
  // Gateway specific descriptor fields
  send->bits3.msg_gateway.sub_function_id = GEN_BARRIER_MSG;
  send->bits3.msg_gateway.notify = 0x1;
}
/*! Emit the SEND for a memory fence on the data cache data port. dst is used
 *  both as the message payload and as the destination; commit is enabled.
 */
void GenEncoder::FENCE(GenRegister dst) {
  GenInstruction *send = next(GEN_OPCODE_SEND);
  setHeader(send);
  setDst(send, dst);
  setSrc0(send, dst);
  setMessageDescriptor(this, send, GEN_SFID_DATAPORT_DATA_CACHE, 1, 1, 1);
  // Fence specific descriptor fields
  send->bits3.gen7_memory_fence.msg_type = GEN_MEM_FENCE;
  send->bits3.gen7_memory_fence.commit_enable = 0x1;
}
/*! Indexed jump, encoded as a two-source instruction whose destination and
 *  first source are both the instruction pointer and whose second source is
 *  src.
 */
void GenEncoder::JMPI(GenRegister src) {
  alu2(this,
       GEN_OPCODE_JMPI,
       GenRegister::ip(),   // destination
       GenRegister::ip(),   // src0
       src);                // src1
}
/*! Patch the JMPI located at index insnID in the instruction store so that
 *  its second source becomes the immediate jumpDistance.
 *
 *  \param insnID index of the instruction in `store`; must be in range and
 *         must designate a JMPI (both checked with assert)
 *  \param jumpDistance value encoded as a signed dword immediate in src1
 */
void GenEncoder::patchJMPI(uint32_t insnID, int32_t jumpDistance) {
  // Check the index *before* indexing the store: the check is useless once an
  // out-of-range element has already been accessed
  assert(insnID < this->store.size());
  GenInstruction &insn = this->store[insnID];
  assert(insn.header.opcode == GEN_OPCODE_JMPI);
  this->setSrc1(&insn, GenRegister::immd(jumpDistance));
}
void GenEncoder::CMP(uint32_t conditional, GenRegister src0, GenRegister src1) {
if (needToSplitCmp(this, src0, src1) == false) {
GenInstruction *insn = this->next(GEN_OPCODE_CMP);
this->setHeader(insn);
insn->header.destreg_or_condmod = conditional;
this->setDst(insn, GenRegister::null());
this->setSrc0(insn, src0);
this->setSrc1(insn, src1);
} else {
GenInstruction *insnQ1, *insnQ2;
// Instruction for the first quarter
insnQ1 = this->next(GEN_OPCODE_CMP);
this->setHeader(insnQ1);
insnQ1->header.quarter_control = GEN_COMPRESSION_Q1;
insnQ1->header.execution_size = GEN_WIDTH_8;
insnQ1->header.destreg_or_condmod = conditional;
this->setDst(insnQ1, GenRegister::null());
this->setSrc0(insnQ1, src0);
this->setSrc1(insnQ1, src1);
// Instruction for the second quarter
insnQ2 = this->next(GEN_OPCODE_CMP);
this->setHeader(insnQ2);
insnQ2->header.quarter_control = GEN_COMPRESSION_Q2;
insnQ2->header.execution_size = GEN_WIDTH_8;
insnQ2->header.destreg_or_condmod = conditional;
this->setDst(insnQ2, GenRegister::null());
this->setSrc0(insnQ2, GenRegister::Qn(src0, 1));
this->setSrc1(insnQ2, GenRegister::Qn(src1, 1));
}
}
void GenEncoder::SEL_CMP(uint32_t conditional,
GenRegister dst,
GenRegister src0,
GenRegister src1)
{
GenInstruction *insn = this->next(GEN_OPCODE_SEL);
GBE_ASSERT(curr.predicate == GEN_PREDICATE_NONE);
this->setHeader(insn);
insn->header.destreg_or_condmod = conditional;
this->setDst(insn, dst);
this->setSrc0(insn, src0);
this->setSrc1(insn, src1);
}
void GenEncoder::WAIT(void) {
GenInstruction *insn = this->next(GEN_OPCODE_WAIT);
GenRegister src = GenRegister::notification1();
this->setDst(insn, GenRegister::null());
this->setSrc0(insn, src);
this->setSrc1(insn, GenRegister::null());
insn->header.execution_size = 0; /* must */
insn->header.predicate_control = 0;
insn->header.quarter_control = 0;
}
/*! Extended math with two sources. All operands must be GRFs and dst must
 *  have a horizontal stride of 1. Integer divide functions require non-float
 *  sources, every other function requires float sources. Integer quotient and
 *  remainder are always emitted as SIMD8; with an execution width of 16 a
 *  second SIMD8 instruction handles the QnPhysical(.., 1) part of the operands.
 */
void GenEncoder::MATH(GenRegister dst, uint32_t function, GenRegister src0, GenRegister src1) {
  GenInstruction *first = next(GEN_OPCODE_MATH);
  assert(dst.file == GEN_GENERAL_REGISTER_FILE);
  assert(src0.file == GEN_GENERAL_REGISTER_FILE);
  assert(src1.file == GEN_GENERAL_REGISTER_FILE);
  assert(dst.hstride == GEN_HORIZONTAL_STRIDE_1);

  // Functions that get split in two SIMD8 halves
  const bool quotientOrRemainder =
    function == GEN_MATH_FUNCTION_INT_DIV_QUOTIENT ||
    function == GEN_MATH_FUNCTION_INT_DIV_REMAINDER;
  // Functions that work on integers
  const bool integerDivide =
    quotientOrRemainder ||
    function == GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER;

  if (integerDivide) {
    assert(src0.type != GEN_TYPE_F);
    assert(src1.type != GEN_TYPE_F);
  } else {
    assert(src0.type == GEN_TYPE_F);
    assert(src1.type == GEN_TYPE_F);
  }

  first->header.destreg_or_condmod = function;
  setHeader(first);
  setDst(first, dst);
  setSrc0(first, src0);
  setSrc1(first, src1);

  if (!quotientOrRemainder)
    return;

  first->header.execution_size = GEN_WIDTH_8;
  first->header.quarter_control = GEN_COMPRESSION_Q1;
  if (this->curr.execWidth != 16)
    return;

  // Second half. `first` is not touched anymore once next() has been called
  GenInstruction *second = next(GEN_OPCODE_MATH);
  GenRegister upperDst, upperSrc0, upperSrc1;
  upperDst = GenRegister::QnPhysical(dst, 1);
  upperSrc0 = GenRegister::QnPhysical(src0, 1);
  upperSrc1 = GenRegister::QnPhysical(src1, 1);
  second->header.destreg_or_condmod = function;
  setHeader(second);
  second->header.execution_size = GEN_WIDTH_8;
  second->header.quarter_control = GEN_COMPRESSION_Q2;
  setDst(second, upperDst);
  setSrc0(second, upperSrc0);
  setSrc1(second, upperSrc1);
}
/*! Extended math with one source. dst and src must be GRFs, dst must have a
 *  horizontal stride of 1 and src must be a float.
 */
void GenEncoder::MATH(GenRegister dst, uint32_t function, GenRegister src) {
  GenInstruction *math = next(GEN_OPCODE_MATH);
  assert(dst.file == GEN_GENERAL_REGISTER_FILE);
  assert(src.file == GEN_GENERAL_REGISTER_FILE);
  assert(dst.hstride == GEN_HORIZONTAL_STRIDE_1);
  assert(src.type == GEN_TYPE_F);
  // The math function is stored in the destreg_or_condmod header field
  math->header.destreg_or_condmod = function;
  setHeader(math);
  setDst(math, dst);
  setSrc0(math, src);
}
/*! Emit the SEND for a sampler message. Nothing is emitted when writemask is
 *  0. The response is 4 * (simdWidth / 8) registers long; the message is
 *  coord_cnt * (simdWidth / 8) registers long, plus one when a header is
 *  present.
 */
void GenEncoder::SAMPLE(GenRegister dest,
                        GenRegister msg,
                        bool header_present,
                        unsigned char bti,
                        unsigned char sampler,
                        unsigned int coord_cnt,
                        uint32_t simdWidth,
                        uint32_t writemask,
                        uint32_t return_format)
{
  if (writemask == 0) return;

  // Message type and SIMD mode both depend on the width only
  uint32_t msg_type, simd_mode;
  if (simdWidth == 16) {
    msg_type = GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE;
    simd_mode = GEN_SAMPLER_SIMD_MODE_SIMD16;
  } else {
    msg_type = GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE;
    simd_mode = GEN_SAMPLER_SIMD_MODE_SIMD8;
  }

  const uint32_t response_length = (4 * (simdWidth / 8));
  uint32_t msg_length = (coord_cnt * (simdWidth / 8));
  if (header_present)
    msg_length++;

  GenInstruction *send = next(GEN_OPCODE_SEND);
  setHeader(send);
  setDst(send, dest);
  setSrc0(send, msg);
  setSamplerMessage(this, send, bti, sampler, msg_type,
                    response_length, msg_length,
                    header_present,
                    simd_mode, return_format);
}
/*! Emit the SEND for a typed write. The destination is the null register
 *  viewed as UD; the message is 8 registers long, 9 with a header.
 */
void GenEncoder::TYPED_WRITE(GenRegister msg, bool header_present, unsigned char bti)
{
  GenInstruction *send = next(GEN_OPCODE_SEND);
  const uint32_t msg_type = GEN_TYPED_WRITE;
  uint32_t msg_length = 8;
  if (header_present)
    msg_length = 9;
  setHeader(send);
  setDst(send, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
  setSrc0(send, msg);
  setTypedWriteMessage(this, send, bti, msg_type, msg_length, header_present);
}
/*! Fill the message descriptor of a scratch read / write SEND: the generic
 *  descriptor targets the data cache data port, then the gen7_scratch_rw
 *  fields are set from the arguments (category is always 1).
 */
static void setScratchMessage(GenEncoder *p,
                              GenInstruction *insn,
                              uint32_t offset,
                              uint32_t block_size,
                              uint32_t channel_mode,
                              uint32_t msg_type,
                              uint32_t msg_length,
                              uint32_t response_length)
{
  setMessageDescriptor(p, insn, GEN_SFID_DATAPORT_DATA_CACHE,
                       msg_length, response_length, true);
  // Scratch specific fields
  insn->bits3.gen7_scratch_rw.block_size = block_size;
  insn->bits3.gen7_scratch_rw.msg_type = msg_type;
  insn->bits3.gen7_scratch_rw.channel_mode = channel_mode;
  insn->bits3.gen7_scratch_rw.offset = offset;
  insn->bits3.gen7_scratch_rw.category = 1;
}
/*! Emit the SEND that writes src_num (1 or 2) registers to scratch memory at
 *  the given offset. The `size` argument is not used here.
 */
void GenEncoder::SCRATCH_WRITE(GenRegister msg, uint32_t offset, uint32_t size, uint32_t src_num, uint32_t channel_mode)
{
  assert(src_num == 1 || src_num ==2);
  uint32_t block_size;
  if (src_num == 1)
    block_size = GEN_SCRATCH_BLOCK_SIZE_1;
  else
    block_size = GEN_SCRATCH_BLOCK_SIZE_2;

  GenInstruction *send = next(GEN_OPCODE_SEND);
  setHeader(send);
  setDst(send, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
  setSrc0(send, msg);
  setSrc1(send, GenRegister::immud(0));
  // src_num is the number of 32-byte registers written out; the message is
  // one register longer than that and there is no response
  setScratchMessage(this, send, offset, block_size, channel_mode, GEN_SCRATCH_WRITE, src_num+1, 0);
}
/*! Emit the SEND that reads dst_num (1 or 2) registers from scratch memory at
 *  the given offset into dst; src is the message payload. The `size` argument
 *  is not used here.
 */
void GenEncoder::SCRATCH_READ(GenRegister dst, GenRegister src, uint32_t offset, uint32_t size, uint32_t dst_num, uint32_t channel_mode)
{
  assert(dst_num == 1 || dst_num ==2);
  uint32_t block_size;
  if (dst_num == 1)
    block_size = GEN_SCRATCH_BLOCK_SIZE_1;
  else
    block_size = GEN_SCRATCH_BLOCK_SIZE_2;

  GenInstruction *send = next(GEN_OPCODE_SEND);
  setHeader(send);
  setDst(send, dst);
  setSrc0(send, src);
  setSrc1(send, GenRegister::immud(0));
  // dst_num is the number of 32-byte registers written back; the message
  // itself is one register long
  setScratchMessage(this, send, offset, block_size, channel_mode, GEN_SCRATCH_READ, 1, dst_num);
}
void GenEncoder::EOT(uint32_t msg) {
GenInstruction *insn = this->next(GEN_OPCODE_SEND);
this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
this->setSrc0(insn, GenRegister::ud8grf(msg,0));
this->setSrc1(insn, GenRegister::immud(0));
insn->header.execution_size = GEN_WIDTH_8;
insn->bits3.spawner_gen5.resource = GEN_DO_NOT_DEREFERENCE_URB;
insn->bits3.spawner_gen5.msg_length = 1;
insn->bits3.spawner_gen5.end_of_thread = 1;
insn->header.destreg_or_condmod = GEN_SFID_THREAD_SPAWNER;
}
} /* namespace gbe */
Release_v0.3/backend/src/backend/gen_encoder.hpp 0000664 0000000 0000000 00000017507 12231421770 0021743 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell
*/
#ifndef __GBE_GEN_ENCODER_HPP__
#define __GBE_GEN_ENCODER_HPP__
#include "backend/gen_defs.hpp"
#include "backend/gen_register.hpp"
#include "sys/platform.hpp"
#include "sys/vector.hpp"
#include
namespace gbe
{
/*! Helper structure to emit Gen instructions. Instructions are appended to
 *  `store`; the fields copied in each instruction header come from the current
 *  state `curr`, which can be saved and restored with push() / pop().
 */
class GenEncoder
{
public:
  /*! simdWidth is the default width for the instructions */
  GenEncoder(uint32_t simdWidth, uint32_t gen);
  /*! Size of the stack (should be large enough) */
  enum { MAX_STATE_NUM = 16 };
  /*! Push the current instruction state */
  void push(void);
  /*! Pop the latest pushed state */
  void pop(void);
  /*! The instruction stream we are building */
  vector<GenInstruction> store;
  /*! Current instruction state to use */
  GenInstructionState curr;
  /*! State used to encode the instructions */
  GenInstructionState stack[MAX_STATE_NUM];
  /*! Number of states currently pushed */
  uint32_t stateNum;
  /*! Gen generation to encode */
  uint32_t gen;

  ////////////////////////////////////////////////////////////////////////
  // Encoding functions
  ////////////////////////////////////////////////////////////////////////

  // One declaration per instruction, by number of sources
#define ALU1(OP) void OP(GenRegister dest, GenRegister src0);
#define ALU2(OP) void OP(GenRegister dest, GenRegister src0, GenRegister src1);
#define ALU3(OP) void OP(GenRegister dest, GenRegister src0, GenRegister src1, GenRegister src2);
  ALU1(MOV)
  ALU1(FBH)
  ALU1(FBL)
  ALU2(SUBB)
  ALU2(UPSAMPLE_SHORT)
  ALU2(UPSAMPLE_INT)
  ALU1(RNDZ)
  ALU1(RNDE)
  ALU1(RNDD)
  ALU1(RNDU)
  ALU2(SEL)
  ALU1(NOT)
  ALU2(AND)
  ALU2(OR)
  ALU2(XOR)
  ALU2(SHR)
  ALU2(SHL)
  ALU2(RSR)
  ALU2(RSL)
  ALU2(ASR)
  ALU2(ADD)
  ALU2(ADDC)
  ALU2(MUL)
  ALU1(FRC)
  ALU2(MAC)
  ALU2(MACH)
  ALU1(LZD)
  ALU2(LINE)
  ALU2(PLN)
  ALU3(MAD)
#undef ALU1
#undef ALU2
#undef ALU3
  /*! Move a non-double source to a double destination using tmp as scratch */
  void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
  /*! Load a double immediate in dest using tmp as scratch */
  void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
  /*! Load a 64-bit integer immediate in dest */
  void LOAD_INT64_IMM(GenRegister dest, int64_t value);
  /*! Barrier message (to synchronize threads of a workgroup) */
  void BARRIER(GenRegister src);
  /*! Memory fence message (to order loads and stores between threads) */
  void FENCE(GenRegister dst);
  /*! Jump indexed instruction */
  void JMPI(GenRegister src);
  /*! Compare instructions */
  void CMP(uint32_t conditional, GenRegister src0, GenRegister src1);
  /*! Select with embedded compare (like sel.le ...) */
  void SEL_CMP(uint32_t conditional, GenRegister dst, GenRegister src0, GenRegister src1);
  /*! EOT is used to finish GPGPU threads */
  void EOT(uint32_t msg_nr);
  /*! No-op */
  void NOP(void);
  /*! Wait instruction (used for the barrier) */
  void WAIT(void);
  /*! Atomic instructions */
  void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
  /*! Read 64-bits float/int arrays */
  void READ64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum);
  /*! Write 64-bits float/int arrays */
  void WRITE64(GenRegister src, GenRegister data, uint32_t bti, uint32_t elemNum, bool is_scalar);
  /*! Untyped read (up to 4 channels) */
  void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
  /*! Untyped write (up to 4 channels) */
  void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
  /*! Byte gather (for unaligned bytes, shorts and ints) */
  void BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize);
  /*! Byte scatter (for unaligned bytes, shorts and ints) */
  void BYTE_SCATTER(GenRegister src, uint32_t bti, uint32_t elemSize);
  /*! DWord gather (for constant cache read) */
  void DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti);
  /*! Scratch memory read: dst receives the data, src is the message payload
   *  (same parameter order as the definition)
   */
  void SCRATCH_READ(GenRegister dst, GenRegister src, uint32_t offset, uint32_t size, uint32_t dst_num, uint32_t channel_mode);
  /*! Scratch memory write: msg is the message payload */
  void SCRATCH_WRITE(GenRegister msg, uint32_t offset, uint32_t size, uint32_t src_num, uint32_t channel_mode);
  /*! Send instruction for the sampler */
  void SAMPLE(GenRegister dest,
              GenRegister msg,
              bool header_present,
              unsigned char bti,
              unsigned char sampler,
              unsigned int coord_cnt,
              unsigned int simdWidth,
              uint32_t writemask,
              uint32_t return_format);
  /*! TypedWrite instruction for texture */
  void TYPED_WRITE(GenRegister msg,
                   bool header_present,
                   unsigned char bti);
  /*! Extended math function (2 sources) */
  void MATH(GenRegister dst, uint32_t function, GenRegister src0, GenRegister src1);
  /*! Extended math function (1 source) */
  void MATH(GenRegister dst, uint32_t function, GenRegister src);
  /*! Patch JMPI (located at index insnID) with the given jump distance */
  void patchJMPI(uint32_t insnID, int32_t jumpDistance);

  ////////////////////////////////////////////////////////////////////////
  // Helper functions to encode
  ////////////////////////////////////////////////////////////////////////
  void setHeader(GenInstruction *insn);
  void setDst(GenInstruction *insn, GenRegister dest);
  void setSrc0(GenInstruction *insn, GenRegister reg);
  void setSrc1(GenInstruction *insn, GenRegister reg);
  GenInstruction *next(uint32_t opcode);
  /*! Number of instructions emitted so far */
  uint32_t n_instruction(void) const { return store.size(); }
  GBE_CLASS(GenEncoder); //!< Use custom allocators
};
} /* namespace gbe */
#endif /* __GBE_GEN_ENCODER_HPP__ */
Release_v0.3/backend/src/backend/gen_insn_gen7_schedule_info.hxx 0000664 0000000 0000000 00000004537 12231421770 0025121 0 ustar 00root root 0000000 0000000 // Family Latency SIMD16 SIMD8
// One DECL_GEN7_SCHEDULE(family, latency, simd16, simd8) entry per instruction
// family. The macro must be defined by the file that includes this table.
// NOTE(review): the simd16 / simd8 columns are presumably per-width issue
// costs - confirm against the code that consumes this table.
DECL_GEN7_SCHEDULE(Label, 0, 0, 0)
DECL_GEN7_SCHEDULE(Unary, 20, 4, 2)
DECL_GEN7_SCHEDULE(UnaryWithTemp, 20, 4, 2)
DECL_GEN7_SCHEDULE(Binary, 20, 4, 2)
DECL_GEN7_SCHEDULE(BinaryWithTemp, 20, 4, 2)
DECL_GEN7_SCHEDULE(Ternary, 20, 4, 2)
DECL_GEN7_SCHEDULE(I64Shift, 20, 4, 2)
DECL_GEN7_SCHEDULE(I64HADD, 20, 4, 2)
DECL_GEN7_SCHEDULE(I64RHADD, 20, 4, 2)
DECL_GEN7_SCHEDULE(I64ToFloat, 20, 4, 2)
DECL_GEN7_SCHEDULE(I64MULHI, 20, 4, 2)
DECL_GEN7_SCHEDULE(I64MADSAT, 20, 4, 2)
DECL_GEN7_SCHEDULE(Compare, 20, 4, 2)
DECL_GEN7_SCHEDULE(I64Compare, 20, 4, 2)
DECL_GEN7_SCHEDULE(I64DIVREM, 20, 4, 2)
DECL_GEN7_SCHEDULE(Jump, 14, 1, 1)
DECL_GEN7_SCHEDULE(IndirectMove, 20, 2, 2)
DECL_GEN7_SCHEDULE(Eot, 20, 1, 1)
DECL_GEN7_SCHEDULE(NoOp, 20, 2, 2)
DECL_GEN7_SCHEDULE(Wait, 20, 2, 2)
DECL_GEN7_SCHEDULE(Math, 20, 4, 2)
DECL_GEN7_SCHEDULE(Barrier, 80, 1, 1)
DECL_GEN7_SCHEDULE(Fence, 80, 1, 1)
DECL_GEN7_SCHEDULE(Read64, 80, 1, 1)
DECL_GEN7_SCHEDULE(Write64, 80, 1, 1)
DECL_GEN7_SCHEDULE(UntypedRead, 80, 1, 1)
DECL_GEN7_SCHEDULE(UntypedWrite, 80, 1, 1)
DECL_GEN7_SCHEDULE(ByteGather, 80, 1, 1)
DECL_GEN7_SCHEDULE(ByteScatter, 80, 1, 1)
DECL_GEN7_SCHEDULE(DWordGather, 80, 1, 1)
DECL_GEN7_SCHEDULE(Sample, 80, 1, 1)
DECL_GEN7_SCHEDULE(TypedWrite, 80, 1, 1)
DECL_GEN7_SCHEDULE(SpillReg, 80, 1, 1)
DECL_GEN7_SCHEDULE(UnSpillReg, 80, 1, 1)
DECL_GEN7_SCHEDULE(GetImageInfo, 20, 4, 2)
DECL_GEN7_SCHEDULE(Atomic, 80, 1, 1)
DECL_GEN7_SCHEDULE(I64MUL, 20, 4, 2)
DECL_GEN7_SCHEDULE(I64SATADD, 20, 4, 2)
DECL_GEN7_SCHEDULE(I64SATSUB, 20, 4, 2)
Release_v0.3/backend/src/backend/gen_insn_scheduling.cpp 0000664 0000000 0000000 00000056410 12231421770 0023467 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file gen_insn_scheduling.cpp
* \author Benjamin Segovia
*/
/*
* Overall idea:
* =============
*
* This is the instruction scheduling part of the code. With Gen, we actually
* have a simple strategy to follow. Indeed, here are the constraints:
*
* 1 - the number of registers per HW thread is constant and given (128 32 bytes
* GRF per thread). So, we can use all these registers with no penalty
* 2 - spilling is super bad. Instruction latency matters but the top priority
* is to avoid as much as possible spilling
*
*
* We schedule twice using at each time a local forward list scheduler
*
* Before the register allocation
* ==============================
*
* We try to limit the register pressure.
* Well, this is a hard problem and we have a decent strategy now that we called
* "zero cycled LIFO scheduling".
* We use a local forward list scheduling and we schedule the instructions in a
* LIFO order i.e. as a stack. Basically, we take the most recent instruction
* and schedule it right away. Obviously, we completely ignore the real latencies
* and throughputs and just simulate instructions that are issued and completed
* in zero cycles. For the complex kernels we already have (like menger sponge),
* this provides a pretty good strategy enabling SIMD16 code generation where
* when scheduling is deactivated, even SIMD8 fails
*
* One may argue that this strategy is bad, latency wise. This is not true since
* the register allocator will anyway try to burn as many registers as possible.
* So, there are still opportunities to schedule after register allocation.
*
* Our idea seems to work decently. There is however a strong research article
* that is able to near-optimally reschedule the instructions to minimize
* register use. This is:
*
* "Minimum Register Instruction Sequence Problem: Revisiting Optimal Code
* Generation for DAGs"
*
* After the register allocation
* ==============================
*
* Here, we use a pretty simple strategy based on a regular forward list
* scheduling. Since Gen is a co-issue based machine, it is useless to take
* really precise timings into account since instruction issues will happen
* out-of-order based on other thread executions.
*
* Note that we over-simplify the problem. Indeed, the Gen register file is
* flexible and we are able to use sub-registers of GRFs, in particular when we
* handle uniforms or mask registers which are spilled in GRFs. The thing is
* that two uniforms may not interfere even if they belong to the same GRF (i.e.
* they use two different sub-registers). This means that the interference
* relation is not transitive for Gen. To simplify everything, we just consider
* full GRFs (in SIMD8) or double full GRFs (in SIMD16) regardless of the fact
* that this is a uniform, a mask or a regular GRF.
*
* Obviously, this leads to extra dependencies in the code.
*/
#include "backend/gen_insn_selection.hpp"
#include "backend/gen_reg_allocation.hpp"
#include "sys/cvar.hpp"
#include "sys/intrusive_list.hpp"
namespace gbe
{
// Helper structure to schedule the basic blocks
struct SelectionScheduler;
// Node for the schedule DAG
struct ScheduleDAGNode;
/*! Intrusive list element that points to a DAG node. It lets a DAG node be
 *  referenced from a list without being a list node itself.
 */
struct ScheduleListNode : public intrusive_list_node
{
  /*! Wrap the given DAG node */
  INLINE ScheduleListNode(ScheduleDAGNode *dagNode) : node(dagNode) {}
  /*! The DAG node we refer to */
  ScheduleDAGNode *node;
};
/*! Node of the schedule DAG. There is one node per selection instruction; its
 *  `children` list holds the nodes that depend on it.
 */
struct ScheduleDAGNode
{
  INLINE ScheduleDAGNode(SelectionInstruction &insn) :
    insn(insn), refNum(0), retiredCycle(0) {}
  /*! Return true if "this" is already registered as a child of node, i.e. if
   *  the dependency "this depends on node" has already been recorded
   */
  bool dependsOn(ScheduleDAGNode *node) const {
    GBE_ASSERT(node != NULL);
    // Iterate by reference: list elements are intrusive nodes and must not be
    // copied just to be inspected
    for (const auto &child : node->children)
      if (child.node == this)
        return true;
    return false;
  }
  /*! Children that depends on us */
  intrusive_list<ScheduleListNode> children;
  /*! Instruction after code selection */
  SelectionInstruction &insn;
  /*! Number of nodes that point to us (i.e. nodes we depend on) */
  uint32_t refNum;
  /*! Cycle when the instruction is retired */
  uint32_t retiredCycle;
};
/*! Memory systems tracked for load / store dependencies. MAX_MEM_SYSTEM is
 *  the number of memory systems, not a memory system itself.
 */
enum GenMemory : uint8_t {
  GLOBAL_MEMORY = 0,
  LOCAL_MEMORY = 1,
  MAX_MEM_SYSTEM = 2
};
/*! Do we schedule before or after the register allocation? */
enum SchedulePolicy {
  PRE_ALLOC = 0,  // LIFO scheduling (tends to limit register pressure)
  POST_ALLOC = 1  // FIFO scheduling (limits latency problems)
};
/*! Helper structure to handle dependencies while scheduling. Takes into
 *  account virtual and physical registers and memory sub-systems.
 *
 *  `nodes` stores, for each tracked resource (register, flag, accumulator,
 *  memory system), the last node that wrote it; `insnNodes` stores one node
 *  per instruction of the block being scheduled.
 */
struct DependencyTracker : public NonCopyable
{
  DependencyTracker(const Selection &selection, SelectionScheduler &scheduler);
  /*! Reset it before scheduling a new block */
  void clear(void);
  /*! Get an index in the node array for the given register */
  uint32_t getIndex(GenRegister reg) const;
  /*! Get an index in the node array for the given memory system */
  uint32_t getIndex(uint32_t bti) const;
  /*! Add a new dependency "node0 depends on node1" */
  void addDependency(ScheduleDAGNode *node0, ScheduleDAGNode *node1);
  /*! Add a new dependency "node0 depends on node located at index" */
  void addDependency(ScheduleDAGNode *node0, uint32_t index);
  /*! Add a new dependency "node located at index depends on node0" */
  void addDependency(uint32_t index, ScheduleDAGNode *node0);
  /*! No dependency for null registers and immediate */
  INLINE bool ignoreDependency(GenRegister reg) const {
    if (reg.file == GEN_IMMEDIATE_VALUE)
      return true;
    else if (reg.file == GEN_ARCHITECTURE_REGISTER_FILE) {
      // The high nibble of the register number selects the ARF kind
      if ((reg.nr & 0xf0) == GEN_ARF_NULL)
        return true;
    }
    return false;
  }
  /*! Owns the tracker */
  SelectionScheduler &scheduler;
  /*! Add a new dependency "node0 depends on node set for register reg" */
  INLINE void addDependency(ScheduleDAGNode *node0, GenRegister reg) {
    if (this->ignoreDependency(reg) == false) {
      const uint32_t index = this->getIndex(reg);
      this->addDependency(node0, index);
      // 64-bit values also cover the next slot
      if (reg.isdf() || reg.isint64())
        this->addDependency(node0, index + 1);
    }
  }
  /*! Add a new dependency "node set for register reg depends on node0" */
  INLINE void addDependency(GenRegister reg, ScheduleDAGNode *node0) {
    if (this->ignoreDependency(reg) == false) {
      const uint32_t index = this->getIndex(reg);
      this->addDependency(index, node0);
      // 64-bit values also cover the next slot
      if (reg.isdf() || reg.isint64())
        this->addDependency(index + 1, node0);
    }
  }
  /*! Make the node located at insnID a barrier */
  void makeBarrier(int32_t insnID, int32_t insnNum);
  /*! Update all the writes (memory, predicates, registers) */
  void updateWrites(ScheduleDAGNode *node);
  /*! Maximum number of *physical* flag registers */
  static const uint32_t MAX_FLAG_REGISTER = 8u;
  /*! Maximum number of *physical* accumulators registers */
  static const uint32_t MAX_ACC_REGISTER = 1u;
  /*! Stores the last node that wrote to a register / memory ... */
  vector<ScheduleDAGNode*> nodes;
  /*! Stores the nodes per instruction */
  vector<ScheduleDAGNode*> insnNodes;
  /*! Number of virtual register in the selection */
  uint32_t grfNum;
};
/*! Perform the instruction scheduling */
struct SelectionScheduler : public NonCopyable
{
/*! Init the book keeping structures */
SelectionScheduler(GenContext &ctx, Selection &selection, SchedulePolicy policy);
/*! Make all lists empty */
void clearLists(void);
/*! Return the number of instructions to schedule in the DAG */
int32_t buildDAG(SelectionBlock &bb);
/*! Schedule the DAG */
void scheduleDAG(SelectionBlock &bb, int32_t insnNum);
/*! To limit register pressure or limit insn latency problems */
SchedulePolicy policy;
/*! Make ScheduleListNode allocation faster */
// NOTE(review): presumably also declares newScheduleListNode(), which the
// dependency tracker calls - confirm against the DECL_POOL definition
DECL_POOL(ScheduleListNode, listPool);
/*! Make ScheduleDAGNode allocation faster */
DECL_POOL(ScheduleDAGNode, nodePool);
/*! Ready list is instructions that can be scheduled */
// NOTE(review): the element type of both lists is not spelled out here; a
// template argument looks to be missing - confirm against the list users
intrusive_list ready;
/*! Active list is instructions that are executing */
intrusive_list active;
/*! Handle complete compilation */
GenContext &ctx;
/*! Code to schedule */
Selection &selection;
/*! To help tracking dependencies */
// The tracker constructor reads scheduler.policy and scheduler.ctx. If it is
// handed this scheduler (presumably it is), this member has to stay declared
// after policy and ctx so that they are initialized first
DependencyTracker tracker;
};
/*! Size the tracking arrays. Before register allocation there is one slot per
 *  virtual register of the selection; after it, there are 128 slots in SIMD8
 *  and 64 in SIMD16. Slots for the flag registers, the accumulator and the
 *  memory systems come after the register slots.
 */
DependencyTracker::DependencyTracker(const Selection &selection, SelectionScheduler &scheduler) :
  scheduler(scheduler)
{
  if (scheduler.policy != PRE_ALLOC) {
    const uint32_t simdWidth = scheduler.ctx.getSimdWidth();
    GBE_ASSERT(simdWidth == 8 || simdWidth == 16);
    this->grfNum = simdWidth == 8 ? 128 : 64;
  } else
    this->grfNum = selection.getRegNum();

  // Same layout for both policies, only the number of registers differs
  this->nodes.resize(this->grfNum + MAX_FLAG_REGISTER + MAX_ACC_REGISTER + MAX_MEM_SYSTEM);
  this->insnNodes.resize(selection.getLargestBlockSize());
}
/*! Forget every recorded "last writer" */
void DependencyTracker::clear(void) {
  for (auto it = nodes.begin(); it != nodes.end(); ++it)
    *it = NULL;
}
/*! Record that node0 depends on node1: node0 gets one more pending reference
 *  and is appended to the children of node1. Null nodes, self dependencies
 *  and already recorded dependencies are ignored.
 */
void DependencyTracker::addDependency(ScheduleDAGNode *node0, ScheduleDAGNode *node1) {
  // Nothing to link if one end is missing or if both ends are the same node
  if (node0 == NULL || node1 == NULL || node0 == node1)
    return;
  // Do not record the same edge twice
  if (node0->dependsOn(node1))
    return;
  ScheduleListNode *edge = scheduler.newScheduleListNode(node0);
  node0->refNum++;
  node1->children.push_back(edge);
}
/*! Make "node" depend on the node currently recorded in slot "index" */
void DependencyTracker::addDependency(ScheduleDAGNode *node, uint32_t index) {
  auto recorded = this->nodes[index];
  this->addDependency(node, recorded);
}
/*! Make the node currently recorded in slot "index" depend on "node" */
void DependencyTracker::addDependency(uint32_t index, ScheduleDAGNode *node) {
  auto recorded = this->nodes[index];
  this->addDependency(recorded, node);
}
/*! Turn the instruction at position barrierID into a scheduling barrier:
 *  nothing located before it may be moved after it and nothing located after
 *  it may be moved before it. insnNum is the number of instructions in the
 *  block.
 */
void DependencyTracker::makeBarrier(int32_t barrierID, int32_t insnNum) {
  ScheduleDAGNode *barrier = this->insnNodes[barrierID];
  int32_t pos = 0;
  // The barrier waits for every instruction that precedes it
  while (pos < barrierID) {
    this->addDependency(barrier, this->insnNodes[pos]);
    ++pos;
  }
  // Every instruction that follows the barrier waits for it
  pos = barrierID + 1;
  while (pos < insnNum) {
    this->addDependency(this->insnNodes[pos], barrier);
    ++pos;
  }
}
/*! Return the flag register used by the instruction state: either the
 *  physical flag (flag / subFlag) or the virtual register holding the flag
 */
static GenRegister getFlag(const SelectionInstruction &insn) {
  // Virtual flag: it is identified by its register index
  if (!insn.state.physicalFlag)
    return GenRegister::uw1grf(ir::Register(insn.state.flagIndex));
  // Physical flag: rebuild it from its number and sub-number
  const uint32_t flagNr = insn.state.flag;
  const uint32_t flagSubNr = insn.state.subFlag;
  return GenRegister::flag(flagNr, flagSubNr);
}
/*! Map a register to its slot in the "nodes" table.
 *  - physical flag registers go to [grfNum, grfNum + MAX_FLAG_REGISTER)
 *  - physical accumulators follow the flag slots
 *  - other physical registers and (after allocation) virtual registers are
 *    indexed by their physical register number, halved in SIMD16
 *  - before allocation, virtual registers are indexed by their own value
 */
uint32_t DependencyTracker::getIndex(GenRegister reg) const {
  // Non GRF physical register
  if (reg.physical) {
    if (reg.file == GEN_ARCHITECTURE_REGISTER_FILE) {
      const uint32_t file = reg.nr & 0xf0;
      const uint32_t nr = reg.nr & 0x0f;
      if (file == GEN_ARF_FLAG) {
        const uint32_t subnr = reg.subnr / sizeof(uint16_t);
        // Each flag register owns two slots (one per sub-register). The
        // bound must therefore be checked on the final slot and not on the
        // register number only, otherwise the index may silently alias the
        // accumulator or memory slots that follow the flag slots
        const uint32_t flagSlot = 2*nr + subnr;
        GBE_ASSERT((subnr == 0 || subnr == 1) && flagSlot < MAX_FLAG_REGISTER);
        return grfNum + flagSlot;
      } else if (file == GEN_ARF_ACCUMULATOR) {
        GBE_ASSERT(nr < MAX_ACC_REGISTER);
        return grfNum + MAX_FLAG_REGISTER + nr;
      } else {
        NOT_SUPPORTED;
        return 0;
      }
    } else {
      // Physical register outside of the architecture register file
      const uint32_t simdWidth = scheduler.ctx.getSimdWidth();
      return simdWidth == 8 ? reg.nr : reg.nr / 2;
    }
  }
  // We directly manipulate physical GRFs here
  else if (scheduler.policy == POST_ALLOC) {
    const GenRegister physical = scheduler.ctx.ra->genReg(reg);
    const uint32_t simdWidth = scheduler.ctx.getSimdWidth();
    return simdWidth == 8 ? physical.nr : physical.nr / 2;
  }
  // We use virtual registers since allocation is not done yet
  else
    return reg.value.reg;
}
/*! Map a binding table index to its memory slot in the "nodes" table. Only
 *  0xfe selects the local memory slot; every other value selects the global
 *  memory slot.
 */
uint32_t DependencyTracker::getIndex(uint32_t bti) const {
  // Memory slots are located after the register, flag and accumulator slots
  const uint32_t memDelta = grfNum + MAX_FLAG_REGISTER + MAX_ACC_REGISTER;
  if (bti == 0xfe)
    return memDelta + LOCAL_MEMORY;
  return memDelta + GLOBAL_MEMORY;
}
/*! Record "node" as the last writer of everything its instruction writes:
 *  destination registers, flag, accumulator and memory slots
 */
void DependencyTracker::updateWrites(ScheduleDAGNode *node) {
  const SelectionInstruction &insn = node->insn;

  // Destination registers
  for (uint32_t dstID = 0; dstID < insn.dstNum; ++dstID) {
    const GenRegister dst = insn.dst(dstID);
    if (this->ignoreDependency(dst))
      continue;
    const uint32_t slot = this->getIndex(dst);
    this->nodes[slot] = node;
    // 64 bit values also own the next slot
    if (dst.isdf() || dst.isint64())
      this->nodes[slot + 1] = node;
  }

  // Compare instructions write the flag register
  const bool writesFlag = insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP;
  if (writesFlag)
    this->nodes[this->getIndex(getFlag(insn))] = node;

  // Accumulator
  if (insn.state.accWrEnable)
    this->nodes[this->getIndex(GenRegister::acc())] = node;

  // Memory addressed through the binding table
  if (insn.isWrite())
    this->nodes[this->getIndex(insn.extra.function)] = node;

  // Register spilling writes the scratch memory
  if (insn.opcode == SEL_OP_SPILL_REG)
    this->nodes[this->getIndex(0xff)] = node;

  // Barriers, fences and waits are considered to write both memory systems
  const bool syncsMemory = insn.opcode == SEL_OP_BARRIER ||
                           insn.opcode == SEL_OP_FENCE ||
                           insn.opcode == SEL_OP_WAIT;
  if (syncsMemory) {
    const uint32_t localSlot = this->getIndex(0xfe);
    const uint32_t globalSlot = this->getIndex(0x00);
    this->nodes[globalSlot] = node;
    this->nodes[localSlot] = node;
  }
}
/*! Kind-of roughly estimated latency. Nothing real here.
 *  The first include expands DECL_GEN7_SCHEDULE into one local constant named
 *  <FAMILY>InstructionLatency per entry; the second include expands
 *  DECL_SELECTION_IR into one "case" per selection opcode returning
 *  <FAMILY>Latency. Opcodes without a case return 0.
 *  NOTE(review): both .hxx files are outside this view; the two name schemes
 *  only line up if the FAMILY names used by gen_insn_selection.hxx carry the
 *  "Instruction" suffix -- confirm.
 */
static uint32_t getLatencyGen7(const SelectionInstruction &insn) {
#define DECL_GEN7_SCHEDULE(FAMILY, LATENCY, SIMD16, SIMD8)\
const uint32_t FAMILY##InstructionLatency = LATENCY;
#include "gen_insn_gen7_schedule_info.hxx"
#undef DECL_GEN7_SCHEDULE
switch (insn.opcode) {
#define DECL_SELECTION_IR(OP, FAMILY) case SEL_OP_##OP: return FAMILY##Latency;
#include "backend/gen_insn_selection.hxx"
#undef DECL_SELECTION_IR
};
return 0;
}
/*! Throughput in cycles for SIMD8 or SIMD16.
 *  Same construction as getLatencyGen7: DECL_GEN7_SCHEDULE declares one local
 *  constant per entry (the SIMD8 or SIMD16 value depending on isSIMD8) and
 *  DECL_SELECTION_IR generates one "case" per selection opcode. Opcodes
 *  without a case return 0.
 */
static uint32_t getThroughputGen7(const SelectionInstruction &insn, bool isSIMD8) {
#define DECL_GEN7_SCHEDULE(FAMILY, LATENCY, SIMD16, SIMD8)\
const uint32_t FAMILY##InstructionThroughput = isSIMD8 ? SIMD8 : SIMD16;
#include "gen_insn_gen7_schedule_info.hxx"
#undef DECL_GEN7_SCHEDULE
switch (insn.opcode) {
#define DECL_SELECTION_IR(OP, FAMILY) case SEL_OP_##OP: return FAMILY##Throughput;
#include "backend/gen_insn_selection.hxx"
#undef DECL_SELECTION_IR
};
return 0;
}
/*! Set up the scheduler for the given selection. The list node pool is sized
 *  from the largest block of the selection (rounded up to a power of two).
 *  The tracker receives *this while the scheduler is still being constructed:
 *  its constructor only reads "policy" and "ctx", which are initialized
 *  before "tracker".
 */
SelectionScheduler::SelectionScheduler(GenContext &ctx,
Selection &selection,
SchedulePolicy policy) :
policy(policy), listPool(nextHighestPowerOf2(selection.getLargestBlockSize())),
ctx(ctx), selection(selection), tracker(selection, *this)
{
// Start with empty ready and active lists
this->clearLists();
}
void SelectionScheduler::clearLists(void) {
this->ready.fast_clear();
this->active.fast_clear();
}
/*! Build the dependency DAG of one selection block.
 *  - a forward sweep records read-after-write and write-after-write
 *    dependencies (registers, flag, accumulator, memory, scratch)
 *  - a backward sweep records write-after-read dependencies
 *  - labels, branches and EOT are then turned into scheduling barriers
 *  - finally the nodes without any pending dependency seed the ready list
 *  \return the number of instructions in the block
 */
int32_t SelectionScheduler::buildDAG(SelectionBlock &bb) {
  nodePool.rewind();
  listPool.rewind();
  tracker.clear();
  this->clearLists();

  // Track write-after-write and read-after-write dependencies
  int32_t insnNum = 0;
  for (auto &insn : bb.insnList) {
    // Create a new node for this instruction
    ScheduleDAGNode *node = this->newScheduleDAGNode(insn);
    tracker.insnNodes[insnNum++] = node;

    // read-after-write in registers
    for (uint32_t srcID = 0; srcID < insn.srcNum; ++srcID)
      tracker.addDependency(node, insn.src(srcID));

    // read-after-write for predicate
    if (insn.state.predicate != GEN_PREDICATE_NONE)
      tracker.addDependency(node, getFlag(insn));

    // read-after-write in memory
    if (insn.isRead()) {
      const uint32_t index = tracker.getIndex(insn.extra.function);
      tracker.addDependency(node, index);
    }

    // read-after-write of scratch memory
    if (insn.opcode == SEL_OP_UNSPILL_REG) {
      const uint32_t index = tracker.getIndex(0xff);
      tracker.addDependency(node, index);
    }

    // Consider barriers and wait are reading memory (local and global)
    if (insn.opcode == SEL_OP_BARRIER ||
        insn.opcode == SEL_OP_FENCE ||
        insn.opcode == SEL_OP_WAIT) {
      const uint32_t local = tracker.getIndex(0xfe);
      const uint32_t global = tracker.getIndex(0x00);
      tracker.addDependency(node, local);
      tracker.addDependency(node, global);
    }

    // write-after-write in registers
    for (uint32_t dstID = 0; dstID < insn.dstNum; ++dstID)
      tracker.addDependency(node, insn.dst(dstID));

    // write-after-write for predicate
    if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP)
      tracker.addDependency(node, getFlag(insn));

    // write-after-write for accumulators
    if (insn.state.accWrEnable)
      tracker.addDependency(node, GenRegister::acc());

    // write-after-write in memory
    if (insn.isWrite()) {
      const uint32_t index = tracker.getIndex(insn.extra.function);
      tracker.addDependency(node, index);
    }

    // write-after-write in scratch memory
    if (insn.opcode == SEL_OP_SPILL_REG) {
      const uint32_t index = tracker.getIndex(0xff);
      tracker.addDependency(node, index);
    }

    // Consider barriers and wait are writing memory (local and global)
    if (insn.opcode == SEL_OP_BARRIER ||
        insn.opcode == SEL_OP_FENCE ||
        insn.opcode == SEL_OP_WAIT) {
      const uint32_t local = tracker.getIndex(0xfe);
      const uint32_t global = tracker.getIndex(0x00);
      tracker.addDependency(node, local);
      tracker.addDependency(node, global);
    }

    // Track all writes done by the instruction
    tracker.updateWrites(node);
  }

  // Track write-after-read dependencies. The block is swept backward so that
  // the tracker holds, for each resource, the closest *following* writer
  tracker.clear();
  for (int32_t insnID = insnNum-1; insnID >= 0; --insnID) {
    ScheduleDAGNode *node = tracker.insnNodes[insnID];
    const SelectionInstruction &insn = node->insn;

    // write-after-read in registers
    for (uint32_t srcID = 0; srcID < insn.srcNum; ++srcID)
      tracker.addDependency(insn.src(srcID), node);

    // write-after-read for predicate
    if (insn.state.predicate != GEN_PREDICATE_NONE)
      tracker.addDependency(getFlag(insn), node);

    // write-after-read in memory
    if (insn.isRead()) {
      const uint32_t index = tracker.getIndex(insn.extra.function);
      tracker.addDependency(index, node);
    }

    // write-after-read in scratch memory: a spill located after this unspill
    // must not be scheduled before it, otherwise the unspill would read the
    // newly spilled value instead of the previous one
    if (insn.opcode == SEL_OP_UNSPILL_REG) {
      const uint32_t index = tracker.getIndex(0xff);
      tracker.addDependency(index, node);
    }

    // Consider barriers and wait are reading memory (local and global)
    if (insn.opcode == SEL_OP_BARRIER ||
        insn.opcode == SEL_OP_FENCE ||
        insn.opcode == SEL_OP_WAIT) {
      const uint32_t local = tracker.getIndex(0xfe);
      const uint32_t global = tracker.getIndex(0x00);
      tracker.addDependency(local, node);
      tracker.addDependency(global, node);
    }

    // Track all writes done by the instruction
    tracker.updateWrites(node);
  }

  // Make labels and branches non-schedulable (i.e. they act as barriers)
  for (int32_t insnID = 0; insnID < insnNum; ++insnID) {
    ScheduleDAGNode *node = tracker.insnNodes[insnID];
    if (node->insn.isBranch() || node->insn.isLabel() || node->insn.opcode == SEL_OP_EOT)
      tracker.makeBarrier(insnID, insnNum);
  }

  // Build the initial ready list (should only be the label actually)
  for (int32_t insnID = 0; insnID < insnNum; ++insnID) {
    ScheduleDAGNode *node = tracker.insnNodes[insnID];
    if (node->refNum == 0) {
      ScheduleListNode *listNode = this->newScheduleListNode(node);
      this->ready.push_back(listNode);
    }
  }
  return insnNum;
}
/*! Linearize the DAG built by buildDAG and append the instructions to bb in
 *  scheduled order. insnNum is the number of instructions left to emit; the
 *  loop runs until it reaches zero. "cycle" is a simulated clock used to
 *  decide when an issued instruction is retired.
 */
void SelectionScheduler::scheduleDAG(SelectionBlock &bb, int32_t insnNum) {
uint32_t cycle = 0;
const bool isSIMD8 = this->ctx.getSimdWidth() == 8;
while (insnNum) {
// Retire all the instructions that finished
for (auto toRetireIt = active.begin(); toRetireIt != active.end();) {
ScheduleDAGNode *toRetireNode = toRetireIt.node()->node;
// Instruction is now complete
if (toRetireNode->retiredCycle <= cycle) {
toRetireIt = this->active.erase(toRetireIt);
// Traverse all children and make them ready if no more dependency
auto &children = toRetireNode->children;
for (auto it = children.begin(); it != children.end();) {
// One dependency of the child is now satisfied
if (--it->node->refNum == 0) {
// The list node moves from the children list to the ready list
ScheduleListNode *listNode = it.node();
it = children.erase(it);
this->ready.push_back(listNode);
} else
++it;
}
}
// Get the next one
else
++toRetireIt;
}
// Try to schedule something from the ready list
intrusive_list::iterator toSchedule;
if (policy == POST_ALLOC) // FIFO scheduling
toSchedule = this->ready.begin();
else // LIFO scheduling
toSchedule = this->ready.rbegin();
// NOTE(review): the emptiness test below assumes that rbegin() compares
// equal to end() on an empty list -- confirm against intrusive_list
if (toSchedule != this->ready.end()) {
// The instruction is instantaneously issued to simulate zero cycle
// scheduling
if (policy == POST_ALLOC)
cycle += getThroughputGen7(toSchedule->node->insn, isSIMD8);
// Move the list node from the ready list to the active list
this->ready.erase(toSchedule);
this->active.push_back(toSchedule.node());
// When we schedule before allocation, instruction is instantaneously
// ready. This allows to have a real LIFO strategy
if (policy == POST_ALLOC)
toSchedule->node->retiredCycle = cycle + getLatencyGen7(toSchedule->node->insn);
else
toSchedule->node->retiredCycle = cycle;
bb.append(&toSchedule->node->insn);
insnNum--;
} else
// Nothing is ready: let the simulated clock advance so that active
// instructions can retire
cycle++;
}
}
// Switches gating the two scheduling passes defined below; both are declared
// with "false" as second argument.
// NOTE(review): BVAR is defined outside this view; presumably it declares a
// boolean configuration variable with the given default -- confirm.
BVAR(OCL_POST_ALLOC_INSN_SCHEDULE, false);
BVAR(OCL_PRE_ALLOC_INSN_SCHEDULE, false);
/*! Reschedule every block of the selection after register allocation. Does
 *  nothing unless OCL_POST_ALLOC_INSN_SCHEDULE is set.
 */
void schedulePostRegAllocation(GenContext &ctx, Selection &selection) {
  if (!OCL_POST_ALLOC_INSN_SCHEDULE)
    return;
  SelectionScheduler scheduler(ctx, selection, POST_ALLOC);
  for (auto &block : *selection.blockList) {
    // The DAG keeps track of the instructions, so the list can be emptied
    // and refilled in scheduled order
    const int32_t toSchedule = scheduler.buildDAG(block);
    block.insnList.clear();
    scheduler.scheduleDAG(block, toSchedule);
  }
}
/*! Reschedule every block of the selection before register allocation. Does
 *  nothing unless OCL_PRE_ALLOC_INSN_SCHEDULE is set.
 */
void schedulePreRegAllocation(GenContext &ctx, Selection &selection) {
  if (!OCL_PRE_ALLOC_INSN_SCHEDULE)
    return;
  SelectionScheduler scheduler(ctx, selection, PRE_ALLOC);
  for (auto &block : *selection.blockList) {
    // The DAG keeps track of the instructions, so the list can be emptied
    // and refilled in scheduled order
    const int32_t toSchedule = scheduler.buildDAG(block);
    block.insnList.clear();
    scheduler.scheduleDAG(block, toSchedule);
  }
}
} /* namespace gbe */
Release_v0.3/backend/src/backend/gen_insn_scheduling.hpp 0000664 0000000 0000000 00000002625 12231421770 0023473 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file gen_insn_scheduling.hpp
* \author Benjamin Segovia
*/
#ifndef __GBE_GEN_INSN_SCHEDULING_HPP__
#define __GBE_GEN_INSN_SCHEDULING_HPP__
namespace gbe
{
// Forward declarations only: both entry points take their arguments by
// reference, so the full definitions are not needed here
class Selection; // Pre ISA code
class GenContext; // Handle compilation for Gen
/*! Schedule the code per basic block before register allocation (tends to
 *  limit register number)
 */
void schedulePreRegAllocation(GenContext &ctx, Selection &selection);
/*! Schedule the code per basic block after register allocation (tends to
 *  deal with insn latency)
 */
void schedulePostRegAllocation(GenContext &ctx, Selection &selection);
} /* namespace gbe */
#endif /* __GBE_GEN_INSN_SCHEDULING_HPP__ */
Release_v0.3/backend/src/backend/gen_insn_selection.cpp 0000664 0000000 0000000 00000341642 12231421770 0023333 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file gen_insn_selection.cpp
* \author Benjamin Segovia
*/
/* This is the instruction selection code. First of all, this is a bunch of c++
* crap. Sorry if this is not that readable. Anyway, the goal here is to take
* GenIR code (i.e. the very regular, very RISC IR) and to produce GenISA with
* virtual registers (i.e. regular GenIR registers).
*
* Overall idea:
* =============
*
* There are a lot of papers and a lot of research about this, but I tried to
* keep it simple. No dynamic programming, nothing like that. Just a recursive
* maximal munch.
*
* Basically, the code is executed per basic block from bottom to top. Patterns
* of GenIR instructions are defined and each instruction is matched against the
* best pattern i.e. the pattern that catches the largest number of
* instructions. Once matched, a sequence of instructions is output.
*
* Each instruction the match depends on is then marked as "root" i.e. we
* indicate that each of these instructions must be generated: we indeed need their
* destinations for the next instructions (remember that we generate the code in
* reverse order)
*
* Patterns:
* =========
*
* There are a lot of patterns and I obviously did not implement all of them. I
* just quickly gathered the complete code to make pattern implementation kind
* of easy. Adding a pattern is pretty verbose but it should not be too hard
* to add new ones.
*
* To create and register patterns, I just abused C++ pre-main. A bunch of
* patterns is then created and sorted per opcode (i.e. the opcode of the root
* of the pattern): this creates a library of patterns that may be used in
* run-time.
*
* Predication / Masking and CFG linearization
* ===========================================
*
* The current version is based on an unfortunate choice. Basically, the problem
* to solve is how to map unstructured branches (i.e. regular gotos) onto Gen.
* Gen has a native support for structured branches (if/else/endif/while...) but
* nothing really native for unstructured branches.
*
* The idea we implemented is simple. We stole one flag register (here f0.0) to
* mask all the instructions (and only activate the proper SIMD lanes) and we
* use the CFG linearization technique to properly handle the control flow. This
* is not really good for one particular reason: Gen instructions must use the
* *same* flag register for the predicates (used for masking) and the
* conditional modifier (used as a destination for CMP). This leads to extra
* complications with compare instructions and select instructions. Basically,
* we need to insert extra MOVs.
*
* Also, there is some extra kludge to handle the predicates for JMPI.
*
* See TODO for a better idea for branching and masking
*
* TODO:
* =====
*
* Sadly, I recreated here a new DAG class. This is just a bad idea since we
* already have the DAG per basic block with the Function graph i.e. the
* complete graph of uses and definitions. I think we should be able to save a
* lot of code here if we can simply reuse the code from UD / DU chains.
*
* Finally, cross-block instruction selection is quite possible with this simple
* approach. Basically, instructions from dominating blocks could be merged and
* matched with other instructions in the dominated block. This leads to the
* interesting approach which consists in traversing the dominator tree in post
* order
*
* About masking and branching, a much better idea (that I found later unfortunately)
* is to replace the use of the flag by uses of if/endif to enclose the basic
* block. So, instead of using predication, we use auto-masking. The very cool
* consequence is that we can reintegrate back the structured branches.
* Basically, we will be able to identify branches that can be mapped to
* structured branches and mix nicely unstructured branches (which will use
* jmpi, if/endif to mask the blocks) and structured branches (which are pretty
* fast)
*/
#include "backend/gen_insn_selection.hpp"
#include "backend/gen_context.hpp"
#include "ir/function.hpp"
#include "ir/liveness.hpp"
#include "ir/profile.hpp"
#include "sys/cvar.hpp"
#include "sys/vector.hpp"
#include
namespace gbe
{
///////////////////////////////////////////////////////////////////////////
// Helper functions
///////////////////////////////////////////////////////////////////////////
/*! Translate an IR type into the matching Gen register type. Unknown types
 *  trigger NOT_SUPPORTED and fall back to GEN_TYPE_F.
 */
uint32_t getGenType(ir::Type type) {
  uint32_t genType = GEN_TYPE_F;
  switch (type) {
    case ir::TYPE_BOOL:   genType = GEN_TYPE_UW; break;
    case ir::TYPE_S8:     genType = GEN_TYPE_B;  break;
    case ir::TYPE_U8:     genType = GEN_TYPE_UB; break;
    case ir::TYPE_S16:    genType = GEN_TYPE_W;  break;
    case ir::TYPE_U16:    genType = GEN_TYPE_UW; break;
    case ir::TYPE_S32:    genType = GEN_TYPE_D;  break;
    case ir::TYPE_U32:    genType = GEN_TYPE_UD; break;
    case ir::TYPE_S64:    genType = GEN_TYPE_L;  break;
    case ir::TYPE_U64:    genType = GEN_TYPE_UL; break;
    case ir::TYPE_FLOAT:  genType = GEN_TYPE_F;  break;
    case ir::TYPE_DOUBLE: genType = GEN_TYPE_DF; break;
    default: NOT_SUPPORTED; break;
  }
  return genType;
}
/*! Translate an IR comparison opcode into the matching Gen conditional
 *  modifier. Other opcodes trigger NOT_SUPPORTED and return 0.
 */
uint32_t getGenCompare(ir::Opcode opcode) {
  uint32_t conditional = 0u;
  switch (opcode) {
    case ir::OP_LE: conditional = GEN_CONDITIONAL_LE;  break;
    case ir::OP_LT: conditional = GEN_CONDITIONAL_L;   break;
    case ir::OP_GE: conditional = GEN_CONDITIONAL_GE;  break;
    case ir::OP_GT: conditional = GEN_CONDITIONAL_G;   break;
    case ir::OP_EQ: conditional = GEN_CONDITIONAL_EQ;  break;
    case ir::OP_NE: conditional = GEN_CONDITIONAL_NEQ; break;
    default: NOT_SUPPORTED; break;
  }
  return conditional;
}
///////////////////////////////////////////////////////////////////////////
// SelectionInstruction
///////////////////////////////////////////////////////////////////////////
/*! Create an instruction which does not belong to any block yet, with the
 *  given opcode and numbers of destinations and sources
 */
SelectionInstruction::SelectionInstruction(SelectionOpcode op, uint32_t dst, uint32_t src) :
  parent(NULL),
  opcode(op),
  dstNum(dst),
  srcNum(src)
{
}
/*! Link "other" with this instruction through gbe::prepend and attach it to
 *  the same parent
 */
void SelectionInstruction::prepend(SelectionInstruction &other) {
  SelectionInstruction *inserted = &other;
  gbe::prepend(inserted, this);
  inserted->parent = this->parent;
}
/*! Link "other" with this instruction through gbe::append and attach it to
 *  the same parent
 */
void SelectionInstruction::append(SelectionInstruction &other) {
  SelectionInstruction *inserted = &other;
  gbe::append(inserted, this);
  inserted->parent = this->parent;
}
/*! True for the opcodes treated as memory reads (atomics included) */
bool SelectionInstruction::isRead(void) const {
  switch (this->opcode) {
    case SEL_OP_UNTYPED_READ:
    case SEL_OP_READ64:
    case SEL_OP_ATOMIC:
    case SEL_OP_BYTE_GATHER:
      return true;
    default:
      return false;
  }
}
/*! True for the opcodes treated as memory writes (atomics included) */
bool SelectionInstruction::isWrite(void) const {
  switch (this->opcode) {
    case SEL_OP_UNTYPED_WRITE:
    case SEL_OP_WRITE64:
    case SEL_OP_ATOMIC:
    case SEL_OP_BYTE_SCATTER:
      return true;
    default:
      return false;
  }
}
/*! Only JMPI is considered a branch */
bool SelectionInstruction::isBranch(void) const {
  const bool isJump = (this->opcode == SEL_OP_JMPI);
  return isJump;
}
/*! True when the instruction is a LABEL */
bool SelectionInstruction::isLabel(void) const {
  const bool isLabelOpcode = (this->opcode == SEL_OP_LABEL);
  return isLabelOpcode;
}
///////////////////////////////////////////////////////////////////////////
// SelectionVector
///////////////////////////////////////////////////////////////////////////
/*! Create an empty vector: no instruction, no register, flagged as
 *  not-a-source
 */
SelectionVector::SelectionVector(void) :
  insn(NULL),
  reg(NULL),
  regNum(0),
  isSrc(0)
{
}
///////////////////////////////////////////////////////////////////////////
// SelectionBlock
///////////////////////////////////////////////////////////////////////////
/*! Create the selection block associated with the given IR basic block */
SelectionBlock::SelectionBlock(const ir::BasicBlock *bb) :
  bb(bb)
{
}
/*! Register one more temporary for this block */
void SelectionBlock::append(ir::Register reg) {
  this->tmp.push_back(reg);
}
/*! Add the instruction at the tail of the block and make the block its
 *  parent
 */
void SelectionBlock::append(SelectionInstruction *insn) {
  insnList.push_back(insn);
  insn->parent = this;
}
/*! Add the instruction at the head of the block and make the block its
 *  parent
 */
void SelectionBlock::prepend(SelectionInstruction *insn) {
  insnList.push_front(insn);
  insn->parent = this;
}
/*! Add the vector at the tail of the vector list of the block */
void SelectionBlock::append(SelectionVector *vec) {
  vectorList.push_back(vec);
}
///////////////////////////////////////////////////////////////////////////
// Maximal munch selection on DAG
///////////////////////////////////////////////////////////////////////////
/*! All instructions in a block are organized into a DAG */
class SelectionDAG
{
public:
/*! Wrap the IR instruction: no source is mergeable yet, every child slot is
 *  empty and the node is not a root
 */
INLINE SelectionDAG(const ir::Instruction &insn) :
insn(insn), mergeable(0), childNum(insn.getSrcNum()), isRoot(0) {
for (uint32_t childID = 0; childID < childNum; ++childID)
this->child[childID] = NULL;
}
/*! Mergeable are non-root instructions with valid sources */
// NOTE(review): the next line is truncated in this copy of the file. The
// text between "(1<" and "opcodes;" is missing, which removes the end of
// this class and the beginning of the class that owns the members below.
// Restore it from the original source before building.
INLINE void setAsMergeable(uint32_t which) { mergeable|=(1< opcodes;
/*! Number of instruction generated */
uint32_t insnNum;
/*! Cost of the pattern */
uint32_t cost;
};
/*! Store and sort all the patterns. This is our global library we use for the
 * code selection
 */
class SelectionLibrary
{
public:
/*! Will register all the patterns */
SelectionLibrary(void);
/*! Release and destroy all the registered patterns */
~SelectionLibrary(void);
/*! Insert the given pattern for all associated opcodes */
template void insert(void);
/*! One list of pattern per opcode */
typedef vector PatternList;
/*! All lists of patterns properly sorted per opcode (one entry per IR
 *  opcode, ir::OP_INVALID being used as the opcode count)
 */
PatternList patterns[ir::OP_INVALID];
/*! All patterns to free */
vector toFree;
};
///////////////////////////////////////////////////////////////////////////
// Code selection internal implementation
///////////////////////////////////////////////////////////////////////////
/*! Actual implementation of the instruction selection engine. It holds the
 *  allocators, the per-block DAG storage, the stack of instruction states and
 *  one encoding helper per selection opcode.
 */
class Selection::Opaque
{
public:
/*! Build the selection engine for the given Gen context */
Opaque(GenContext &ctx);
/*! Release everything */
virtual ~Opaque(void);
/*! Implements the instruction selection itself */
void select(void);
/*! Start a backward generation (from the end of the block) */
void startBackwardGeneration(void);
/*! End backward code generation and output the code in the block */
void endBackwardGeneration(void);
/*! Implement public class */
uint32_t getLargestBlockSize(void) const;
/*! Implement public class */
INLINE uint32_t getVectorNum(void) const { return this->vectorNum; }
/*! Implement public class */
INLINE ir::Register replaceSrc(SelectionInstruction *insn, uint32_t regID);
/*! Implement public class */
INLINE ir::Register replaceDst(SelectionInstruction *insn, uint32_t regID);
/*! spill a register (insert spill/unspill instructions) */
INLINE void spillReg(ir::Register reg, uint32_t registerPool);
/*! Implement public class */
INLINE uint32_t getRegNum(void) const { return file.regNum(); }
/*! Implements public interface */
bool isScalarOrBool(ir::Register reg) const;
/*! Implements public interface */
INLINE ir::RegisterData getRegisterData(ir::Register reg) const {
return file.get(reg);
}
/*! Implement public class */
INLINE ir::RegisterFamily getRegisterFamily(ir::Register reg) const {
return file.get(reg).family;
}
/*! Implement public class */
SelectionInstruction *create(SelectionOpcode, uint32_t dstNum, uint32_t srcNum);
/*! Return the selection register from the GenIR one */
GenRegister selReg(ir::Register, ir::Type type = ir::TYPE_FLOAT) const;
/*! Compute the nth register part when using SIMD8 with Qn (n in 2,3,4) */
GenRegister selRegQn(ir::Register, uint32_t quarter, ir::Type type = ir::TYPE_FLOAT) const;
/*! Size of the stack (should be large enough) */
enum { MAX_STATE_NUM = 16 };
/*! Push the current instruction state (asserts that the stack is not full) */
INLINE void push(void) {
assert(stateNum < MAX_STATE_NUM);
stack[stateNum++] = curr;
}
/*! Pop the latest pushed state (asserts that the stack is not empty) */
INLINE void pop(void) {
assert(stateNum > 0);
curr = stack[--stateNum];
}
/*! Create a new register in the register file and append it in the
 * temporary list of the current block
 */
INLINE ir::Register reg(ir::RegisterFamily family) {
GBE_ASSERT(block != NULL);
const ir::Register reg = file.append(family);
block->append(reg);
return reg;
}
/*! Append a block at the block stream tail. It becomes the current block */
void appendBlock(const ir::BasicBlock &bb);
/*! Append an instruction in the current block */
SelectionInstruction *appendInsn(SelectionOpcode, uint32_t dstNum, uint32_t srcNum);
/*! Append a new vector of registers in the current block */
SelectionVector *appendVector(void);
/*! Build a DAG for the basic block (return number of instructions) */
uint32_t buildBasicBlockDAG(const ir::BasicBlock &bb);
/*! Perform the selection on the basic block */
void matchBasicBlock(uint32_t insnNum);
/*! A root instruction needs to be generated */
bool isRoot(const ir::Instruction &insn) const;
/*! To handle selection block allocation */
DECL_POOL(SelectionBlock, blockPool);
/*! To handle selection instruction allocation */
LinearAllocator insnAllocator;
/*! To handle selection vector allocation */
DECL_POOL(SelectionVector, vecPool);
/*! Per register information used with top-down block sweeping */
vector regDAG;
/*! Store one DAG per instruction */
vector insnDAG;
/*! Owns this structure */
GenContext &ctx;
/*! Tail of the code fragment for backward code generation */
intrusive_list bwdList;
/*! List of emitted blocks */
intrusive_list blockList;
/*! Currently processed block */
SelectionBlock *block;
/*! Current instruction state to use */
GenInstructionState curr;
/*! We append new registers so we duplicate the function register file */
ir::RegisterFile file;
/*! State used to encode the instructions */
GenInstructionState stack[MAX_STATE_NUM];
/*! Maximum number of instructions in the basic blocks */
uint32_t maxInsnNum;
/*! Speed up instruction dag allocation */
DECL_POOL(SelectionDAG, dagPool);
/*! Total number of registers in the function we encode */
uint32_t regNum;
/*! Number of states currently pushed */
uint32_t stateNum;
/*! Number of vector allocated */
uint32_t vectorNum;
/*! If true, generate code backward */
bool bwdCodeGeneration;
/*! To make function prototypes more readable */
typedef const GenRegister &Reg;
// Each macro below declares one inline method named after the opcode which
// forwards to the generic encoder of the same name (declared further down)
// with the matching SEL_OP_ value. The macros are undefined right after use.
#define ALU1(OP) \
INLINE void OP(Reg dst, Reg src) { ALU1(SEL_OP_##OP, dst, src); }
#define ALU1WithTemp(OP) \
INLINE void OP(Reg dst, Reg src, Reg temp) { ALU1WithTemp(SEL_OP_##OP, dst, src, temp); }
#define ALU2(OP) \
INLINE void OP(Reg dst, Reg src0, Reg src1) { ALU2(SEL_OP_##OP, dst, src0, src1); }
#define ALU2WithTemp(OP) \
INLINE void OP(Reg dst, Reg src0, Reg src1, Reg temp) { ALU2WithTemp(SEL_OP_##OP, dst, src0, src1, temp); }
#define ALU3(OP) \
INLINE void OP(Reg dst, Reg src0, Reg src1, Reg src2) { ALU3(SEL_OP_##OP, dst, src0, src1, src2); }
#define I64Shift(OP) \
INLINE void OP(Reg dst, Reg src0, Reg src1, GenRegister tmp[7]) { I64Shift(SEL_OP_##OP, dst, src0, src1, tmp); }
ALU1(MOV)
ALU1WithTemp(MOV_DF)
ALU1WithTemp(LOAD_DF_IMM)
ALU1(LOAD_INT64_IMM)
ALU1(RNDZ)
ALU1(RNDE)
ALU2(SEL)
ALU2(SEL_INT64)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(I64AND)
ALU2(I64OR)
ALU2(I64XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU2(ADD)
ALU2WithTemp(I64ADD)
ALU2WithTemp(I64SUB)
ALU2(MUL)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDU)
ALU2(MACH)
ALU1(LZD)
ALU3(MAD)
ALU2WithTemp(MUL_HI)
ALU1(FBH)
ALU1(FBL)
ALU2WithTemp(HADD)
ALU2WithTemp(RHADD)
ALU2(UPSAMPLE_SHORT)
ALU2(UPSAMPLE_INT)
ALU2(UPSAMPLE_LONG)
ALU1WithTemp(CONVI_TO_I64)
ALU1WithTemp(CONVF_TO_I64)
ALU1(CONVI64_TO_I)
I64Shift(I64SHL)
I64Shift(I64SHR)
I64Shift(I64ASR)
#undef ALU1
#undef ALU1WithTemp
#undef ALU2
#undef ALU2WithTemp
#undef ALU3
#undef I64Shift
/*! Convert 64-bit integer to 32-bit float */
void CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[4]);
/*! Saturated 64bit x*y + z */
void I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister tmp[10]);
/*! High 64bit of x*y */
void I64_MUL_HI(Reg dst, Reg src0, Reg src1, GenRegister tmp[10]);
/*! (x+y)>>1 without mod. overflow */
void I64HADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]);
/*! (x+y+1)>>1 without mod. overflow */
void I64RHADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]);
/*! Shift a 64-bit integer */
void I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[7]);
/*! Compare 64-bit integer */
void I64CMP(uint32_t conditional, Reg src0, Reg src1, GenRegister tmp[3]);
/*! Saturated addition of 64-bit integer */
void I64SATADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
/*! Saturated subtraction of 64-bit integer */
void I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
/*! Encode a barrier instruction */
void BARRIER(GenRegister src);
/*! Encode a fence instruction */
void FENCE(GenRegister dst);
/*! Encode a label instruction */
void LABEL(ir::LabelIndex label);
/*! Jump indexed instruction */
void JMPI(Reg src, ir::LabelIndex target);
/*! Compare instructions */
void CMP(uint32_t conditional, Reg src0, Reg src1);
/*! Select instruction with embedded comparison */
void SEL_CMP(uint32_t conditional, Reg dst, Reg src0, Reg src1);
/*! Constant buffer move instruction */
void INDIRECT_MOVE(Reg dst, Reg src);
/*! EOT is used to finish GPGPU threads */
void EOT(void);
/*! No-op */
void NOP(void);
/*! Wait instruction (used for the barrier) */
void WAIT(void);
/*! Atomic instruction */
void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
/*! Read 64 bits float/int array */
void READ64(Reg addr, Reg tempAddr, const GenRegister *dst, uint32_t elemNum, uint32_t valueNum, uint32_t bti);
/*! Write 64 bits float/int array */
void WRITE64(Reg addr, const GenRegister *src, uint32_t srcNum, const GenRegister *dst, uint32_t dstNum, uint32_t bti);
/*! Untyped read (up to 4 elements) */
void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
/*! Untyped write (up to 4 elements) */
void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti);
/*! Byte gather (for unaligned bytes, shorts and ints) */
void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti);
/*! Byte scatter (for unaligned bytes, shorts and ints) */
void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti);
/*! DWord gather (for constant cache read) */
void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti);
/*! Extended math function (2 arguments) */
void MATH(Reg dst, uint32_t function, Reg src0, Reg src1);
/*! Extended math function (1 argument) */
void MATH(Reg dst, uint32_t function, Reg src);
/*! Encode unary instructions */
void ALU1(SelectionOpcode opcode, Reg dst, Reg src);
/*! Encode unary with temp reg instructions */
void ALU1WithTemp(SelectionOpcode opcode, Reg dst, Reg src0, Reg temp);
/*! Encode binary instructions */
void ALU2(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1);
/*! Encode binary with temp reg instructions */
void ALU2WithTemp(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg temp);
/*! Encode ternary instructions */
void ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2);
/*! Encode sample instructions */
void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *src, uint32_t srcNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler);
/*! Encode typed write instructions */
void TYPED_WRITE(GenRegister *src, uint32_t srcNum, GenRegister *msgs, uint32_t msgNum, uint32_t bti);
/*! Get image information */
void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, uint32_t bti);
/*! Multiply 64-bit integers */
void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
/*! 64-bit integer division */
void I64DIV(Reg dst, Reg src0, Reg src1, GenRegister tmp[14]);
/*! 64-bit integer remainder of division */
void I64REM(Reg dst, Reg src0, Reg src1, GenRegister tmp[14]);
/*! Use custom allocators */
GBE_CLASS(Opaque);
friend class SelectionBlock;
friend class SelectionInstruction;
};
///////////////////////////////////////////////////////////////////////////
// Helper function
///////////////////////////////////////////////////////////////////////////
/*! Flag every non-NULL child of the given DAG node as a root. This is the
 *  fallback used when a pattern does not merge any source into the node
 */
static void markAllChildren(SelectionDAG &dag) {
  const uint32_t count = dag.childNum;
  for (uint32_t idx = 0; idx < count; ++idx) {
    SelectionDAG *source = dag.child[idx];
    if (source == NULL)
      continue;
    source->isRoot = 1;
  }
}
/*! Tell whether source src0ID of src0DAG and source src1ID of src1DAG are the
 *  same: they must name the same register and be produced by the same child
 *  DAG node
 */
static bool sourceMatch(SelectionDAG *src0DAG, uint32_t src0ID,
                        SelectionDAG *src1DAG, uint32_t src1ID)
{
  GBE_ASSERT(src0DAG && src1DAG);
  const ir::Register lhsReg = src0DAG->insn.getSrc(src0ID);
  const ir::Register rhsReg = src1DAG->insn.getSrc(src1ID);
  // Same register name ...
  const bool sameRegister = !(lhsReg != rhsReg);
  // ... and same producer, i.e. same value
  return sameRegister && src0DAG->child[src0ID] == src1DAG->child[src1ID];
}
/*! Set up the selection state for the function owned by the context and size
 *  the per-register and per-instruction DAG tables
 */
Selection::Opaque::Opaque(GenContext &ctx) :
  ctx(ctx), block(NULL),
  curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
  maxInsnNum(ctx.getFunction().getLargestBlockSize()), dagPool(maxInsnNum),
  stateNum(0), vectorNum(0), bwdCodeGeneration(false)
{
  // One slot per register of the function
  this->regNum = ctx.getFunction().regNum();
  this->regDAG.resize(this->regNum);
  // One slot per instruction of the largest basic block
  this->insnDAG.resize(this->maxInsnNum);
}
/*! Release every selection block still held in the block list */
Selection::Opaque::~Opaque(void) {
  auto cursor = blockList.begin();
  while (cursor != blockList.end()) {
    SelectionBlock &doomed = *cursor;
    // Step to the next element before the current one is deleted
    ++cursor;
    this->deleteSelectionBlock(&doomed);
  }
}
/*! Allocate a selection instruction followed by room for dstNum + srcNum
 *  registers and construct it in place. The instruction is not inserted in
 *  any block
 */
SelectionInstruction*
Selection::Opaque::create(SelectionOpcode opcode, uint32_t dstNum, uint32_t srcNum)
{
  const size_t operandBytes = (dstNum+srcNum)*sizeof(GenRegister);
  void *storage = insnAllocator.allocate(sizeof(SelectionInstruction) + operandBytes);
  SelectionInstruction *created = new (storage) SelectionInstruction(opcode, dstNum, srcNum);
  return created;
}
void Selection::Opaque::startBackwardGeneration(void) {
this->bwdCodeGeneration = true;
}
/*! Flush the instructions generated backward into the current block.
 *  bwdList is walked starting from rbegin(); each instruction is unlinked
 *  from bwdList and then prepended to this->block. Backward generation mode
 *  is switched off once the list has been processed
 */
void Selection::Opaque::endBackwardGeneration(void) {
for (auto it = bwdList.rbegin(); it != bwdList.rend();) {
SelectionInstruction &insn = *it;
// Move the iterator before the element it points to is erased
auto toRemoveIt = it--;
bwdList.erase(toRemoveIt);
this->block->prepend(&insn);
}
this->bwdCodeGeneration = false;
}
/*! Return the instruction count of the selection block that holds the most
 *  instructions (0 when there is no block)
 */
uint32_t Selection::Opaque::getLargestBlockSize(void) const {
  size_t largest = 0;
  for (const auto &bb : blockList) {
    const size_t insnCount = bb.insnList.size();
    if (largest < insnCount)
      largest = insnCount;
  }
  return uint32_t(largest);
}
/*! Create the selection block for the given IR basic block, make it the
 *  current block and add it at the end of the block list
 */
void Selection::Opaque::appendBlock(const ir::BasicBlock &bb) {
  block = this->newSelectionBlock(&bb);
  blockList.push_back(block);
}
/*! Create a new instruction and queue it: in bwdList when backward
 *  generation is active, at the end of the current block otherwise. The
 *  instruction receives a copy of the current instruction state
 */
SelectionInstruction *Selection::Opaque::appendInsn(SelectionOpcode opcode,
                                                    uint32_t dstNum,
                                                    uint32_t srcNum)
{
  GBE_ASSERT(this->block != NULL);
  SelectionInstruction *created = this->create(opcode, dstNum, srcNum);
  if (!this->bwdCodeGeneration)
    this->block->append(created);
  else
    this->bwdList.push_back(created);
  created->state = this->curr;
  return created;
}
/*! Create a new vector attached to the most recently queued instruction
 *  (taken from bwdList in backward generation, from the current block
 *  otherwise), append it to the current block and bump the vector counter
 */
SelectionVector *Selection::Opaque::appendVector(void) {
  GBE_ASSERT(this->block != NULL);
  SelectionVector *created = this->newSelectionVector();
  if (!this->bwdCodeGeneration)
    created->insn = this->block->insnList.back();
  else
    created->insn = this->bwdList.back();
  this->block->append(created);
  ++this->vectorNum;
  return created;
}
/*! Rewrite every use and definition of spilledReg so that it goes through
 *  scratch memory. One scratch slot of typeSize(GEN_TYPE_D)*simdWidth bytes is
 *  allocated for the register. Each instruction reading the register gets an
 *  UNSPILL instruction prepended, each instruction writing it gets a SPILL
 *  instruction appended, and the operand itself is redirected to a physical
 *  register numbered from registerPool + 1.
 *  \param spilledReg the virtual register to spill
 *  \param registerPool non-zero register number; stored as the scratch message
 *         header of the generated spill / unspill instructions
 */
void Selection::Opaque::spillReg(ir::Register spilledReg, uint32_t registerPool) {
assert(registerPool != 0);
const uint32_t simdWidth = ctx.getSimdWidth();
// Sources and destinations use the same range of replacement registers
const uint32_t dstStart = registerPool + 1;
const uint32_t srcStart = registerPool + 1;
uint32_t ptr = ctx.allocateScratchMem(typeSize(GEN_TYPE_D)*simdWidth);
for (auto &block : blockList)
for (auto &insn : block.insnList) {
// spill / unspill insn should be skipped when do spilling
if(insn.opcode == SEL_OP_SPILL_REG || insn.opcode == SEL_OP_UNSPILL_REG) continue;
const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
// Reads of the spilled register: reload it right before the instruction
for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
const GenRegister selReg = insn.src(srcID);
const ir::Register reg = selReg.reg();
// Only non-physical GRF operands naming the spilled register are rewritten
if(reg == spilledReg && selReg.file == GEN_GENERAL_REGISTER_FILE && selReg.physical == 0) {
GBE_ASSERT(srcID < 5);
SelectionInstruction *unspill = this->create(SEL_OP_UNSPILL_REG, 1, 0);
unspill->state = GenInstructionState(simdWidth);
unspill->dst(0) = GenRegister(GEN_GENERAL_REGISTER_FILE, srcStart+srcID, 0,
selReg.type, selReg.vstride, selReg.width, selReg.hstride);
GenRegister src = insn.src(srcID);
// change nr/subnr, keep other register settings
src.nr = srcStart+srcID; src.subnr=0; src.physical=1;
insn.src(srcID) = src;
unspill->extra.scratchOffset = ptr;
unspill->extra.scratchMsgHeader = registerPool;
insn.prepend(*unspill);
}
}
// Writes of the spilled register: store it right after the instruction
for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
const GenRegister selReg = insn.dst(dstID);
const ir::Register reg = selReg.reg();
if(reg == spilledReg && selReg.file == GEN_GENERAL_REGISTER_FILE && selReg.physical == 0) {
GBE_ASSERT(dstID < 5);
SelectionInstruction *spill = this->create(SEL_OP_SPILL_REG, 0, 1);
spill->state = GenInstructionState(simdWidth);
spill->src(0) =GenRegister(GEN_GENERAL_REGISTER_FILE, dstStart + dstID, 0,
selReg.type, selReg.vstride, selReg.width, selReg.hstride);
GenRegister dst = insn.dst(dstID);
// change nr/subnr, keep other register settings
dst.physical =1; dst.nr = dstStart+dstID; dst.subnr = 0;
insn.dst(dstID)= dst;
spill->extra.scratchOffset = ptr;
spill->extra.scratchMsgHeader = registerPool;
insn.append(*spill);
}
}
}
}
/*! Insert a MOV in front of insn that copies source regID into a fresh DWORD
 *  temporary and make insn read that temporary instead.
 *  \return the newly allocated temporary register
 */
ir::Register Selection::Opaque::replaceSrc(SelectionInstruction *insn, uint32_t regID) {
  const uint32_t simdWidth = ctx.getSimdWidth();
  // The temporary is appended to the block that owns the instruction
  this->block = insn->parent;
  const ir::Register tmp = this->reg(ir::FAMILY_DWORD);
  const GenRegister tmpReg = GenRegister::fxgrf(simdWidth, tmp);
  // Build "MOV tmp, oldSrc" then patch the instruction source
  SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1);
  mov->state = GenInstructionState(simdWidth);
  mov->src(0) = GenRegister::retype(insn->src(regID), GEN_TYPE_F);
  mov->dst(0) = tmpReg;
  insn->src(regID) = mov->dst(0);
  insn->prepend(*mov);
  return tmp;
}
/*! Make insn write destination regID into a fresh temporary of the same
 *  register family and insert, after insn, a MOV from that temporary to the
 *  original destination. QWORD registers are moved as DF, all others as F.
 *  \return the newly allocated temporary register
 */
ir::Register Selection::Opaque::replaceDst(SelectionInstruction *insn, uint32_t regID) {
  const uint32_t simdWidth = ctx.getSimdWidth();
  const ir::RegisterFamily f = file.get(insn->dst(regID).reg()).family;
  const bool isQword = (f == ir::FAMILY_QWORD);
  int genType = GEN_TYPE_F;
  if (isQword)
    genType = GEN_TYPE_DF;
  // The temporary is appended to the block that owns the instruction
  this->block = insn->parent;
  const ir::Register tmp = this->reg(f);
  // Build "MOV oldDst, tmp" then patch the instruction destination
  SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1);
  mov->dst(0) = GenRegister::retype(insn->dst(regID), genType);
  mov->state = GenInstructionState(simdWidth);
  GenRegister gr;
  if (isQword)
    gr = GenRegister::dfxgrf(simdWidth, tmp);
  else
    gr = GenRegister::fxgrf(simdWidth, tmp);
  mov->src(0) = gr;
  insn->dst(regID) = mov->src(0);
  insn->append(*mov);
  return tmp;
}
/*! True when the context reports the register as scalar, or when it belongs
 *  to the boolean register family
 */
bool Selection::Opaque::isScalarOrBool(ir::Register reg) const {
  if (!ctx.isScalarReg(reg)) {
    const ir::RegisterFamily family = file.get(reg).family;
    return family == ir::FAMILY_BOOL;
  }
  return true;
}
// Helper for selReg: returns (from the enclosing function) the SIMD1 flavor of
// the register when it is scalar or boolean, the SIMD8 flavor when simdWidth
// is 8 and the SIMD16 flavor otherwise, retyped to genType. It relies on the
// locals reg, genType, simdWidth and ctx of the function that expands it
#define SEL_REG(SIMD16, SIMD8, SIMD1) \
if (ctx.sel->isScalarOrBool(reg) == true) \
return GenRegister::retype(GenRegister::SIMD1(reg), genType); \
else if (simdWidth == 8) \
return GenRegister::retype(GenRegister::SIMD8(reg), genType); \
else { \
GBE_ASSERT (simdWidth == 16); \
return GenRegister::retype(GenRegister::SIMD16(reg), genType); \
}
/*! Build the Gen register that matches the IR register: its layout is chosen
 *  from the register family and the SIMD width, and its type is the Gen type
 *  obtained from the IR type
 */
GenRegister Selection::Opaque::selReg(ir::Register reg, ir::Type type) const {
using namespace ir;
const uint32_t genType = getGenType(type);
const uint32_t simdWidth = ctx.getSimdWidth();
const RegisterData data = file.get(reg);
const RegisterFamily family = data.family;
// Every SEL_REG expansion returns, so the break statements are never reached
switch (family) {
case FAMILY_BOOL: SEL_REG(uw1grf, uw1grf, uw1grf); break;
case FAMILY_WORD: SEL_REG(uw16grf, uw8grf, uw1grf); break;
case FAMILY_BYTE: SEL_REG(ub16grf, ub8grf, ub1grf); break;
case FAMILY_DWORD: SEL_REG(f16grf, f8grf, f1grf); break;
case FAMILY_QWORD: SEL_REG(df16grf, df8grf, df1grf); break;
default: NOT_SUPPORTED;
}
// Only reached for an unsupported register family
GBE_ASSERT(false);
return GenRegister();
}
#undef SEL_REG
/*! Same as selReg, with the quarter field of the result set to q */
GenRegister Selection::Opaque::selRegQn(ir::Register reg, uint32_t q, ir::Type type) const {
  GenRegister quarterReg = selReg(reg, type);
  quarterReg.quarter = q;
  return quarterReg;
}
/*! Syntactic sugar for method declaration */
typedef const GenRegister &Reg;
/*! Queue a label instruction carrying the given label index */
void Selection::Opaque::LABEL(ir::LabelIndex index) {
  SelectionInstruction *label = appendInsn(SEL_OP_LABEL, 0, 0);
  label->index = uint16_t(index);
}
/*! Queue a barrier instruction with src as its only source */
void Selection::Opaque::BARRIER(GenRegister src) {
  SelectionInstruction *barrier = appendInsn(SEL_OP_BARRIER, 0, 1);
  barrier->src(0) = src;
}
/*! Queue a fence instruction with dst as its only destination */
void Selection::Opaque::FENCE(GenRegister dst) {
  SelectionInstruction *fence = appendInsn(SEL_OP_FENCE, 1, 0);
  fence->dst(0) = dst;
}
/*! Queue an indexed jump: src is its only source, index the target label */
void Selection::Opaque::JMPI(Reg src, ir::LabelIndex index) {
  SelectionInstruction *jump = appendInsn(SEL_OP_JMPI, 0, 1);
  jump->index = uint16_t(index);
  jump->src(0) = src;
}
void Selection::Opaque::CMP(uint32_t conditional, Reg src0, Reg src1) {
SelectionInstruction *insn = this->appendInsn(SEL_OP_CMP, 0, 2);
insn->src(0) = src0;
insn->src(1) = src1;
insn->extra.function = conditional;
}
void Selection::Opaque::SEL_CMP(uint32_t conditional, Reg dst, Reg src0, Reg src1) {
SelectionInstruction *insn = this->appendInsn(SEL_OP_SEL_CMP, 1, 2);
insn->dst(0) = dst;
insn->src(0) = src0;
insn->src(1) = src1;
insn->extra.function = conditional;
}
/*! Queue an indirect move from src to dst */
void Selection::Opaque::INDIRECT_MOVE(Reg dst, Reg src) {
  SelectionInstruction *move = appendInsn(SEL_OP_INDIRECT_MOVE, 1, 1);
  move->src(0) = src;
  move->dst(0) = dst;
}
/*! Queue an atomic instruction with one destination and srcNum sources.
 *  src1 and src2 are only stored when srcNum is larger than 1 and 2. The
 *  atomic function goes in extra.function, the binding table index in
 *  extra.elem, and the sources are registered as one source vector
 */
void Selection::Opaque::ATOMIC(Reg dst, uint32_t function,
                               uint32_t srcNum, Reg src0,
                               Reg src1, Reg src2, uint32_t bti) {
  SelectionInstruction *atomic = appendInsn(SEL_OP_ATOMIC, 1, srcNum);
  atomic->dst(0) = dst;
  atomic->src(0) = src0;
  if (srcNum > 1)
    atomic->src(1) = src1;
  if (srcNum > 2)
    atomic->src(2) = src2;
  atomic->extra.function = function;
  atomic->extra.elem = bti;
  // All sources form a single vector
  SelectionVector *payload = appendVector();
  payload->isSrc = 1;
  payload->regNum = srcNum;
  payload->reg = &atomic->src(0);
}
void Selection::Opaque::EOT(void) { this->appendInsn(SEL_OP_EOT, 0, 0); }
void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
/*! Queue a 64-bit read. dst holds elemNum registers: the temporary registers
 *  together with the real destination registers. tempAddr is stored as
 *  destination 0 because it gets modified; the dst registers follow it. bti
 *  goes in extra.function and valueNum in extra.elem
 */
void Selection::Opaque::READ64(Reg addr,
                               Reg tempAddr,
                               const GenRegister *dst,
                               uint32_t elemNum,
                               uint32_t valueNum,
                               uint32_t bti)
{
  SelectionInstruction *read = appendInsn(SEL_OP_READ64, elemNum + 1, 1);
  SelectionVector *addrVec = appendVector();
  SelectionVector *tempVec = appendVector();
  // Operands of the instruction
  read->src(0) = addr;
  read->dst(0) = tempAddr;
  for (uint32_t i = 0; i < elemNum; ++i)
    read->dst(i + 1) = dst[i];
  read->extra.function = bti;
  read->extra.elem = valueNum;
  // Source cannot be scalar (yet)
  addrVec->isSrc = 1;
  addrVec->regNum = 1;
  addrVec->reg = &read->src(0);
  // Only the temporary registers need contiguous allocation
  tempVec->isSrc = 0;
  tempVec->regNum = elemNum - valueNum;
  tempVec->reg = &read->dst(1);
}
/*! Queue an untyped read of elemNum registers from addr. bti goes in
 *  extra.function and elemNum in extra.elem. The destinations form one
 *  vector and the address another one
 */
void Selection::Opaque::UNTYPED_READ(Reg addr,
                                     const GenRegister *dst,
                                     uint32_t elemNum,
                                     uint32_t bti)
{
  SelectionInstruction *read = appendInsn(SEL_OP_UNTYPED_READ, elemNum, 1);
  SelectionVector *addrVec = appendVector();
  SelectionVector *dataVec = appendVector();
  // Operands of the instruction
  read->src(0) = addr;
  for (uint32_t i = 0; i < elemNum; ++i)
    read->dst(i) = dst[i];
  read->extra.function = bti;
  read->extra.elem = elemNum;
  // Source cannot be scalar (yet)
  addrVec->isSrc = 1;
  addrVec->regNum = 1;
  addrVec->reg = &read->src(0);
  // Sends require contiguous allocation
  dataVec->isSrc = 0;
  dataVec->regNum = elemNum;
  dataVec->reg = &read->dst(0);
}
/*! Queue a 64-bit write. addr is source 0 and the srcNum registers of src
 *  follow it; the dstNum registers of dst are the destinations. bti goes in
 *  extra.function and srcNum in extra.elem.
 *  NOTE(review): the vector below points at the destination registers but is
 *  flagged with isSrc = 1 -- confirm this is intended
 */
void Selection::Opaque::WRITE64(Reg addr,
                                const GenRegister *src,
                                uint32_t srcNum,
                                const GenRegister *dst,
                                uint32_t dstNum,
                                uint32_t bti)
{
  SelectionInstruction *write = appendInsn(SEL_OP_WRITE64, dstNum, srcNum + 1);
  SelectionVector *payload = appendVector();
  // Operands of the instruction
  write->src(0) = addr;
  for (uint32_t i = 0; i < srcNum; ++i)
    write->src(i + 1) = src[i];
  for (uint32_t i = 0; i < dstNum; ++i)
    write->dst(i) = dst[i];
  write->extra.function = bti;
  write->extra.elem = srcNum;
  // Only the addr + temporary registers need to be contiguous.
  payload->isSrc = 1;
  payload->regNum = dstNum;
  payload->reg = &write->dst(0);
}
/*! Queue an untyped write of elemNum registers to addr. addr is source 0 and
 *  the values follow; all elemNum + 1 sources form one vector. bti goes in
 *  extra.function and elemNum in extra.elem
 */
void Selection::Opaque::UNTYPED_WRITE(Reg addr,
                                      const GenRegister *src,
                                      uint32_t elemNum,
                                      uint32_t bti)
{
  const uint32_t payloadNum = elemNum+1;
  SelectionInstruction *write = appendInsn(SEL_OP_UNTYPED_WRITE, 0, payloadNum);
  SelectionVector *payload = appendVector();
  // Operands of the instruction
  write->src(0) = addr;
  for (uint32_t i = 0; i < elemNum; ++i)
    write->src(i+1) = src[i];
  write->extra.function = bti;
  write->extra.elem = elemNum;
  // Sends require contiguous allocation for the sources
  payload->isSrc = 1;
  payload->regNum = payloadNum;
  payload->reg = &write->src(0);
}
/*! Queue a byte gather from addr into dst. bti goes in extra.function and
 *  elemSize in extra.elem. Source and destination are each registered as a
 *  one-register vector, since scalars are not allowed (yet)
 */
void Selection::Opaque::BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti) {
  SelectionInstruction *gather = appendInsn(SEL_OP_BYTE_GATHER, 1, 1);
  SelectionVector *addrVec = appendVector();
  SelectionVector *dataVec = appendVector();
  // Operands of the instruction
  gather->dst(0) = dst;
  gather->src(0) = addr;
  gather->extra.function = bti;
  gather->extra.elem = elemSize;
  // One-register vector for the address
  addrVec->isSrc = 1;
  addrVec->regNum = 1;
  addrVec->reg = &gather->src(0);
  // One-register vector for the value
  dataVec->isSrc = 0;
  dataVec->regNum = 1;
  dataVec->reg = &gather->dst(0);
}
/*! Queue a byte scatter of src to addr. bti goes in extra.function and
 *  elemSize in extra.elem. Address and value form one two-register source
 *  vector
 */
void Selection::Opaque::BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti) {
  SelectionInstruction *scatter = appendInsn(SEL_OP_BYTE_SCATTER, 0, 2);
  SelectionVector *payload = appendVector();
  // Operands of the instruction
  scatter->src(0) = addr;
  scatter->src(1) = src;
  scatter->extra.elem = elemSize;
  scatter->extra.function = bti;
  // value and address are contiguous in the send
  payload->reg = &scatter->src(0);
  payload->regNum = 2;
  payload->isSrc = 1;
}
/*! Queue a dword gather from addr into dst; bti goes in extra.function */
void Selection::Opaque::DWORD_GATHER(Reg dst, Reg addr, uint32_t bti) {
  SelectionInstruction *gather = appendInsn(SEL_OP_DWORD_GATHER, 1, 1);
  gather->dst(0) = dst;
  gather->src(0) = addr;
  gather->extra.function = bti;
}
/*! Queue a two-source extended math instruction; the math function is stored
 *  in extra.function
 */
void Selection::Opaque::MATH(Reg dst, uint32_t function, Reg src0, Reg src1) {
  SelectionInstruction *math = appendInsn(SEL_OP_MATH, 1, 2);
  math->extra.function = function;
  math->dst(0) = dst;
  math->src(0) = src0;
  math->src(1) = src1;
}
/*! Queue a one-source extended math instruction; the math function is stored
 *  in extra.function
 */
void Selection::Opaque::MATH(Reg dst, uint32_t function, Reg src) {
  SelectionInstruction *math = appendInsn(SEL_OP_MATH, 1, 1);
  math->extra.function = function;
  math->dst(0) = dst;
  math->src(0) = src;
}
/*! Queue a 64-bit integer multiply. The 6 temporaries are stored as
 *  destinations 1 to 6, after the real destination
 */
void Selection::Opaque::I64MUL(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
  SelectionInstruction *mul = appendInsn(SEL_OP_I64MUL, 7, 2);
  mul->src(0) = src0;
  mul->src(1) = src1;
  mul->dst(0) = dst;
  for (int tmpID = 0; tmpID < 6; ++tmpID)
    mul->dst(tmpID + 1) = tmp[tmpID];
}
/*! Queue a 64-bit integer division. The 14 temporaries are stored as
 *  destinations 1 to 14, after the real destination
 */
void Selection::Opaque::I64DIV(Reg dst, Reg src0, Reg src1, GenRegister tmp[14]) {
  SelectionInstruction *div = appendInsn(SEL_OP_I64DIV, 15, 2);
  div->src(0) = src0;
  div->src(1) = src1;
  div->dst(0) = dst;
  for (int tmpID = 0; tmpID < 14; ++tmpID)
    div->dst(tmpID + 1) = tmp[tmpID];
}
/*! Queue a 64-bit integer remainder. The 14 temporaries are stored as
 *  destinations 1 to 14, after the real destination
 */
void Selection::Opaque::I64REM(Reg dst, Reg src0, Reg src1, GenRegister tmp[14]) {
  SelectionInstruction *rem = appendInsn(SEL_OP_I64REM, 15, 2);
  rem->src(0) = src0;
  rem->src(1) = src1;
  rem->dst(0) = dst;
  for (int tmpID = 0; tmpID < 14; ++tmpID)
    rem->dst(tmpID + 1) = tmp[tmpID];
}
/*! Queue a unary instruction: one destination, one source */
void Selection::Opaque::ALU1(SelectionOpcode opcode, Reg dst, Reg src) {
  SelectionInstruction *alu = appendInsn(opcode, 1, 1);
  alu->src(0) = src;
  alu->dst(0) = dst;
}
/*! Queue a unary instruction that also needs a temporary; the temporary is
 *  stored as destination 1
 */
void Selection::Opaque::ALU1WithTemp(SelectionOpcode opcode, Reg dst, Reg src, Reg temp) {
  SelectionInstruction *alu = appendInsn(opcode, 2, 1);
  alu->src(0) = src;
  alu->dst(0) = dst;
  alu->dst(1) = temp;
}
/*! Queue a binary instruction: one destination, two sources */
void Selection::Opaque::ALU2(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1) {
  SelectionInstruction *alu = appendInsn(opcode, 1, 2);
  alu->src(0) = src0;
  alu->src(1) = src1;
  alu->dst(0) = dst;
}
/*! Queue a binary instruction that also needs a temporary; the temporary is
 *  stored as destination 1
 */
void Selection::Opaque::ALU2WithTemp(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg temp) {
  SelectionInstruction *alu = appendInsn(opcode, 2, 2);
  alu->src(0) = src0;
  alu->src(1) = src1;
  alu->dst(0) = dst;
  alu->dst(1) = temp;
}
/*! Queue a ternary instruction: one destination, three sources */
void Selection::Opaque::ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2) {
  SelectionInstruction *alu = appendInsn(opcode, 1, 3);
  alu->src(0) = src0;
  alu->src(1) = src1;
  alu->src(2) = src2;
  alu->dst(0) = dst;
}
void Selection::Opaque::I64CMP(uint32_t conditional, Reg src0, Reg src1, GenRegister tmp[3]) {
SelectionInstruction *insn = this->appendInsn(SEL_OP_I64CMP, 3, 2);
insn->src(0) = src0;
insn->src(1) = src1;
for(int i=0; i<3; i++)
insn->dst(i) = tmp[i];
insn->extra.function = conditional;
}
/*! Queue a 64-bit saturated add. The 6 temporaries are stored as
 *  destinations 1 to 6, after the real destination
 */
void Selection::Opaque::I64SATADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
  SelectionInstruction *add = appendInsn(SEL_OP_I64SATADD, 7, 2);
  add->src(0) = src0;
  add->src(1) = src1;
  add->dst(0) = dst;
  for (int tmpID = 0; tmpID < 6; ++tmpID)
    add->dst(tmpID + 1) = tmp[tmpID];
}
/*! Queue a 64-bit saturated subtract. The 6 temporaries are stored as
 *  destinations 1 to 6, after the real destination
 */
void Selection::Opaque::I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
  SelectionInstruction *sub = appendInsn(SEL_OP_I64SATSUB, 7, 2);
  sub->src(0) = src0;
  sub->src(1) = src1;
  sub->dst(0) = dst;
  for (int tmpID = 0; tmpID < 6; ++tmpID)
    sub->dst(tmpID + 1) = tmp[tmpID];
}
/*! Queue a 64-bit integer to float conversion. The 4 temporaries are stored
 *  as destinations 1 to 4, after the real destination
 */
void Selection::Opaque::CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[4]) {
  SelectionInstruction *conv = appendInsn(SEL_OP_CONVI64_TO_F, 5, 1);
  conv->src(0) = src;
  conv->dst(0) = dst;
  for (int tmpID = 0; tmpID < 4; ++tmpID)
    conv->dst(tmpID + 1) = tmp[tmpID];
}
/*! Queue a 64-bit saturated multiply-add with three sources. The 10
 *  temporaries are stored as destinations 1 to 10, after the real destination
 */
void Selection::Opaque::I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister tmp[10]) {
  SelectionInstruction *mad = appendInsn(SEL_OP_I64MADSAT, 11, 3);
  mad->src(0) = src0;
  mad->src(1) = src1;
  mad->src(2) = src2;
  mad->dst(0) = dst;
  for (int tmpID = 0; tmpID < 10; ++tmpID)
    mad->dst(tmpID + 1) = tmp[tmpID];
}
/*! Queue a 64-bit multiply-high. The 10 temporaries are stored as
 *  destinations 1 to 10, after the real destination
 */
void Selection::Opaque::I64_MUL_HI(Reg dst, Reg src0, Reg src1, GenRegister tmp[10]) {
  SelectionInstruction *mulHi = appendInsn(SEL_OP_I64_MUL_HI, 11, 2);
  mulHi->src(0) = src0;
  mulHi->src(1) = src1;
  mulHi->dst(0) = dst;
  for (int tmpID = 0; tmpID < 10; ++tmpID)
    mulHi->dst(tmpID + 1) = tmp[tmpID];
}
/*! Queue a 64-bit HADD instruction. The 4 temporaries are stored as
 *  destinations 1 to 4, after the real destination
 */
void Selection::Opaque::I64HADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]) {
  SelectionInstruction *hadd = appendInsn(SEL_OP_I64HADD, 5, 2);
  hadd->src(0) = src0;
  hadd->src(1) = src1;
  hadd->dst(0) = dst;
  for (int tmpID = 0; tmpID < 4; ++tmpID)
    hadd->dst(tmpID + 1) = tmp[tmpID];
}
/*! Queue a 64-bit RHADD instruction. The 4 temporaries are stored as
 *  destinations 1 to 4, after the real destination
 */
void Selection::Opaque::I64RHADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]) {
  SelectionInstruction *rhadd = appendInsn(SEL_OP_I64RHADD, 5, 2);
  rhadd->src(0) = src0;
  rhadd->src(1) = src1;
  rhadd->dst(0) = dst;
  for (int tmpID = 0; tmpID < 4; ++tmpID)
    rhadd->dst(tmpID + 1) = tmp[tmpID];
}
/*! Queue the 64-bit shift instruction selected by opcode. The 7 temporaries
 *  are stored as destinations 1 to 7, after the real destination
 */
void Selection::Opaque::I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[7]) {
  SelectionInstruction *shift = appendInsn(opcode, 8, 2);
  shift->src(0) = src0;
  shift->src(1) = src1;
  shift->dst(0) = dst;
  for (int tmpID = 0; tmpID < 7; ++tmpID)
    shift->dst(tmpID + 1) = tmp[tmpID];
}
// Boiler plate to initialize the selection library at c++ pre-main
// selLib is the single library instance used by the pattern matcher
static SelectionLibrary *selLib = NULL;
// Releases the library; registered with atexit by the initializer below
static void destroySelectionLibrary(void) { GBE_DELETE(selLib); }
// The constructor of this static object allocates the library and registers
// its destruction
static struct SelectionLibraryInitializer {
SelectionLibraryInitializer(void) {
selLib = GBE_NEW_NO_ARG(SelectionLibrary);
atexit(destroySelectionLibrary);
}
} selectionLibraryInitializer;
/*! Tell whether the IR instruction must be the root of its own code fragment.
 *  It is a root when it has more than one destination, has side effects,
 *  satisfies one of the two isMemberOf() tests, or when its single destination
 *  register belongs to the live-out set of its basic block.
 *  NOTE(review): both isMemberOf() calls appear without template arguments in
 *  this copy of the file -- confirm the intended instruction families
 */
bool Selection::Opaque::isRoot(const ir::Instruction &insn) const {
if (insn.getDstNum() > 1 ||
insn.hasSideEffect() ||
insn.isMemberOf() ||
insn.isMemberOf())
return true;
// No side effect, not a branch and no destination? Impossible
GBE_ASSERT(insn.getDstNum() == 1);
// Root if alive outside the block.
// XXX we should use Value and not registers in liveness info
const ir::BasicBlock *insnBlock = insn.getParent();
const ir::Liveness &liveness = this->ctx.getLiveness();
const ir::Liveness::LiveOut &liveOut = liveness.getLiveOut(insnBlock);
const ir::Register reg = insn.getDst(0);
if (liveOut.contains(reg))
return true;
// The instruction is only used in the current basic block
return false;
}
/*! Build one SelectionDAG node per instruction of the basic block and link
 *  each node to the nodes that produce its sources inside the block. regDAG
 *  maps a register to the node that last wrote it; insnDAG stores the nodes in
 *  instruction order.
 *  \return the number of instructions (and DAG nodes) in the block
 *  NOTE(review): const_cast appears without template argument in this copy of
 *  the file -- confirm against the upstream source
 */
uint32_t Selection::Opaque::buildBasicBlockDAG(const ir::BasicBlock &bb)
{
using namespace ir;
// Clear all registers
for (uint32_t regID = 0; regID < this->regNum; ++regID)
this->regDAG[regID] = NULL;
// Build the DAG on the fly
uint32_t insnNum = 0;
const_cast(bb).foreach([&](const Instruction &insn) {
// Build a selectionDAG node for instruction
SelectionDAG *dag = this->newSelectionDAG(insn);
// Point to non-root children
const uint32_t srcNum = insn.getSrcNum();
for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
const ir::Register reg = insn.getSrc(srcID);
// NULL when the register has not been written earlier in this block
SelectionDAG *child = this->regDAG[reg];
if (child) {
const ir::Instruction &childInsn = child->insn;
const uint32_t childSrcNum = childInsn.getSrcNum();
// We can merge a child only if its sources are still valid
bool mergeable = true;
for (uint32_t otherID = 0; otherID < childSrcNum; ++otherID) {
// Producer seen by the child when it was built vs. producer seen now
const SelectionDAG *srcDAG = child->child[otherID];
const ir::Register srcReg = childInsn.getSrc(otherID);
SelectionDAG *currDAG = this->regDAG[srcReg];
if (srcDAG != currDAG) {
mergeable = false;
break;
}
}
if (mergeable) dag->setAsMergeable(srcID);
dag->child[srcID] = child;
} else
dag->child[srcID] = NULL;
}
// Make it a root if we must
if (this->isRoot(insn)) dag->isRoot = 1;
// Save the DAG <-> instruction mapping
this->insnDAG[insnNum++] = dag;
// Associate all output registers to this instruction
const uint32_t dstNum = insn.getDstNum();
for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
const ir::Register reg = insn.getDst(dstID);
this->regDAG[reg] = dag;
}
});
return insnNum;
}
/*! Emit the code of the current basic block from the DAG nodes stored in
 *  insnDAG. The nodes are visited from the last instruction to the first and
 *  only the root nodes start a code fragment. For each root, the patterns
 *  registered in the selection library for its opcode are tried in order
 *  until one of them emits.
 *  \param insnNum number of valid entries in insnDAG
 */
void Selection::Opaque::matchBasicBlock(uint32_t insnNum)
{
  // Bottom up code generation
  for (int32_t insnID = insnNum-1; insnID >= 0; --insnID) {
    // Process all possible patterns for this instruction
    SelectionDAG &dag = *insnDAG[insnID];
    if (!dag.isRoot)
      continue;
    const ir::Instruction &insn = dag.insn;
    const ir::Opcode opcode = insn.getOpcode();
    auto it = selLib->patterns[opcode].begin();
    const auto end = selLib->patterns[opcode].end();
    // Start a new code fragment
    this->startBackwardGeneration();
    // Try all the patterns from best to worst. The iterator is tested before
    // it is dereferenced so that an opcode with no registered pattern ends up
    // in the assertion below instead of reading past the end of the list
    while (it != end && !(*it)->emit(*this, dag))
      ++it;
    GBE_ASSERT(it != end);
    // Output the code in the current basic block
    this->endBackwardGeneration();
  }
}
void Selection::Opaque::select(void)
{
using namespace ir;
const Function &fn = ctx.getFunction();
// Perform the selection per basic block
fn.foreachBlock([&](const BasicBlock &bb) {
this->dagPool.rewind();
this->appendBlock(bb);
const uint32_t insnNum = this->buildBasicBlockDAG(bb);
this->matchBasicBlock(insnNum);
});
}
/*! Queue a sample instruction. The msgNum message payload registers come
 *  first in the sources, followed by the srcNum registers of src. bti goes in
 *  extra.function and sampler in extra.elem. The destinations form one vector
 *  and the message payloads another one
 */
void Selection::Opaque::SAMPLE(GenRegister *dst, uint32_t dstNum,
                               GenRegister *src, uint32_t srcNum,
                               GenRegister *msgPayloads, uint32_t msgNum,
                               uint32_t bti, uint32_t sampler) {
  SelectionInstruction *sample = appendInsn(SEL_OP_SAMPLE, dstNum, msgNum + srcNum);
  SelectionVector *dstVec = appendVector();
  SelectionVector *msgVec = appendVector();
  sample->extra.function = bti;
  sample->extra.elem = sampler;
  // Operands of the instruction
  for (uint32_t i = 0; i < dstNum; ++i)
    sample->dst(i) = dst[i];
  for (uint32_t i = 0; i < msgNum; ++i)
    sample->src(i) = msgPayloads[i];
  for (uint32_t i = 0; i < srcNum; ++i)
    sample->src(msgNum + i) = src[i];
  // Sends require contiguous allocation
  dstVec->isSrc = 0;
  dstVec->regNum = dstNum;
  dstVec->reg = &sample->dst(0);
  // Only the messages require contiguous registers.
  msgVec->isSrc = 1;
  msgVec->regNum = msgNum;
  msgVec->reg = &sample->src(0);
}
///////////////////////////////////////////////////////////////////////////
// Code selection public implementation
///////////////////////////////////////////////////////////////////////////
/*! Allocate the opaque implementation; the block list stays NULL until
 *  select() is run
 */
Selection::Selection(GenContext &ctx) {
  blockList = NULL;
  opaque = GBE_NEW(Selection::Opaque, ctx);
}
/*! Queue a typed write. The msgNum message registers come first in the
 *  sources, followed by the srcNum registers of src. bti goes in
 *  extra.function and msgNum in extra.elem. The message registers form one
 *  source vector
 */
void Selection::Opaque::TYPED_WRITE(GenRegister *src, uint32_t srcNum,
                                    GenRegister *msgs, uint32_t msgNum,
                                    uint32_t bti) {
  SelectionInstruction *write = appendInsn(SEL_OP_TYPED_WRITE, 0, msgNum + srcNum);
  SelectionVector *msgVec = appendVector();
  // Messages first, then the regular sources
  for (uint32_t i = 0; i < msgNum; ++i)
    write->src(i) = msgs[i];
  for (uint32_t i = 0; i < srcNum; ++i)
    write->src(msgNum + i) = src[i];
  write->extra.function = bti;
  write->extra.elem = msgNum;
  // Sends require contiguous allocation
  msgVec->isSrc = 1;
  msgVec->regNum = msgNum;
  msgVec->reg = &write->src(0);
}
void Selection::Opaque::GET_IMAGE_INFO(uint32_t infoType, GenRegister *dst,
uint32_t dstNum, uint32_t bti) {
SelectionInstruction *insn = this->appendInsn(SEL_OP_GET_IMAGE_INFO, dstNum, 0);
for(uint32_t i = 0; i < dstNum; ++i)
insn->dst(i) = dst[i];
insn->extra.function = bti;
insn->extra.elem = infoType;
}
/*! Release the opaque implementation */
Selection::~Selection(void) {
  GBE_DELETE(this->opaque);
}
void Selection::select(void) {
this->opaque->select();
this->blockList = &this->opaque->blockList;
}
/*! Forward the scalar-or-boolean query to the opaque implementation */
bool Selection::isScalarOrBool(ir::Register reg) const {
  return opaque->isScalarOrBool(reg);
}
uint32_t Selection::getLargestBlockSize(void) const {
return this->opaque->getLargestBlockSize();
}
uint32_t Selection::getVectorNum(void) const {
return this->opaque->getVectorNum();
}
uint32_t Selection::getRegNum(void) const {
return this->opaque->getRegNum();
}
/*! Forward to the opaque implementation */
ir::RegisterFamily Selection::getRegisterFamily(ir::Register reg) const {
  return opaque->getRegisterFamily(reg);
}
/*! Forward to the opaque implementation */
ir::RegisterData Selection::getRegisterData(ir::Register reg) const {
  return opaque->getRegisterData(reg);
}
/*! Forward to the opaque implementation; returns the temporary register it
 *  allocated
 */
ir::Register Selection::replaceSrc(SelectionInstruction *insn, uint32_t regID) {
  return opaque->replaceSrc(insn, regID);
}
/*! Forward to the opaque implementation; returns the temporary register it
 *  allocated
 */
ir::Register Selection::replaceDst(SelectionInstruction *insn, uint32_t regID) {
  return opaque->replaceDst(insn, regID);
}
/*! Forward the spill request to the opaque implementation */
void Selection::spillReg(ir::Register reg, uint32_t registerPool) {
  opaque->spillReg(reg, registerPool);
}
/*! Forward the instruction creation to the opaque implementation */
SelectionInstruction *Selection::create(SelectionOpcode opcode, uint32_t dstNum, uint32_t srcNum) {
  return opaque->create(opcode, dstNum, srcNum);
}
///////////////////////////////////////////////////////////////////////////
// Implementation of all patterns
///////////////////////////////////////////////////////////////////////////
/*! Tell whether the immediate carried by the instruction can be turned into
 *  an immediate register: true unless its type is TYPE_DOUBLE, TYPE_S64 or
 *  TYPE_U64.
 *  NOTE(review): cast appears without template argument in this copy of the
 *  file -- confirm the instruction type it casts to
 */
bool canGetRegisterFromImmediate(const ir::Instruction &insn) {
using namespace ir;
const auto &childInsn = cast(insn);
const auto &imm = childInsn.getImmediate();
if(imm.type != TYPE_DOUBLE && imm.type != TYPE_S64 && imm.type != TYPE_U64)
return true;
return false;
}
/*! Convert an IR immediate into the matching Gen immediate register. 8-bit
 *  values are encoded with the 16-bit immediate constructors. Unsupported
 *  types go through NOT_SUPPORTED and yield a zero uw immediate
 */
GenRegister getRegisterFromImmediate(ir::Immediate imm)
{
  using namespace ir;
  GenRegister immReg;
  switch (imm.type) {
    case TYPE_U32:
      immReg = GenRegister::immud(imm.data.u32);
      break;
    case TYPE_S32:
      immReg = GenRegister::immd(imm.data.s32);
      break;
    case TYPE_FLOAT:
      immReg = GenRegister::immf(imm.data.f32);
      break;
    case TYPE_U16:
      immReg = GenRegister::immuw(imm.data.u16);
      break;
    case TYPE_S16:
      immReg = GenRegister::immw(imm.data.s16);
      break;
    case TYPE_U8:
      immReg = GenRegister::immuw(imm.data.u8);
      break;
    case TYPE_S8:
      immReg = GenRegister::immw(imm.data.s8);
      break;
    case TYPE_DOUBLE:
      immReg = GenRegister::immdf(imm.data.f64);
      break;
    default:
      NOT_SUPPORTED;
      immReg = GenRegister::immuw(0);
      break;
  }
  return immReg;
}
/*! Template for the one-to-many instruction patterns. The derived pattern
 *  provides emitOne(); emit() calls it on the instruction of the DAG node and,
 *  when it succeeds, marks all the children of the node as roots.
 *  NOTE(review): the template parameter list and the template arguments of
 *  isOpcodeFrom, static_cast and ir::cast are missing in this copy of the
 *  file -- confirm against the upstream source
 */
template
class OneToManyPattern : public SelectionPattern
{
public:
/*! Register the pattern for all opcodes of the family */
OneToManyPattern(uint32_t insnNum, uint32_t cost) :
SelectionPattern(insnNum, cost)
{
// Scan every opcode below OP_INVALID and keep those accepted by isOpcodeFrom
for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
if (ir::isOpcodeFrom(ir::Opcode(op)) == true)
this->opcodes.push_back(ir::Opcode(op));
}
/*! Call the child method with the proper prototype */
virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
if (static_cast(this)->emitOne(sel, ir::cast(dag.insn))) {
// Nothing is merged into the node: every source becomes a root
markAllChildren(dag);
return true;
}
return false;
}
};
/*! Declare a naive one-to-many pattern: opens the definition of a struct named
 *  FAMILY##Pattern deriving from OneToManyPattern.
 *  NOTE(review): no template arguments follow OneToManyPattern in this copy of
 *  the file -- confirm against the upstream source
 */
#define DECL_PATTERN(FAMILY) \
struct FAMILY##Pattern : public OneToManyPattern
/*! Declare the constructor of FAMILY##Pattern; it passes INSN_NUM and COST to
 *  the OneToManyPattern constructor
 */
#define DECL_CTOR(FAMILY, INSN_NUM, COST) \
FAMILY##Pattern(void) : OneToManyPattern(INSN_NUM, COST) {}
/*! Unary instruction patterns */
DECL_PATTERN(UnaryInstruction)
{
  /*! Type used to select the registers of a unary instruction: 64-bit and
   *  8-bit integer types are kept, FBH / FBL use U32, 16-bit integer types are
   *  kept and everything else is handled as float
   */
  static ir::Type getType(const ir::Opcode opcode, const ir::Type insnType) {
    const bool is64or8 = insnType == ir::TYPE_S64 || insnType == ir::TYPE_U64 ||
                         insnType == ir::TYPE_S8 || insnType == ir::TYPE_U8;
    if (is64or8)
      return insnType;
    const bool isFindBit = opcode == ir::OP_FBH || opcode == ir::OP_FBL;
    if (isFindBit)
      return ir::TYPE_U32;
    const bool is16 = insnType == ir::TYPE_S16 || insnType == ir::TYPE_U16;
    return is16 ? insnType : ir::TYPE_FLOAT;
  }

  /*! Emit the Gen code for one unary IR instruction. Always returns true */
  INLINE bool emitOne(Selection::Opaque &sel, const ir::UnaryInstruction &insn) const {
    const ir::Opcode opcode = insn.getOpcode();
    const ir::Type insnType = insn.getType();
    const ir::Type regType = getType(opcode, insnType);
    const GenRegister dst = sel.selReg(insn.getDst(0), regType);
    const GenRegister src = sel.selReg(insn.getSrc(0), regType);
    switch (opcode) {
      case ir::OP_ABS:
        if (insn.getType() != ir::TYPE_S32) {
          GBE_ASSERT(insn.getType() == ir::TYPE_FLOAT);
          sel.MOV(dst, GenRegister::abs(src));
        } else {
          // Signed integers: work on D-typed views of both registers
          const GenRegister src_ = GenRegister::retype(src, GEN_TYPE_D);
          const GenRegister dst_ = GenRegister::retype(dst, GEN_TYPE_D);
          sel.MOV(dst_, GenRegister::abs(src_));
        }
        break;
      case ir::OP_MOV:
        if (!dst.isdf()) {
          sel.MOV(dst, src);
        } else {
          // DF moves take an extra QWORD register
          ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
          sel.MOV_DF(dst, src, sel.selReg(r));
        }
        break;
      // Rounding
      case ir::OP_RNDD: sel.RNDD(dst, src); break;
      case ir::OP_RNDE: sel.RNDE(dst, src); break;
      case ir::OP_RNDU: sel.RNDU(dst, src); break;
      case ir::OP_RNDZ: sel.RNDZ(dst, src); break;
      // FBH / FBL
      case ir::OP_FBH: sel.FBH(dst, src); break;
      case ir::OP_FBL: sel.FBL(dst, src); break;
      // Extended math
      case ir::OP_COS: sel.MATH(dst, GEN_MATH_FUNCTION_COS, src); break;
      case ir::OP_SIN: sel.MATH(dst, GEN_MATH_FUNCTION_SIN, src); break;
      case ir::OP_LOG: sel.MATH(dst, GEN_MATH_FUNCTION_LOG, src); break;
      case ir::OP_SQR: sel.MATH(dst, GEN_MATH_FUNCTION_SQRT, src); break;
      case ir::OP_RSQ: sel.MATH(dst, GEN_MATH_FUNCTION_RSQ, src); break;
      case ir::OP_RCP: sel.MATH(dst, GEN_MATH_FUNCTION_INV, src); break;
      default: NOT_SUPPORTED;
    }
    return true;
  }
  DECL_CTOR(UnaryInstruction, 1, 1)
};
BVAR(OCL_OPTIMIZE_IMMEDIATE, true);
/*! Binary regular instruction pattern. Handles DIV/REM and POW through
 *  dedicated paths and every other binary opcode through a switch that picks
 *  either the plain Gen instruction or its 64-bit emulation helper. */
class BinaryInstructionPattern : public SelectionPattern
{
public:
// Registers the pattern for every opcode accepted by ir::isOpcodeFrom.
// NOTE(review): isOpcodeFrom / cast appear here without an explicit template
// argument -- it may have been lost in extraction; confirm against upstream.
BinaryInstructionPattern(void) : SelectionPattern(1,1) {
for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
if (ir::isOpcodeFrom(ir::Opcode(op)) == true)
this->opcodes.push_back(ir::Opcode(op));
}
/*! Emit a division (op == OP_DIV) or a remainder (any other op value).
 *  - byte / word: both sources are widened into DWORD temporaries, the integer
 *    MATH runs on them and the result is moved back through an unpacked view.
 *  - S32 / U32: a single integer MATH.
 *  - float: FDIV only (a REM request asserts).
 *  - S64 / U64: I64DIV / I64REM with 13 UD temporaries plus one BOOL register.
 *  All children are marked as roots and true is returned.
 *  NOTE(review): a type matching none of the branches emits nothing and still
 *  returns true -- confirm this cannot be reached. */
bool emitDivRemInst(Selection::Opaque &sel, SelectionDAG &dag, ir::Opcode op) const
{
using namespace ir;
const ir::BinaryInstruction &insn = cast(dag.insn);
const Type type = insn.getType();
GenRegister dst = sel.selReg(insn.getDst(0), type);
GenRegister src0 = sel.selReg(insn.getSrc(0), type);
GenRegister src1 = sel.selReg(insn.getSrc(1), type);
const uint32_t simdWidth = sel.curr.execWidth;
const RegisterFamily family = getFamily(type);
// Integer MATH function: quotient for DIV, remainder otherwise
uint32_t function = (op == OP_DIV)?
GEN_MATH_FUNCTION_INT_DIV_QUOTIENT :
GEN_MATH_FUNCTION_INT_DIV_REMAINDER;
//bytes and shorts must be converted to int for DIV and REM per GEN restriction
if((family == FAMILY_WORD || family == FAMILY_BYTE)) {
GenRegister tmp0, tmp1;
// `reg` backs tmp0 and is re-read below through the unpacked view
ir::Register reg = sel.reg(FAMILY_DWORD);
tmp0 = GenRegister::udxgrf(simdWidth, reg);
tmp0 = GenRegister::retype(tmp0, GEN_TYPE_D);
sel.MOV(tmp0, src0);
tmp1 = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
tmp1 = GenRegister::retype(tmp1, GEN_TYPE_D);
sel.MOV(tmp1, src1);
// The result overwrites tmp0
sel.MATH(tmp0, function, tmp0, tmp1);
GenRegister unpacked;
if(family == FAMILY_WORD) {
unpacked = GenRegister::unpacked_uw(reg);
} else {
unpacked = GenRegister::unpacked_ub(reg);
}
unpacked = GenRegister::retype(unpacked, getGenType(type));
sel.MOV(dst, unpacked);
} else if (type == TYPE_S32 || type == TYPE_U32 ) {
sel.MATH(dst, function, src0, src1);
} else if(type == TYPE_FLOAT) {
GBE_ASSERT(op != OP_REM);
sel.MATH(dst, GEN_MATH_FUNCTION_FDIV, src0, src1);
} else if (type == TYPE_S64 || type == TYPE_U64) {
// tmp[0..12] are UD DWORD temporaries, tmp[13] is a BOOL register
GenRegister tmp[14];
for(int i=0; i<13; i++) {
tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
tmp[i].type = GEN_TYPE_UD;
}
tmp[13] = sel.selReg(sel.reg(FAMILY_BOOL));
if(op == OP_DIV)
sel.I64DIV(dst, src0, src1, tmp);
else
sel.I64REM(dst, src0, src1, tmp);
}
markAllChildren(dag);
return true;
}
/*! Emit one binary instruction. Returns true when the instruction was
 *  selected; returns false for a 32-bit integer OP_MUL, which this pattern
 *  does not emit. */
INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
{
using namespace ir;
const ir::BinaryInstruction &insn = cast(dag.insn);
const Opcode opcode = insn.getOpcode();
const Type type = insn.getType();
GenRegister dst = sel.selReg(insn.getDst(0), type);
// DIV and REM have their own emitter
if(opcode == OP_DIV || opcode == OP_REM) {
return this->emitDivRemInst(sel, dag, opcode);
}
// Immediates not supported
if (opcode == OP_POW) {
GenRegister src0 = sel.selReg(insn.getSrc(0), type);
GenRegister src1 = sel.selReg(insn.getSrc(1), type);
if(type == TYPE_FLOAT) {
sel.MATH(dst, GEN_MATH_FUNCTION_POW, src0, src1);
} else {
NOT_IMPLEMENTED;
}
markAllChildren(dag);
return true;
}
// Everything below runs inside a pushed state; each exit path pops it
sel.push();
// Boolean values use scalars
if (sel.isScalarOrBool(insn.getDst(0)) == true) {
sel.curr.execWidth = 1;
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.curr.noMask = 1;
}
// Look for immediate values
GenRegister src0, src1;
SelectionDAG *dag0 = dag.child[0];
SelectionDAG *dag1 = dag.child[1];
// Right source can always be an immediate
if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag1->insn)) {
const auto &childInsn = cast(dag1->insn);
src0 = sel.selReg(insn.getSrc(0), type);
src1 = getRegisterFromImmediate(childInsn.getImmediate());
// Only the non-folded child is marked as a root
if (dag0) dag0->isRoot = 1;
}
// Left source cannot be immediate but it is OK if we can commute
else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && insn.commutes() && dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn)) {
const auto &childInsn = cast(dag0->insn);
// Operands are swapped: IR source 1 becomes src0, the immediate becomes src1
src0 = sel.selReg(insn.getSrc(1), type);
src1 = getRegisterFromImmediate(childInsn.getImmediate());
if (dag1) dag1->isRoot = 1;
}
// Just grab the two sources
else {
src0 = sel.selReg(insn.getSrc(0), type);
src1 = sel.selReg(insn.getSrc(1), type);
markAllChildren(dag);
}
// Output the binary instruction
switch (opcode) {
case OP_ADD:
if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
GenRegister t = sel.selReg(sel.reg(RegisterFamily::FAMILY_QWORD), Type::TYPE_S64);
sel.I64ADD(dst, src0, src1, t);
} else
sel.ADD(dst, src0, src1);
break;
case OP_ADDSAT:
if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
// 5 UD temporaries followed by one BOOL register
GenRegister tmp[6];
for(int i=0; i<5; i++) {
tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
tmp[i].type = GEN_TYPE_UD;
}
tmp[5] = sel.selReg(sel.reg(FAMILY_BOOL));
sel.I64SATADD(dst, src0, src1, tmp);
break;
}
// Other types: plain ADD with the saturate bit set in a nested state
sel.push();
sel.curr.saturate = GEN_MATH_SATURATE_SATURATE;
sel.ADD(dst, src0, src1);
sel.pop();
break;
case OP_XOR:
if (type == Type::TYPE_U64 || type == Type::TYPE_S64)
sel.I64XOR(dst, src0, src1);
else
sel.XOR(dst, src0, src1);
break;
case OP_OR:
if (type == Type::TYPE_U64 || type == Type::TYPE_S64)
sel.I64OR(dst, src0, src1);
else
sel.OR(dst, src0, src1);
break;
case OP_AND:
if (type == Type::TYPE_U64 || type == Type::TYPE_S64)
sel.I64AND(dst, src0, src1);
else
sel.AND(dst, src0, src1);
break;
case OP_SUB:
if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
GenRegister t = sel.selReg(sel.reg(RegisterFamily::FAMILY_QWORD), Type::TYPE_S64);
sel.I64SUB(dst, src0, src1, t);
} else
// Subtraction is emitted as an ADD of the negated second source
sel.ADD(dst, src0, GenRegister::negate(src1));
break;
case OP_SUBSAT:
if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
// 5 UD temporaries followed by one BOOL register
GenRegister tmp[6];
for(int i=0; i<5; i++) {
tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
tmp[i].type = GEN_TYPE_UD;
}
tmp[5] = sel.selReg(sel.reg(FAMILY_BOOL));
sel.I64SATSUB(dst, src0, src1, tmp);
break;
}
sel.push();
sel.curr.saturate = GEN_MATH_SATURATE_SATURATE;
sel.ADD(dst, src0, GenRegister::negate(src1));
sel.pop();
break;
// The three 64-bit shifts each take 6 DWORD temporaries plus one BOOL register
case OP_SHL:
if (type == TYPE_S64 || type == TYPE_U64) {
GenRegister tmp[7];
for(int i = 0; i < 6; i ++)
tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
tmp[6] = sel.selReg(sel.reg(FAMILY_BOOL));
sel.I64SHL(dst, src0, src1, tmp);
} else
sel.SHL(dst, src0, src1);
break;
case OP_SHR:
if (type == TYPE_S64 || type == TYPE_U64) {
GenRegister tmp[7];
for(int i = 0; i < 6; i ++)
tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
tmp[6] = sel.selReg(sel.reg(FAMILY_BOOL));
sel.I64SHR(dst, src0, src1, tmp);
} else
sel.SHR(dst, src0, src1);
break;
case OP_ASR:
if (type == TYPE_S64 || type == TYPE_U64) {
GenRegister tmp[7];
for(int i = 0; i < 6; i ++)
tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
tmp[6] = sel.selReg(sel.reg(FAMILY_BOOL));
sel.I64ASR(dst, src0, src1, tmp);
} else
sel.ASR(dst, src0, src1);
break;
case OP_MUL_HI: {
GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
sel.MUL_HI(dst, src0, src1, temp);
break;
}
case OP_I64_MUL_HI:
{
// 9 UD temporaries followed by one BOOL register
GenRegister temp[10];
for(int i=0; i<9; i++) {
temp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
temp[i].type = GEN_TYPE_UD;
}
temp[9] = sel.selReg(sel.reg(FAMILY_BOOL));
sel.I64_MUL_HI(dst, src0, src1, temp);
break;
}
case OP_MUL:
if (type == TYPE_U32 || type == TYPE_S32) {
// 32-bit integer multiplies are not emitted here: restore the state and
// report failure so another pattern registered for OP_MUL can match
sel.pop();
return false;
} else if (type == TYPE_S64 || type == TYPE_U64) {
GenRegister tmp[6];
for(int i = 0; i < 6; i++)
tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
sel.I64MUL(dst, src0, src1, tmp);
} else
sel.MUL(dst, src0, src1);
break;
case OP_HADD: {
GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_D);
sel.HADD(dst, src0, src1, temp);
break;
}
case OP_RHADD: {
GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_D);
sel.RHADD(dst, src0, src1, temp);
break;
}
case OP_I64HADD:
{
GenRegister tmp[4];
for(int i=0; i<4; i++)
tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
sel.I64HADD(dst, src0, src1, tmp);
break;
}
case OP_I64RHADD:
{
GenRegister tmp[4];
for(int i=0; i<4; i++)
tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
sel.I64RHADD(dst, src0, src1, tmp);
break;
}
case OP_UPSAMPLE_SHORT:
sel.UPSAMPLE_SHORT(dst, src0, src1);
break;
case OP_UPSAMPLE_INT:
sel.UPSAMPLE_INT(dst, src0, src1);
break;
case OP_UPSAMPLE_LONG:
sel.UPSAMPLE_LONG(dst, src0, src1);
break;
default: NOT_IMPLEMENTED;
}
sel.pop();
return true;
}
};
/*! MAD pattern: fuses a float ADD with a MUL feeding one of its sources into
 *  a single MAD instruction. */
class MulAddInstructionPattern : public SelectionPattern
{
public:
/*! Register the pattern for all opcodes of the family */
MulAddInstructionPattern(void) : SelectionPattern(2, 1) {
this->opcodes.push_back(ir::OP_ADD);
}
/*! Implements base class. Returns false (no match) when register pressure is
 *  being limited, when the ADD is not float, or when neither child is a MUL. */
virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
{
using namespace ir;
// MAD tend to increase liveness of the sources (since there are three of
// them). TODO refine this strategy. Well, we should be able at least to
// evaluate per basic block register pressure and selectively enable
// disable MADs
if (sel.ctx.limitRegisterPressure)
return false;
// We are good to try. We need a MUL for one of the two sources
const ir::BinaryInstruction &insn = cast(dag.insn);
if (insn.getType() != TYPE_FLOAT)
return false;
SelectionDAG *child0 = dag.child[0];
SelectionDAG *child1 = dag.child[1];
const GenRegister dst = sel.selReg(insn.getDst(0), TYPE_FLOAT);
// Case 1: the MUL feeds source 0 of the ADD; the addend is ADD source 1
if (child0 && child0->insn.getOpcode() == OP_MUL) {
GBE_ASSERT(cast(child0->insn).getType() == TYPE_FLOAT);
const GenRegister src0 = sel.selReg(child0->insn.getSrc(0), TYPE_FLOAT);
const GenRegister src1 = sel.selReg(child0->insn.getSrc(1), TYPE_FLOAT);
const GenRegister src2 = sel.selReg(insn.getSrc(1), TYPE_FLOAT);
sel.MAD(dst, src2, src0, src1); // order different on HW!
// The MUL is consumed: its own children and the other ADD child become roots
if (child0->child[0]) child0->child[0]->isRoot = 1;
if (child0->child[1]) child0->child[1]->isRoot = 1;
if (child1) child1->isRoot = 1;
return true;
}
// Case 2: the MUL feeds source 1 of the ADD; the addend is ADD source 0
if (child1 && child1->insn.getOpcode() == OP_MUL) {
GBE_ASSERT(cast(child1->insn).getType() == TYPE_FLOAT);
const GenRegister src0 = sel.selReg(child1->insn.getSrc(0), TYPE_FLOAT);
const GenRegister src1 = sel.selReg(child1->insn.getSrc(1), TYPE_FLOAT);
const GenRegister src2 = sel.selReg(insn.getSrc(0), TYPE_FLOAT);
sel.MAD(dst, src2, src0, src1); // order different on HW!
if (child1->child[0]) child1->child[0]->isRoot = 1;
if (child1->child[1]) child1->child[1]->isRoot = 1;
if (child0) child0->isRoot = 1;
return true;
}
return false;
}
};
/*! sel.{le,l,ge...} like patterns: merges a compare and the select that
 *  consumes its flag into one SEL_CMP when both use the same two sources. */
class SelectModifierInstructionPattern : public SelectionPattern
{
public:
/*! Register the pattern for all opcodes of the family */
SelectModifierInstructionPattern(void) : SelectionPattern(2, 1) {
this->opcodes.push_back(ir::OP_SEL);
}
/*! Implements base class. Returns false when the select is 64-bit, when the
 *  predicate producer is not in this DAG, when it is not a compare, or when
 *  the compare and select sources differ. */
virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
{
using namespace ir;
// Child 0 of the select is the producer of its predicate
SelectionDAG *cmp = dag.child[0];
const SelectInstruction &insn = cast(dag.insn);
if (insn.getType() == TYPE_S64 || insn.getType() == TYPE_U64) // not support
return false;
// Not in this block
if (cmp == NULL) return false;
// We need to match a compare
if (cmp->insn.isMemberOf() == false) return false;
// We look for something like that:
// cmp.{le,ge...} flag src0 src1
// sel dst flag src0 src1
// So both sources must match
if (sourceMatch(cmp, 0, &dag, 1) == false) return false;
if (sourceMatch(cmp, 1, &dag, 2) == false) return false;
// OK, we merge the instructions
const ir::CompareInstruction &cmpInsn = cast(cmp->insn);
const ir::Opcode opcode = cmpInsn.getOpcode();
const uint32_t genCmp = getGenCompare(opcode);
// Like for regular selects, we need a temporary since we cannot predicate
// properly
const ir::Type type = cmpInsn.getType();
const RegisterFamily family = getFamily(type);
const GenRegister tmp = sel.selReg(sel.reg(family), type);
const uint32_t simdWidth = sel.curr.execWidth;
const GenRegister dst = sel.selReg(insn.getDst(0), type);
const GenRegister src0 = sel.selReg(cmpInsn.getSrc(0), type);
const GenRegister src1 = sel.selReg(cmpInsn.getSrc(1), type);
// The SEL_CMP into the temporary runs with predication disabled
sel.push();
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.curr.execWidth = simdWidth;
sel.SEL_CMP(genCmp, tmp, src0, src1);
sel.pop();
// Update the destination register properly now
sel.MOV(dst, tmp);
// We need the sources of the compare instruction
markAllChildren(*cmp);
return true;
}
};
/*! 32 bits integer multiply needs more instructions: the product is built in
 *  the accumulator with a MUL + MACH pair per 8 lanes and then moved out. */
class Int32x32MulInstructionPattern : public SelectionPattern
{
public:
/*! Register the pattern for all opcodes of the family */
Int32x32MulInstructionPattern(void) : SelectionPattern(1, 4) {
this->opcodes.push_back(ir::OP_MUL);
}
/*! Implements base class. Only U32 / S32 multiplies are handled; any other
 *  type returns false. */
virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
{
using namespace ir;
const ir::BinaryInstruction &insn = cast(dag.insn);
// Width captured before it is forced to 8 below
const uint32_t simdWidth = sel.curr.execWidth;
const Type type = insn.getType();
if (type == TYPE_U32 || type == TYPE_S32) {
GenRegister dst = sel.selReg(insn.getDst(0), type);
GenRegister src0 = sel.selReg(insn.getSrc(0), type);
GenRegister src1 = sel.selReg(insn.getSrc(1), type);
sel.push();
// Either left part of the 16-wide register or just a simd 8 register
dst = GenRegister::retype(dst, GEN_TYPE_D);
src0 = GenRegister::retype(src0, GEN_TYPE_D);
src1 = GenRegister::retype(src1, GEN_TYPE_D);
sel.curr.execWidth = 8;
sel.curr.quarterControl = GEN_COMPRESSION_Q1;
sel.MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), src0, src1);
// accWrEnable is raised only around the MACH
sel.curr.accWrEnable = 1;
sel.MACH(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), src0, src1);
sel.curr.accWrEnable = 0;
// The accumulator is copied out through an F-typed view of dst
sel.MOV(GenRegister::retype(dst, GEN_TYPE_F), GenRegister::acc());
// Right part of the 16-wide register now
if (simdWidth == 16) {
// Save predicate / noMask: the accumulator sequence runs unpredicated
int predicate = sel.curr.predicate;
int noMask = sel.curr.noMask;
sel.curr.noMask = 1;
sel.curr.predicate = GEN_PREDICATE_NONE;
const GenRegister nextSrc0 = sel.selRegQn(insn.getSrc(0), 1, TYPE_S32);
const GenRegister nextSrc1 = sel.selRegQn(insn.getSrc(1), 1, TYPE_S32);
sel.MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), nextSrc0, nextSrc1);
sel.curr.accWrEnable = 1;
sel.MACH(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), nextSrc0, nextSrc1);
sel.curr.accWrEnable = 0;
sel.curr.quarterControl = GEN_COMPRESSION_Q2;
if (predicate != GEN_PREDICATE_NONE || noMask != 1) {
// The caller state was predicated or masked: copy the accumulator into a
// temporary first, then restore the saved state for the write to dst
const ir::Register reg = sel.reg(FAMILY_DWORD);
sel.MOV(GenRegister::f8grf(reg), GenRegister::acc());
sel.curr.noMask = noMask;;
sel.curr.predicate = predicate;
sel.MOV(GenRegister::retype(GenRegister::next(dst), GEN_TYPE_F),
GenRegister::f8grf(reg));
} else
sel.MOV(GenRegister::retype(GenRegister::next(dst), GEN_TYPE_F), GenRegister::acc());
}
sel.pop();
// All children are marked as root
markAllChildren(dag);
return true;
} else
return false;
}
};
/*! 32x16 bits integer can be done in one instruction: matches 32-bit integer
 *  multiplies where one operand is a listed special register or an immediate
 *  that fits in 16 bits. */
class Int32x16MulInstructionPattern : public SelectionPattern
{
public:
/*! Register the pattern for all opcodes of the family */
Int32x16MulInstructionPattern(void) : SelectionPattern(1, 1) {
this->opcodes.push_back(ir::OP_MUL);
}
/*! True when reg is one of ocl::lid0..2 or ocl::lsize0..2 */
bool is16BitSpecialReg(ir::Register reg) const {
if (reg == ir::ocl::lid0 ||
reg == ir::ocl::lid1 ||
reg == ir::ocl::lid2 ||
reg == ir::ocl::lsize0 ||
reg == ir::ocl::lsize1||
reg == ir::ocl::lsize2)
return true;
else
return false;
}
/*! Try to emit a multiply where child childID is a 16 immediate.
 *  Returns true when a MUL was emitted, false when child childID is absent,
 *  is not a LOADI, or its value does not fit in 16 bits. */
bool emitMulImmediate(Selection::Opaque &sel, SelectionDAG &dag, uint32_t childID) const {
using namespace ir;
const ir::BinaryInstruction &insn = cast(dag.insn);
const Register dst = insn.getDst(0);
// childID ^ 1 selects the other source (0 <-> 1)
const Register src1 = insn.getSrc(childID ^ 1);
const SelectionDAG *src0DAG = dag.child[childID];
if (src0DAG != NULL) {
if (src0DAG->insn.getOpcode() == OP_LOADI) {
const auto &loadimm = cast(src0DAG->insn);
const Immediate imm = loadimm.getImmediate();
const Type type = imm.type;
GBE_ASSERT(type == TYPE_U32 || type == TYPE_S32);
// Unsigned immediate that fits in an unsigned word
if (type == TYPE_U32 && imm.data.u32 <= 0xffff) {
sel.MUL(sel.selReg(dst, type),
sel.selReg(src1, type),
GenRegister::immuw(imm.data.u32));
// Only the non-immediate child is marked as a root
if (dag.child[childID ^ 1] != NULL)
dag.child[childID ^ 1]->isRoot = 1;
return true;
}
// Signed immediate that fits in a signed word
if (type == TYPE_S32 && (imm.data.s32 >= -32768 && imm.data.s32 <= 32767)) {
sel.MUL(sel.selReg(dst, type),
sel.selReg(src1, type),
GenRegister::immw(imm.data.s32));
if (dag.child[childID ^ 1] != NULL)
dag.child[childID ^ 1]->isRoot = 1;
return true;
}
}
}
return false;
}
/*! Try to emit a multiply with a 16 bit special register.
 *  Returns true when source childID is such a register and a MUL was emitted. */
bool emitMulSpecialReg(Selection::Opaque &sel, SelectionDAG &dag, uint32_t childID) const {
using namespace ir;
const BinaryInstruction &insn = cast(dag.insn);
const Type type = insn.getType();
const Register dst = insn.getDst(0);
const Register src0 = insn.getSrc(childID);
const Register src1 = insn.getSrc(childID ^ 1);
if (is16BitSpecialReg(src0)) {
// The special register is passed as the second MUL operand, selected as U32
sel.MUL(sel.selReg(dst, type),
sel.selReg(src1, type),
sel.selReg(src0, TYPE_U32));
markAllChildren(dag);
return true;
}
return false;
}
/*! Implements base class. For U32 / S32 multiplies, tries the special
 *  register form on each source and then the immediate form on each child;
 *  returns false when none applies or the type is different. */
virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
{
using namespace ir;
const BinaryInstruction &insn = cast(dag.insn);
const Type type = insn.getType();
if (type == TYPE_U32 || type == TYPE_S32) {
if (this->emitMulSpecialReg(sel, dag, 0))
return true;
if (this->emitMulSpecialReg(sel, dag, 1))
return true;
if (this->emitMulImmediate(sel, dag, 0))
return true;
if (this->emitMulImmediate(sel, dag, 1))
return true;
}
return false;
}
};
// Helper that declares a <FAMILY>Pattern whose emitOne() hits NOT_IMPLEMENTED
// and returns false. It is defined and then immediately undefined without
// being instantiated in between, so it currently generates no code.
#define DECL_NOT_IMPLEMENTED_ONE_TO_MANY(FAMILY) \
struct FAMILY##Pattern : public OneToManyPattern\
{\
INLINE bool emitOne(Selection::Opaque &sel, const ir::FAMILY &insn) const {\
NOT_IMPLEMENTED;\
return false;\
}\
DECL_CTOR(FAMILY, 1, 1); \
}
#undef DECL_NOT_IMPLEMENTED_ONE_TO_MANY
/*! Load immediate pattern: moves a constant into the destination register,
 *  with a dedicated path per immediate type. */
DECL_PATTERN(LoadImmInstruction)
{
/*! Emit the load of one immediate. Always returns true; unsupported types hit
 *  NOT_SUPPORTED. */
INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadImmInstruction &insn) const
{
using namespace ir;
const Type type = insn.getType();
const Immediate imm = insn.getImmediate();
const GenRegister dst = sel.selReg(insn.getDst(0), type);
// Source operand for TYPE_BOOL; only assigned inside the scalar/bool branch.
// NOTE(review): presumably a bool destination always satisfies
// isScalarOrBool, otherwise flagReg is used default-constructed -- confirm.
GenRegister flagReg;
sel.push();
if (sel.isScalarOrBool(insn.getDst(0)) == true) {
sel.curr.execWidth = 1;
if(type == TYPE_BOOL) {
if(imm.data.b) {
// true: all ones when unpredicated, otherwise the current flag
// (physical flag register, or the virtual register named by flagIndex)
if(sel.curr.predicate == GEN_PREDICATE_NONE)
flagReg = GenRegister::immuw(0xffff);
else {
if(sel.curr.physicalFlag)
flagReg = GenRegister::flag(sel.curr.flag, sel.curr.subFlag);
else
flagReg = sel.selReg(Register(sel.curr.flagIndex), TYPE_U16);
}
} else
// false: all zeros
flagReg = GenRegister::immuw(0x0);
}
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.curr.noMask = 1;
}
switch (type) {
case TYPE_BOOL:
sel.MOV(dst, flagReg);
break;
// 32-bit integers share the float path: dst is retyped to F and the value is
// read through imm.data.f32.
// NOTE(review): presumably imm.data is a union so the integer bit pattern is
// carried unchanged -- confirm.
case TYPE_U32:
case TYPE_S32:
case TYPE_FLOAT:
sel.MOV(GenRegister::retype(dst, GEN_TYPE_F),
GenRegister::immf(imm.data.f32));
break;
// 8-bit values are loaded through word immediates
case TYPE_U16: sel.MOV(dst, GenRegister::immuw(imm.data.u16)); break;
case TYPE_S16: sel.MOV(dst, GenRegister::immw(imm.data.s16)); break;
case TYPE_U8: sel.MOV(dst, GenRegister::immuw(imm.data.u8)); break;
case TYPE_S8: sel.MOV(dst, GenRegister::immw(imm.data.s8)); break;
case TYPE_DOUBLE: sel.LOAD_DF_IMM(dst, GenRegister::immdf(imm.data.f64), sel.selReg(sel.reg(FAMILY_QWORD))); break;
case TYPE_S64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.data.s64)); break;
case TYPE_U64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.data.u64)); break;
default: NOT_SUPPORTED;
}
sel.pop();
return true;
}
DECL_CTOR(LoadImmInstruction, 1,1);
};
/*! Sync instruction: optional FENCE, then BARRIER followed by WAIT */
DECL_PATTERN(SyncInstruction)
{
/*! Emit the synchronization sequence for one sync instruction. Always
 *  returns true. */
INLINE bool emitOne(Selection::Opaque &sel, const ir::SyncInstruction &insn) const
{
using namespace ir;
// Temporary that receives the barrier payload
const ir::Register reg = sel.reg(FAMILY_DWORD);
const uint32_t params = insn.getParameters();
// A FENCE is emitted only when params is exactly syncGlobalBarrier
if(params == syncGlobalBarrier) {
const ir::Register fenceDst = sel.reg(FAMILY_DWORD);
sel.FENCE(sel.selReg(fenceDst, ir::TYPE_U32));
}
sel.push();
sel.curr.predicate = GEN_PREDICATE_NONE;
// As only the payload.2 is used and all the other regions are ignored
// SIMD8 mode here is safe.
sel.curr.execWidth = 8;
sel.curr.physicalFlag = 0;
sel.curr.noMask = 1;
// Copy barrier id from r0.
sel.AND(GenRegister::ud8grf(reg), GenRegister::ud1grf(ir::ocl::barrierid), GenRegister::immud(0x0f000000));
// A barrier is OK to start the thread synchronization *and* SLM fence
sel.BARRIER(GenRegister::f8grf(reg));
// Now we wait for the other threads
sel.curr.execWidth = 1;
sel.WAIT();
sel.pop();
return true;
}
DECL_CTOR(SyncInstruction, 1,1);
};
/*! Translate an IR value type into the matching GEN_BYTE_SCATTER_* element
 *  size: QWORD for double and 64-bit integers, DWORD for float and 32-bit
 *  integers, WORD for 16-bit integers and BYTE for 8-bit integers. Any other
 *  type triggers NOT_SUPPORTED and falls back to the BYTE size. */
INLINE uint32_t getByteScatterGatherSize(ir::Type type) {
  using namespace ir;
  if (type == TYPE_DOUBLE || type == TYPE_S64 || type == TYPE_U64)
    return GEN_BYTE_SCATTER_QWORD;
  if (type == TYPE_FLOAT || type == TYPE_U32 || type == TYPE_S32)
    return GEN_BYTE_SCATTER_DWORD;
  if (type == TYPE_U16 || type == TYPE_S16)
    return GEN_BYTE_SCATTER_WORD;
  const bool isByte = (type == TYPE_U8 || type == TYPE_S8);
  if (!isByte) {
    NOT_SUPPORTED;
  }
  return GEN_BYTE_SCATTER_BYTE;
}
/*! Load instruction pattern: picks a read message from the address space,
 *  the alignment flag and the element size of the load. */
DECL_PATTERN(LoadInstruction)
{
/*! Read all valueNum destinations with one UNTYPED_READ; every destination is
 *  viewed as float. */
void emitUntypedRead(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister addr,
uint32_t bti) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
vector dst(valueNum);
for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F);
sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
}
/*! Read a single value with DWORD_GATHER, which is given the byte address
 *  shifted right by 2. */
void emitDWordGather(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister addr,
uint32_t bti) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
const uint32_t simdWidth = sel.ctx.getSimdWidth();
GBE_ASSERT(valueNum == 1);
GenRegister dst = GenRegister::retype(sel.selReg(insn.getValue(0)), GEN_TYPE_F);
// get dword based address
GenRegister addrDW = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
sel.DWORD_GATHER(dst, addrDW, bti);
}
/*! Read a single 64-bit value with READ64. The register array handed to
 *  READ64 holds the DWORD temporaries first and the destination last. */
void emitRead64(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister addr,
uint32_t bti) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
uint32_t dstID;
/* XXX support scalar only right now. */
GBE_ASSERT(valueNum == 1);
// The first 16 DWORD register space is for temporary usage at encode stage.
// Two temporaries per value in SIMD8, one otherwise
uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
GenRegister dst[valueNum + tmpRegNum];
for (dstID = 0; dstID < tmpRegNum ; ++dstID)
dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD));
// dstID keeps counting, so the values are stored right after the temporaries
for ( uint32_t valueID = 0; valueID < valueNum; ++dstID, ++valueID)
dst[dstID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
sel.READ64(addr, sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64), dst, valueNum + tmpRegNum, valueNum, bti);
}
/*! Read one byte or word with BYTE_GATHER into a DWORD temporary, then repack
 *  it into `value` with a converting MOV.
 *  NOTE(review): for any other elemSize (DWORD / QWORD) no instruction is
 *  emitted at all -- confirm such loads never reach this function. */
void emitByteGather(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
const uint32_t elemSize,
GenRegister address,
GenRegister value,
uint32_t bti) const
{
using namespace ir;
GBE_ASSERT(insn.getValueNum() == 1);
const uint32_t simdWidth = sel.ctx.getSimdWidth();
// We need a temporary register if we read bytes or words
Register dst = Register(value.value.reg);
if (elemSize == GEN_BYTE_SCATTER_WORD ||
elemSize == GEN_BYTE_SCATTER_BYTE) {
dst = sel.reg(FAMILY_DWORD);
sel.BYTE_GATHER(GenRegister::fxgrf(simdWidth, dst), address, elemSize, bti);
}
// Repack bytes or words using a converting mov instruction
if (elemSize == GEN_BYTE_SCATTER_WORD)
sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), GenRegister::unpacked_uw(dst));
else if (elemSize == GEN_BYTE_SCATTER_BYTE)
sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), GenRegister::unpacked_ub(dst));
}
/*! Load a single value with INDIRECT_MOVE, using the address register as the
 *  source operand. It is not called from emitOne below. */
void emitIndirectMove(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister address) const
{
using namespace ir;
GBE_ASSERT(insn.getValueNum() == 1); //todo: handle vec later
const GenRegister dst = sel.selReg(insn.getValue(0), insn.getValueType());
const GenRegister src = address;
sel.INDIRECT_MOVE(dst, src);
}
/*! Dispatch one load to the matching helper. Binding table indices used:
 *  0x2 for the constant space, 0xfe for the local space, and for the
 *  remaining spaces 0x00 on the aligned paths and 0x01 on the byte gather
 *  path. Always returns true. */
INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadInstruction &insn) const {
using namespace ir;
const GenRegister address = sel.selReg(insn.getAddress());
const AddressSpace space = insn.getAddressSpace();
GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
insn.getAddressSpace() == MEM_CONSTANT ||
insn.getAddressSpace() == MEM_PRIVATE ||
insn.getAddressSpace() == MEM_LOCAL);
GBE_ASSERT(sel.ctx.isScalarReg(insn.getValue(0)) == false);
const Type type = insn.getValueType();
const uint32_t elemSize = getByteScatterGatherSize(type);
if (insn.getAddressSpace() == MEM_CONSTANT) {
// XXX TODO read 64bit constant through constant cache
// Per HW Spec, constant cache messages can read at least DWORD data.
// So, byte/short data type, we have to read through data cache.
if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
this->emitRead64(sel, insn, address, 0x2);
else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
this->emitDWordGather(sel, insn, address, 0x2);
else {
const GenRegister value = sel.selReg(insn.getValue(0));
this->emitByteGather(sel, insn, elemSize, address, value, 0x2);
}
}
// Non-constant spaces
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
this->emitRead64(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
else {
const GenRegister value = sel.selReg(insn.getValue(0));
this->emitByteGather(sel, insn, elemSize, address, value, space == MEM_LOCAL ? 0xfe : 0x01);
}
return true;
}
DECL_CTOR(LoadInstruction, 1, 1);
};
/*! Store instruction pattern: picks a write message from the alignment flag
 *  and the element size of the store. */
DECL_PATTERN(StoreInstruction)
{
/*! Write all valueNum values with one UNTYPED_WRITE; the address and every
 *  value are viewed as float. */
void emitUntypedWrite(Selection::Opaque &sel,
const ir::StoreInstruction &insn,
uint32_t bti) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
const uint32_t addrID = ir::StoreInstruction::addressIndex;
GenRegister addr;
vector value(valueNum);
addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);;
for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
}
/*! Write a single 64-bit value with WRITE64. `src` holds the value to store;
 *  `dst` holds DWORD temporaries, slot 0 being the temporary address. */
void emitWrite64(Selection::Opaque &sel,
const ir::StoreInstruction &insn,
uint32_t bti) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
const uint32_t addrID = ir::StoreInstruction::addressIndex;
GenRegister addr;
uint32_t srcID;
/* XXX support scalar only right now. */
GBE_ASSERT(valueNum == 1);
addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);
// The first 16 DWORD register space is for temporary usage at encode stage.
// Two temporaries per value in SIMD8, one otherwise
uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
GenRegister src[valueNum];
GenRegister dst[tmpRegNum + 1];
/* dst 0 is for the temporary address register. */
dst[0] = sel.selReg(sel.reg(FAMILY_DWORD));
for (srcID = 0; srcID < tmpRegNum; ++srcID)
dst[srcID + 1] = sel.selReg(sel.reg(FAMILY_DWORD));
for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
sel.WRITE64(addr, src, valueNum, dst, tmpRegNum + 1, bti);
}
/*! Write one value with BYTE_SCATTER. Bytes and words are first widened into
 *  a UD temporary; other element sizes are passed through unchanged. */
void emitByteScatter(Selection::Opaque &sel,
const ir::StoreInstruction &insn,
const uint32_t elemSize,
GenRegister addr,
GenRegister value,
uint32_t bti) const
{
using namespace ir;
const uint32_t simdWidth = sel.ctx.getSimdWidth();
// Keep the original register: `value` is overwritten with the temporary below
const GenRegister dst = value;
GBE_ASSERT(insn.getValueNum() == 1);
if (elemSize == GEN_BYTE_SCATTER_WORD) {
value = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
sel.MOV(value, GenRegister::retype(dst, GEN_TYPE_UW));
} else if (elemSize == GEN_BYTE_SCATTER_BYTE) {
value = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
sel.MOV(value, GenRegister::retype(dst, GEN_TYPE_UB));
}
sel.BYTE_SCATTER(addr, value, elemSize, bti);
}
/*! Dispatch one store to the matching helper. The binding table index is
 *  0xfe for the local space and 0x01 otherwise. Always returns true. */
INLINE bool emitOne(Selection::Opaque &sel, const ir::StoreInstruction &insn) const
{
using namespace ir;
const AddressSpace space = insn.getAddressSpace();
const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
const Type type = insn.getValueType();
const uint32_t elemSize = getByteScatterGatherSize(type);
if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
this->emitWrite64(sel, insn, bti);
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
this->emitUntypedWrite(sel, insn, bti);
else {
const GenRegister address = sel.selReg(insn.getAddress());
const GenRegister value = sel.selReg(insn.getValue(0));
this->emitByteScatter(sel, insn, elemSize, address, value, bti);
}
return true;
}
DECL_CTOR(StoreInstruction, 1, 1);
};
/*! Compare instruction pattern: writes the comparison result into the
 *  virtual flag identified by the destination register of the instruction. */
class CompareInstructionPattern : public SelectionPattern
{
public:
// Registers the pattern for every opcode accepted by ir::isOpcodeFrom
CompareInstructionPattern(void) : SelectionPattern(1,1) {
for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
if (ir::isOpcodeFrom(ir::Opcode(op)) == true)
this->opcodes.push_back(ir::Opcode(op));
}
/*! Emit two compares on the same flag: a blockip <= label compare first,
 *  then the actual compare (I64CMP for 64-bit types). Always returns true. */
INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
{
using namespace ir;
const ir::CompareInstruction &insn = cast(dag.insn);
const Opcode opcode = insn.getOpcode();
const Type type = insn.getType();
const uint32_t genCmp = getGenCompare(opcode);
const Register dst = insn.getDst(0);
// Limit the compare to the active lanes. Use the same compare as for f0.0
sel.push();
const LabelIndex label = insn.getParent()->getLabelIndex();
const GenRegister blockip = sel.selReg(ocl::blockip, TYPE_U16);
const GenRegister labelReg = GenRegister::immuw(label);
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.curr.physicalFlag = 0;
// The flag is addressed through the destination register index
sel.curr.flagIndex = uint16_t(dst);
sel.CMP(GEN_CONDITIONAL_LE, blockip, labelReg);
sel.pop();
// Look for immediate values for the right source
GenRegister src0, src1;
SelectionDAG *dag0 = dag.child[0];
SelectionDAG *dag1 = dag.child[1];
// Right source can always be an immediate
if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag1->insn)) {
const auto &childInsn = cast(dag1->insn);
src0 = sel.selReg(insn.getSrc(0), type);
src1 = getRegisterFromImmediate(childInsn.getImmediate());
// Only the non-folded child is marked as a root
if (dag0) dag0->isRoot = 1;
} else {
src0 = sel.selReg(insn.getSrc(0), type);
src1 = sel.selReg(insn.getSrc(1), type);
markAllChildren(dag);
}
// Second compare: predicate is left as it was, only the flag is redirected
sel.push();
sel.curr.physicalFlag = 0;
sel.curr.flagIndex = uint16_t(dst);
if (type == TYPE_S64 || type == TYPE_U64) {
GenRegister tmp[3];
for(int i=0; i<3; i++)
tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
sel.I64CMP(genCmp, src0, src1, tmp);
} else
sel.CMP(genCmp, src0, src1);
sel.pop();
return true;
}
};
/*! Convert instruction pattern: selects the instruction sequence for a type
 *  conversion from the source / destination register families. */
DECL_PATTERN(ConvertInstruction)
{
  /*! Emit one conversion. For OP_SAT_CVT the whole sequence runs inside a
   *  pushed state with the saturate bit set. Always returns true.
   *
   *  Paths, in priority order:
   *  - narrow (byte / word) destination from a DWORD or QWORD source: go
   *    through an unpacked view of a DWORD temporary, then MOV to dst;
   *  - S32 / U32 destination from QWORD: CONVI64_TO_I;
   *  - float destination from QWORD: CONVI64_TO_F with 3 UD temporaries and
   *    one BOOL register;
   *  - double destination: MOV_DF with a QWORD temporary;
   *  - int64 destination: CONVF_TO_I64 from float, CONVI_TO_I64 from any
   *    other source type except double, which is not implemented;
   *  - anything else: plain MOV. */
  INLINE bool emitOne(Selection::Opaque &sel, const ir::ConvertInstruction &insn) const
  {
    using namespace ir;
    const Type dstType = insn.getDstType();
    const Type srcType = insn.getSrcType();
    const RegisterFamily dstFamily = getFamily(dstType);
    const RegisterFamily srcFamily = getFamily(srcType);
    const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
    const GenRegister src = sel.selReg(insn.getSrc(0), srcType);

    if(insn.getOpcode() == ir::OP_SAT_CVT) {
      sel.push();
      sel.curr.saturate = 1;
    }

    // We need two instructions to make the conversion
    if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && (srcFamily == FAMILY_DWORD || srcFamily == FAMILY_QWORD)) {
      GenRegister unpacked;
      if (dstFamily == FAMILY_WORD) {
        const uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
        unpacked = GenRegister::unpacked_uw(sel.reg(FAMILY_DWORD));
        unpacked = GenRegister::retype(unpacked, type);
      } else {
        const uint32_t type = dstType == TYPE_U8 ? GEN_TYPE_UB : GEN_TYPE_B;
        unpacked = GenRegister::unpacked_ub(sel.reg(FAMILY_DWORD));
        unpacked = GenRegister::retype(unpacked, type);
      }
      if(srcFamily == FAMILY_QWORD) {
        // 64-bit sources are first narrowed to a signed DWORD temporary
        GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD));
        tmp.type = GEN_TYPE_D;
        sel.CONVI64_TO_I(tmp, src);
        sel.MOV(unpacked, tmp);
      } else
        sel.MOV(unpacked, src);
      sel.MOV(dst, unpacked);
    } else if ((dstType == ir::TYPE_S32 || dstType == ir::TYPE_U32) && srcFamily == FAMILY_QWORD) {
      sel.CONVI64_TO_I(dst, src);
    } else if (dstType == ir::TYPE_FLOAT && srcFamily == FAMILY_QWORD) {
      // tmp[0..2] are UD temporaries, tmp[3] is a BOOL register
      GenRegister tmp[4];
      for(int i=0; i<3; i++) {
        tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
        tmp[i].type = GEN_TYPE_UD;
      }
      tmp[3] = sel.selReg(sel.reg(FAMILY_BOOL));
      sel.CONVI64_TO_F(dst, src, tmp);
    } else if (dst.isdf()) {
      ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
      sel.MOV_DF(dst, src, sel.selReg(r));
    } else if (dst.isint64()) {
      switch(src.type) {
        case GEN_TYPE_F:
          sel.CONVF_TO_I64(dst, src, sel.selReg(sel.reg(FAMILY_DWORD)));
          break;
        case GEN_TYPE_DF:
          // Double to int64 is unsupported. The explicit break keeps this case
          // from falling through into the integer conversion below should
          // NOT_IMPLEMENTED ever return.
          NOT_IMPLEMENTED;
          break;
        default:
          sel.CONVI_TO_I64(dst, src, sel.selReg(sel.reg(FAMILY_DWORD)));
          break;
      }
    } else
      sel.MOV(dst, src);

    // Matches the push done for OP_SAT_CVT above
    if(insn.getOpcode() == ir::OP_SAT_CVT)
      sel.pop();
    return true;
  }
  DECL_CTOR(ConvertInstruction, 1, 1);
};
/*! Atomic instruction pattern */
DECL_PATTERN(AtomicInstruction)
{
/*! Emit one ATOMIC message. Source 0 is the address; sources 1 and 2 are
 *  selected only when the instruction has them. All operands are selected as
 *  U32. Always returns true. */
INLINE bool emitOne(Selection::Opaque &sel, const ir::AtomicInstruction &insn) const
{
using namespace ir;
const AtomicOps atomicOp = insn.getAtomicOpcode();
const AddressSpace space = insn.getAddressSpace();
// Binding table index: 0xfe for the local space, 0x01 otherwise
const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
const uint32_t srcNum = insn.getSrcNum();
const GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_U32); //address
// Missing sources default to the address register; srcNum is passed along
GenRegister src1 = src0, src2 = src0;
if(srcNum > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
if(srcNum > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
GenRegister dst = sel.selReg(insn.getDst(0), TYPE_U32);
// The IR atomic opcode is cast directly to the Gen atomic opcode.
// NOTE(review): presumably both enums share the same numeric values -- confirm.
GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
sel.ATOMIC(dst, genAtomicOp, srcNum, src0, src1, src2, bti);
return true;
}
DECL_CTOR(AtomicInstruction, 1, 1);
};
/*! Select instruction pattern: emits a predicated SEL (or SEL_INT64 for 64
 *  bit integer types) into a temporary, then moves it to the destination
 */
class SelectInstructionPattern : public SelectionPattern
{
public:
/*! Register the pattern for every opcode accepted by ir::isOpcodeFrom.
 *  NOTE(review): isOpcodeFrom and the cast in emit() carry no explicit
 *  template argument in this copy of the file -- confirm against upstream
 */
SelectInstructionPattern(void) : SelectionPattern(1,1) {
for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
if (ir::isOpcodeFrom(ir::Opcode(op)) == true)
this->opcodes.push_back(ir::Opcode(op));
}
/*! Emit the selection for the instruction at the root of the given DAG.
 *  Always returns true
 */
INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
{
using namespace ir;
const ir::SelectInstruction &insn = cast(dag.insn);
// Get all registers for the instruction
const Type type = insn.getType();
const GenRegister dst = sel.selReg(insn.getDst(0), type);
// Look for immediate values for the right source
GenRegister src0, src1;
SelectionDAG *dag0 = dag.child[0]; // source 0 is the predicate!
SelectionDAG *dag1 = dag.child[1];
SelectionDAG *dag2 = dag.child[2];
// Right source can always be an immediate
if (OCL_OPTIMIZE_IMMEDIATE && dag2 != NULL && dag2->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag2->insn)) {
const auto &childInsn = cast(dag2->insn);
src0 = sel.selReg(insn.getSrc(SelectInstruction::src0Index), type);
src1 = getRegisterFromImmediate(childInsn.getImmediate());
// The LOADI child is folded into the immediate; the two other children are
// flagged as roots
if (dag0) dag0->isRoot = 1;
if (dag1) dag1->isRoot = 1;
} else {
src0 = sel.selReg(insn.getSrc(SelectInstruction::src0Index), type);
src1 = sel.selReg(insn.getSrc(SelectInstruction::src1Index), type);
markAllChildren(dag);
}
// Since we cannot predicate the select instruction with our current mask,
// we need to perform the selection in two steps (one to select, one to
// update the destination register)
const RegisterFamily family = getFamily(type);
const GenRegister tmp = sel.selReg(sel.reg(family), type);
const uint32_t simdWidth = sel.ctx.getSimdWidth();
const Register pred = insn.getPredicate();
sel.push();
// The select itself is predicated by the instruction predicate register,
// referenced as a virtual (non physical) flag
sel.curr.predicate = GEN_PREDICATE_NORMAL;
sel.curr.execWidth = simdWidth;
sel.curr.physicalFlag = 0;
sel.curr.flagIndex = uint16_t(pred);
sel.curr.noMask = 0;
if(type == ir::TYPE_S64 || type == ir::TYPE_U64)
sel.SEL_INT64(tmp, src0, src1);
else
sel.SEL(tmp, src0, src1);
sel.pop();
// Update the destination register properly now
sel.MOV(dst, tmp);
return true;
}
};
/*! Ternary instruction pattern. Only OP_I64MADSAT is handled for now */
DECL_PATTERN(TernaryInstruction)
{
  INLINE bool emitOne(Selection::Opaque &sel, const ir::TernaryInstruction &insn) const {
    using namespace ir;
    // Destination and the three sources all share the instruction type
    const Type type = insn.getType();
    const GenRegister dst = sel.selReg(insn.getDst(0), type);
    const GenRegister src0 = sel.selReg(insn.getSrc(0), type);
    const GenRegister src1 = sel.selReg(insn.getSrc(1), type);
    const GenRegister src2 = sel.selReg(insn.getSrc(2), type);
    if (insn.getOpcode() == OP_I64MADSAT) {
      // Nine unsigned dword temporaries followed by one boolean temporary
      enum { DWORD_TMP_NUM = 9 };
      GenRegister scratch[DWORD_TMP_NUM + 1];
      for (int id = 0; id < DWORD_TMP_NUM; ++id) {
        scratch[id] = sel.selReg(sel.reg(FAMILY_DWORD));
        scratch[id].type = GEN_TYPE_UD;
      }
      scratch[DWORD_TMP_NUM] = sel.selReg(sel.reg(FAMILY_BOOL));
      sel.I64MADSAT(dst, src0, src1, src2, scratch);
    } else {
      NOT_IMPLEMENTED;
    }
    return true;
  }
  DECL_CTOR(TernaryInstruction, 1, 1);
};
/*! Label instruction pattern: emits the label, then the per-block mask
 *  computation and, when the context requests it, a jump that bypasses the
 *  block
 */
DECL_PATTERN(LabelInstruction)
{
INLINE bool emitOne(Selection::Opaque &sel, const ir::LabelInstruction &insn) const
{
using namespace ir;
const LabelIndex label = insn.getLabelIndex();
const GenRegister src0 = sel.selReg(ocl::blockip);
const GenRegister src1 = GenRegister::immuw(label);
const uint32_t simdWidth = sel.ctx.getSimdWidth();
sel.LABEL(label);
// Do not emit any code for the "returning" block. There is no need for it
if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock())
return true;
// Emit the mask computation at the head of each basic block: an unpredicated
// compare (blockip <= label) that targets flag 0.0
sel.push();
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.curr.flag = 0;
sel.curr.subFlag = 0;
sel.CMP(GEN_CONDITIONAL_LE, GenRegister::retype(src0, GEN_TYPE_UW), src1);
sel.pop();
// If it is required, insert a JUMP to bypass the block
if (sel.ctx.hasJIP(&insn)) {
const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
sel.push();
// Scalar (execWidth 1, noMask) jump predicated on the inverse of "any lane
// set" in flag 0.0. Only SIMD8 and SIMD16 are handled
if (simdWidth == 8)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
else if (simdWidth == 16)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
else
NOT_IMPLEMENTED;
sel.curr.inversePredicate = 1;
sel.curr.execWidth = 1;
sel.curr.flag = 0;
sel.curr.subFlag = 0;
sel.curr.noMask = 1;
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
}
return true;
}
DECL_CTOR(LabelInstruction, 1, 1);
};
/*! Sample instruction pattern: gathers destinations, sources and four
 *  message payload registers and emits one SAMPLE
 */
DECL_PATTERN(SampleInstruction)
{
INLINE bool emitOne(Selection::Opaque &sel, const ir::SampleInstruction &insn) const
{
using namespace ir;
GenRegister msgPayloads[4];
// Variable length arrays. The first two IR sources are not forwarded in src
// (the surface and the sampler are looked up separately below)
GenRegister dst[insn.getDstNum()], src[insn.getSrcNum() - 2];
uint32_t srcNum = insn.getSrcNum();
uint32_t samplerOffset = 0;
if (srcNum == 6) {
/* We have the clamp border workaround. */
// The extra last source holds a value which, multiplied by 8, is added to
// the sampler index. It is dropped from the forwarded sources
samplerOffset = insn.getSrc(srcNum - 1).value() * 8;
srcNum--;
}
for( int i = 0; i < 4; ++i)
msgPayloads[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
for (uint32_t valueID = 0; valueID < insn.getDstNum(); ++valueID)
dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
for (uint32_t valueID = 0; valueID < srcNum - 2; ++valueID)
src[valueID] = sel.selReg(insn.getSrc(valueID + 2), insn.getSrcType());
// Resolve the image and sampler indices from the function tables
uint32_t bti = sel.ctx.getFunction().getImageSet()->getIdx
(insn.getSrc(SampleInstruction::SURFACE_BTI));
uint32_t sampler = sel.ctx.getFunction().getSamplerSet()->getIdx
(insn.getSrc(SampleInstruction::SAMPLER_BTI)) + samplerOffset;
sel.SAMPLE(dst, insn.getDstNum(), src, srcNum - 2, msgPayloads, 4, bti, sampler);
return true;
}
DECL_CTOR(SampleInstruction, 1, 1);
};
/*! Typed write instruction pattern: collects the coordinates and the values
 *  to write, allocates the message registers and emits one TYPED_WRITE
 */
DECL_PATTERN(TypedWriteInstruction)
{
INLINE bool emitOne(Selection::Opaque &sel, const ir::TypedWriteInstruction &insn) const
{
using namespace ir;
const uint32_t simdWidth = sel.ctx.getSimdWidth();
uint32_t valueID = 0;
GenRegister msgs[9]; // (header + U + V + R + LOD + 4)
GenRegister src[insn.getSrcNum()];
// 9 message registers in SIMD8, 5 in SIMD16
uint32_t msgNum = (8 / (simdWidth / 8)) + 1;
// Two coordinates when the instruction has 7 sources, three otherwise
uint32_t coordNum = (insn.getSrcNum() == 7) ? 2 : 3;
for(uint32_t i = 0; i < msgNum; i++)
msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
// u, v, w coords should use coord type.
// IR source 0 is skipped: src[i] is built from IR source i + 1
for (; valueID < coordNum; ++valueID)
src[valueID] = sel.selReg(insn.getSrc(valueID + 1), insn.getCoordType());
// Remaining sources use the source type
for (; (valueID + 1) < insn.getSrcNum(); ++valueID)
src[valueID] = sel.selReg(insn.getSrc(valueID + 1), insn.getSrcType());
uint32_t bti = sel.ctx.getFunction().getImageSet()->getIdx
(insn.getSrc(TypedWriteInstruction::SURFACE_BTI));
sel.TYPED_WRITE(src, insn.getSrcNum() - 1, msgs, msgNum, bti);
return true;
}
DECL_CTOR(TypedWriteInstruction, 1, 1);
};
/*! Get image info instruction pattern: copies the image info register
 *  (source 0, read as a scalar unsigned dword) into the destination
 */
DECL_PATTERN(GetImageInfoInstruction)
{
  INLINE bool emitOne(Selection::Opaque &sel, const ir::GetImageInfoInstruction &insn) const
  {
    using namespace ir;
    const GenRegister result = sel.selReg(insn.getDst(0), TYPE_U32);
    const GenRegister info = GenRegister::ud1grf(insn.getSrc(0));
    sel.MOV(result, info);
    return true;
  }
  DECL_CTOR(GetImageInfoInstruction, 1, 1);
};
/*! Get sampler info instruction pattern: reads one 16 bit entry of the
 *  ocl::samplerinfo register, selected by the sampler index
 */
DECL_PATTERN(GetSamplerInfoInstruction)
{
  INLINE bool emitOne(Selection::Opaque &sel, const ir::GetSamplerInfoInstruction &insn) const
  {
    using namespace ir;
    const GenRegister result = sel.selReg(insn.getDst(0), TYPE_U16);
    // Each sampler entry is addressed at twice its index inside samplerinfo
    const auto samplerIdx = sel.ctx.getFunction().getSamplerSet()->getIdx(insn.getSrc(0));
    GenRegister entry = GenRegister::offset(GenRegister::uw1grf(ocl::samplerinfo), 0, samplerIdx * 2);
    entry.subphysical = 1;
    sel.MOV(result, entry);
    return true;
  }
  DECL_CTOR(GetSamplerInfoInstruction, 1, 1);
};
/*! Branch instruction pattern: OP_RET emits EOT, OP_BRA is lowered to an
 *  update of the per-lane block IPs (ocl::blockip) followed, when needed, by
 *  a scalar JMPI. Forward and backward branches are handled separately
 */
DECL_PATTERN(BranchInstruction)
{
/*! Emit a branch whose target label is greater than the label of the
 *  current block.
 *  \param dst label of the branch target
 *  \param src label of the block that owns the branch (unused here)
 */
void emitForwardBranch(Selection::Opaque &sel,
const ir::BranchInstruction &insn,
ir::LabelIndex dst,
ir::LabelIndex src) const
{
using namespace ir;
const GenRegister ip = sel.selReg(ocl::blockip, TYPE_U16);
const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
const uint32_t simdWidth = sel.ctx.getSimdWidth();
// We will not emit any jump if we must go the next block anyway
const BasicBlock *curr = insn.getParent();
const BasicBlock *next = curr->getNextBlock();
// NOTE(review): next is dereferenced without a NULL check, unlike the
// GBE_ASSERT done in emitBackwardBranch -- presumably a forward branch
// always has a following block; confirm
const LabelIndex nextLabel = next->getLabelIndex();
if (insn.isPredicated() == true) {
const Register pred = insn.getPredicateIndex();
// Update the PcIPs
sel.push();
sel.curr.physicalFlag = 0;
sel.curr.flagIndex = uint16_t(pred);
sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
sel.pop();
if (nextLabel == jip) return;
// It is slightly more complicated than for backward jump. We check that
// all PcIPs are greater than the next block IP to be sure that we can
// jump
sel.push();
sel.curr.physicalFlag = 0;
sel.curr.flagIndex = uint16_t(pred);
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.CMP(GEN_CONDITIONAL_G, ip, GenRegister::immuw(nextLabel));
// Branch to the jump target
// XXX TODO: For group size not aligned to simdWidth, ALL8/16h may not
// work correct, as flag register bits mapped to non-active lanes tend
// to be zero.
if (simdWidth == 8)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
else if (simdWidth == 16)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
else
NOT_SUPPORTED;
sel.curr.execWidth = 1;
sel.curr.noMask = 1;
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
} else {
// Update the PcIPs
sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
// Do not emit branch when we go to the next block anyway
if (nextLabel == jip) return;
// Unconditional scalar jump
sel.push();
sel.curr.execWidth = 1;
sel.curr.noMask = 1;
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
}
}
/*! Emit a branch whose target label is lower than or equal to the label of
 *  the current block.
 *  \param dst label of the branch target
 *  \param src label of the block that owns the branch
 */
void emitBackwardBranch(Selection::Opaque &sel,
const ir::BranchInstruction &insn,
ir::LabelIndex dst,
ir::LabelIndex src) const
{
using namespace ir;
const GenRegister ip = sel.selReg(ocl::blockip, TYPE_U16);
const Function &fn = sel.ctx.getFunction();
const BasicBlock &bb = fn.getBlock(src);
const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
const uint32_t simdWidth = sel.ctx.getSimdWidth();
GBE_ASSERT(bb.getNextBlock() != NULL);
if (insn.isPredicated() == true) {
const Register pred = insn.getPredicateIndex();
// Update the PcIPs for all the branches. Just put the IPs of the next
// block. Next instruction will properly reupdate the IPs of the lanes
// that actually take the branch
const LabelIndex next = bb.getNextBlock()->getLabelIndex();
sel.MOV(ip, GenRegister::immuw(uint16_t(next)));
sel.push();
// Re-update the PcIPs for the branches that takes the backward jump
sel.curr.physicalFlag = 0;
sel.curr.flagIndex = uint16_t(pred);
sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
// Branch to the jump target
if (simdWidth == 8)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
else if (simdWidth == 16)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
else
NOT_SUPPORTED;
sel.curr.execWidth = 1;
sel.curr.noMask = 1;
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
} else {
// Update the PcIPs
sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
// Branch to the jump target
sel.push();
sel.curr.execWidth = 1;
sel.curr.noMask = 1;
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
}
}
/*! Dispatch on the opcode: OP_RET, OP_BRA (forward or backward). Any other
 *  opcode hits NOT_IMPLEMENTED. Always returns true
 */
INLINE bool emitOne(Selection::Opaque &sel, const ir::BranchInstruction &insn) const {
using namespace ir;
const Opcode opcode = insn.getOpcode();
if (opcode == OP_RET)
sel.EOT();
else if (opcode == OP_BRA) {
const LabelIndex dst = insn.getLabelIndex();
const LabelIndex src = insn.getParent()->getLabelIndex();
// We handle forward and backward branches differently. A branch to the
// current block itself counts as a backward branch
if (uint32_t(dst) <= uint32_t(src))
this->emitBackwardBranch(sel, insn, dst, src);
else
this->emitForwardBranch(sel, insn, dst, src);
} else
NOT_IMPLEMENTED;
return true;
}
DECL_CTOR(BranchInstruction, 1, 1);
};
/*! Ordering used to sort patterns: larger insnNum first and, for an equal
 *  insnNum, lower cost first
 */
INLINE bool cmp(const SelectionPattern *p0, const SelectionPattern *p1) {
  const bool sameInsnNum = p0->insnNum == p1->insnNum;
  return sameInsnNum ? (p0->cost < p1->cost) : (p0->insnNum > p1->insnNum);
}
SelectionLibrary::SelectionLibrary(void) {
this->insert();
this->insert();
this->insert();
this->insert();
this->insert();
this->insert();
this->insert();
this->insert();
this->insert();
this->insert();
this->insert();
this->insert();
this->insert();
this->insert();
this->insert();
this->insert();
this->insert();
this->insert();
this->insert();
this->insert();
this->insert();
// Sort all the patterns with the number of instructions they output
for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
std::sort(this->patterns[op].begin(), this->patterns[op].end(), cmp);
}
/*! Release every pattern recorded in toFree by insert().
 *  NOTE(review): the const_cast below has no target type in this copy of the
 *  file -- confirm against upstream
 */
SelectionLibrary::~SelectionLibrary(void) {
for (auto pattern : this->toFree)
GBE_DELETE(const_cast(pattern));
}
/*! Allocate one pattern of type PatternType, remember it in toFree for later
 *  release and append it to the pattern list of every opcode it declares.
 *  NOTE(review): the template parameter list is missing after "template" in
 *  this copy of the file -- confirm against upstream
 */
template
void SelectionLibrary::insert(void) {
const SelectionPattern *pattern = GBE_NEW_NO_ARG(PatternType);
this->toFree.push_back(pattern);
for (auto opcode : pattern->opcodes)
this->patterns[opcode].push_back(pattern);
}
} /* namespace gbe */
Release_v0.3/backend/src/backend/gen_insn_selection.hpp 0000664 0000000 0000000 00000017557 12231421770 0023345 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file gen_insn_selection.hpp
* \author Benjamin Segovia
*/
#ifndef __GEN_INSN_SELECTION_HPP__
#define __GEN_INSN_SELECTION_HPP__
#include "ir/register.hpp"
#include "ir/instruction.hpp"
#include "backend/gen_register.hpp"
#include "backend/gen_encoder.hpp"
#include "backend/gen_context.hpp"
#include "sys/vector.hpp"
#include "sys/intrusive_list.hpp"
namespace gbe
{
/*! Translate IR type to Gen type */
uint32_t getGenType(ir::Type type);
/*! Translate IR compare to Gen compare */
uint32_t getGenCompare(ir::Opcode opcode);
/*! Selection opcodes properly encoded from 0 to n for fast jump table
* generation. One SEL_OP_<OP> enumerator is produced for each
* DECL_SELECTION_IR(OP, FN) entry of gen_insn_selection.hxx, in file order
* (the FN argument is ignored here)
*/
enum SelectionOpcode {
#define DECL_SELECTION_IR(OP, FN) SEL_OP_##OP,
#include "backend/gen_insn_selection.hxx"
#undef DECL_SELECTION_IR
};
// Owns and Allocates selection instructions
class Selection;
// List of SelectionInstruction forms a block
class SelectionBlock;
/*! A selection instruction is also almost a Gen instruction but *before* the
* register allocation. Destination and source registers are stored together
* in the trailing regs array: destinations first, then sources
*/
class SelectionInstruction : public NonCopyable, public intrusive_list_node
{
public:
/*! Owns the instruction */
SelectionBlock *parent;
/*! Insert an instruction before this one */
void prepend(SelectionInstruction &insn);
/*! Append an instruction after this one */
void append(SelectionInstruction &insn);
/*! Does it read memory? */
bool isRead(void) const;
/*! Does it write memory? */
bool isWrite(void) const;
/*! Is it a branch instruction (i.e. modify control flow) */
bool isBranch(void) const;
/*! Is it a label instruction (i.e. change the implicit mask) */
bool isLabel(void) const;
/*! Get the destination register (regs[dstID]) */
GenRegister &dst(uint32_t dstID) { return regs[dstID]; }
/*! Get the source register (sources are stored after the dstNum destinations) */
GenRegister &src(uint32_t srcID) { return regs[dstNum+srcID]; }
/*! Const overload of dst */
const GenRegister &dst(uint32_t dstID) const { return regs[dstID]; }
/*! Const overload of src */
const GenRegister &src(uint32_t srcID) const { return regs[dstNum+srcID]; }
/*! No more than 17 sources (used by typed writes on simd8 mode.) */
enum { MAX_SRC_NUM = 17 };
/*! No more than 11 destinations (used by samples and untyped reads) */
enum { MAX_DST_NUM = 11 };
/*! State of the instruction (extra fields needed for the encoding) */
GenInstructionState state;
/*! Opcode dependent extra data. The three anonymous structs overlap */
union {
struct {
/*! Store bti for loads/stores and function for math, atomic and compares */
uint16_t function:8;
/*! elemSize for byte scatters / gathers, elemNum for untyped msg, bti for atomic */
uint16_t elem:8;
};
// NOTE(review): the four bit-fields below add up to 19 bits, i.e. more than
// one uint16_t storage unit
struct {
/*! Number of sources in the tuple */
uint16_t width:4;
/*! vertical stride (0,1,2,4,8 or 16) */
uint16_t vstride:5;
/*! horizontal stride (0,1,2,4,8 or 16) */
uint16_t hstride:5;
/*! offset (0 to 7) */
uint16_t offset:5;
};
struct {
uint16_t scratchOffset;
uint16_t scratchMsgHeader;
};
} extra;
/*! Gen opcode */
uint8_t opcode;
/*! Number of destinations (4 bits: up to 15, enough for MAX_DST_NUM) */
uint8_t dstNum:4;
/*! Number of sources (5 bits: up to 31, enough for MAX_SRC_NUM) */
uint8_t srcNum:5;
/*! To store various indices */
uint16_t index;
/*! Variable sized. Destinations and sources go here (zero-length trailing
* array: the storage is not part of sizeof(SelectionInstruction))
*/
GenRegister regs[0];
private:
/*! Just Selection class can create SelectionInstruction */
SelectionInstruction(SelectionOpcode, uint32_t dstNum, uint32_t srcNum);
// Allocates (with a linear allocator) and owns SelectionInstruction
friend class Selection;
};
/*! Instructions like sends require to make registers contiguous in GRF. A
* selection vector describes one such group of registers of an instruction
*/
class SelectionVector : public NonCopyable, public intrusive_list_node
{
public:
SelectionVector(void);
/*! The instruction that requires the vector of registers */
SelectionInstruction *insn;
/*! Directly points to the selection instruction registers */
GenRegister *reg;
/*! Number of registers in the vector */
uint16_t regNum;
/*! Indicate if this is a destination or a source vector */
uint16_t isSrc;
};
// Owns the selection block
class Selection;
/*! A selection block is the counterpart of the IR Basic block. It contains
* the instructions generated from an IR basic block.
* NOTE(review): the element types of the list and vector members below are
* missing in this copy of the file -- confirm against upstream
*/
class SelectionBlock : public NonCopyable, public intrusive_list_node
{
public:
SelectionBlock(const ir::BasicBlock *bb);
/*! All the emitted instructions in the block */
intrusive_list insnList;
/*! The vectors that may be required by some instructions of the block */
intrusive_list vectorList;
/*! Extra registers needed by the block (only live in the block) */
gbe::vector tmp;
/*! Associated IR basic block */
const ir::BasicBlock *bb;
/*! Append a new temporary register */
void append(ir::Register reg);
/*! Append a new selection vector in the block */
void append(SelectionVector *vec);
/*! Append a new selection instruction at the end of the block */
void append(SelectionInstruction *insn);
/*! Insert a new selection instruction at the beginning of the block */
void prepend(SelectionInstruction *insn);
};
/*! Owns the selection engine */
class GenContext;
/*! Selection engine produces the pre-ISA instruction blocks. The public
* class is a thin facade: the implementation lives in the Opaque class
*/
class Selection
{
public:
/*! Initialize internal structures used for the selection */
Selection(GenContext &ctx);
/*! Release everything */
~Selection(void);
/*! Implements the instruction selection itself */
void select(void);
/*! Bool and scalar register use scalar physical registers */
bool isScalarOrBool(ir::Register reg) const;
/*! Get the number of instructions of the largest block */
uint32_t getLargestBlockSize(void) const;
/*! Number of register vectors in the selection */
uint32_t getVectorNum(void) const;
/*! Number of registers (temporaries are created during selection) */
uint32_t getRegNum(void) const;
/*! Get the family for the given register */
ir::RegisterFamily getRegisterFamily(ir::Register reg) const;
/*! Get the data for the given register */
ir::RegisterData getRegisterData(ir::Register reg) const;
/*! Replace a source by the returned temporary register */
ir::Register replaceSrc(SelectionInstruction *insn, uint32_t regID);
/*! Replace a destination by the returned temporary register */
ir::Register replaceDst(SelectionInstruction *insn, uint32_t regID);
/*! Spill a register (insert spill/unspill instructions) */
void spillReg(ir::Register reg, uint32_t registerPool);
/*! Create a new selection instruction */
SelectionInstruction *create(SelectionOpcode, uint32_t dstNum, uint32_t srcNum);
/*! List of emitted blocks */
intrusive_list *blockList;
/*! Actual implementation of the selection engine (use Pimpl) */
class Opaque;
/*! Created and destroyed in cpp */
Opaque *opaque;
/*! Use custom allocators */
GBE_CLASS(Selection);
};
} /* namespace gbe */
#endif /* __GEN_INSN_SELECTION_HPP__ */
Release_v0.3/backend/src/backend/gen_insn_selection.hxx 0000664 0000000 0000000 00000007230 12231421770 0023350 0 ustar 00root root 0000000 0000000 DECL_SELECTION_IR(LABEL, LabelInstruction)
DECL_SELECTION_IR(MOV, UnaryInstruction)
DECL_SELECTION_IR(MOV_DF, UnaryWithTempInstruction)
DECL_SELECTION_IR(LOAD_DF_IMM, UnaryWithTempInstruction)
DECL_SELECTION_IR(LOAD_INT64_IMM, UnaryInstruction)
DECL_SELECTION_IR(NOT, UnaryInstruction)
DECL_SELECTION_IR(LZD, UnaryInstruction)
DECL_SELECTION_IR(RNDZ, UnaryInstruction)
DECL_SELECTION_IR(RNDE, UnaryInstruction)
DECL_SELECTION_IR(RNDD, UnaryInstruction)
DECL_SELECTION_IR(RNDU, UnaryInstruction)
DECL_SELECTION_IR(FRC, UnaryInstruction)
DECL_SELECTION_IR(SEL, BinaryInstruction)
DECL_SELECTION_IR(SEL_INT64, BinaryInstruction)
DECL_SELECTION_IR(AND, BinaryInstruction)
DECL_SELECTION_IR(OR, BinaryInstruction)
DECL_SELECTION_IR(XOR, BinaryInstruction)
DECL_SELECTION_IR(I64AND, BinaryInstruction)
DECL_SELECTION_IR(I64OR, BinaryInstruction)
DECL_SELECTION_IR(I64XOR, BinaryInstruction)
DECL_SELECTION_IR(SHR, BinaryInstruction)
DECL_SELECTION_IR(SHL, BinaryInstruction)
DECL_SELECTION_IR(RSR, BinaryInstruction)
DECL_SELECTION_IR(RSL, BinaryInstruction)
DECL_SELECTION_IR(ASR, BinaryInstruction)
DECL_SELECTION_IR(I64SHR, I64ShiftInstruction)
DECL_SELECTION_IR(I64SHL, I64ShiftInstruction)
DECL_SELECTION_IR(I64ASR, I64ShiftInstruction)
DECL_SELECTION_IR(ADD, BinaryInstruction)
DECL_SELECTION_IR(I64ADD, BinaryWithTempInstruction)
DECL_SELECTION_IR(I64SATADD, I64SATADDInstruction)
DECL_SELECTION_IR(I64SUB, BinaryWithTempInstruction)
DECL_SELECTION_IR(I64SATSUB, I64SATSUBInstruction)
DECL_SELECTION_IR(MUL, BinaryInstruction)
DECL_SELECTION_IR(I64MUL, I64MULInstruction)
DECL_SELECTION_IR(I64DIV, I64DIVREMInstruction)
DECL_SELECTION_IR(I64REM, I64DIVREMInstruction)
DECL_SELECTION_IR(ATOMIC, AtomicInstruction)
DECL_SELECTION_IR(MACH, BinaryInstruction)
DECL_SELECTION_IR(CMP, CompareInstruction)
DECL_SELECTION_IR(I64CMP, I64CompareInstruction)
DECL_SELECTION_IR(SEL_CMP, CompareInstruction)
DECL_SELECTION_IR(MAD, TernaryInstruction)
DECL_SELECTION_IR(JMPI, JumpInstruction)
DECL_SELECTION_IR(EOT, EotInstruction)
DECL_SELECTION_IR(INDIRECT_MOVE, IndirectMoveInstruction)
DECL_SELECTION_IR(NOP, NoOpInstruction)
DECL_SELECTION_IR(WAIT, WaitInstruction)
DECL_SELECTION_IR(MATH, MathInstruction)
DECL_SELECTION_IR(BARRIER, BarrierInstruction)
DECL_SELECTION_IR(FENCE, FenceInstruction)
DECL_SELECTION_IR(UNTYPED_READ, UntypedReadInstruction)
DECL_SELECTION_IR(UNTYPED_WRITE, UntypedWriteInstruction)
DECL_SELECTION_IR(READ64, Read64Instruction)
DECL_SELECTION_IR(WRITE64, Write64Instruction)
DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)
DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
DECL_SELECTION_IR(DWORD_GATHER, DWordGatherInstruction)
DECL_SELECTION_IR(SAMPLE, SampleInstruction)
DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
DECL_SELECTION_IR(GET_IMAGE_INFO, GetImageInfoInstruction)
DECL_SELECTION_IR(SPILL_REG, SpillRegInstruction)
DECL_SELECTION_IR(UNSPILL_REG, UnSpillRegInstruction)
DECL_SELECTION_IR(MUL_HI, BinaryWithTempInstruction)
DECL_SELECTION_IR(I64_MUL_HI, I64MULHIInstruction)
DECL_SELECTION_IR(FBH, UnaryInstruction)
DECL_SELECTION_IR(FBL, UnaryInstruction)
DECL_SELECTION_IR(HADD, BinaryWithTempInstruction)
DECL_SELECTION_IR(RHADD, BinaryWithTempInstruction)
DECL_SELECTION_IR(I64HADD, I64HADDInstruction)
DECL_SELECTION_IR(I64RHADD, I64RHADDInstruction)
DECL_SELECTION_IR(UPSAMPLE_SHORT, BinaryInstruction)
DECL_SELECTION_IR(UPSAMPLE_INT, BinaryInstruction)
DECL_SELECTION_IR(UPSAMPLE_LONG, BinaryInstruction)
DECL_SELECTION_IR(CONVI_TO_I64, UnaryWithTempInstruction)
DECL_SELECTION_IR(CONVF_TO_I64, UnaryWithTempInstruction)
DECL_SELECTION_IR(CONVI64_TO_I, UnaryInstruction)
DECL_SELECTION_IR(CONVI64_TO_F, I64ToFloatInstruction)
DECL_SELECTION_IR(I64MADSAT, I64MADSATInstruction)
Release_v0.3/backend/src/backend/gen_program.cpp 0000664 0000000 0000000 00000012142 12231421770 0021754 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file gen_program.cpp
* \author Benjamin Segovia
*/
#include "backend/program.h"
#include "backend/gen_program.h"
#include "backend/gen_program.hpp"
#include "backend/gen_context.hpp"
#include "backend/gen_defs.hpp"
#include "backend/gen/gen_mesa_disasm.h"
#include "backend/gen_reg_allocation.hpp"
#include "ir/unit.hpp"
#include "llvm/llvm_to_gen.hpp"
#include
#include
#include
#include
#include
namespace gbe {
/*! Create an empty kernel: no instruction stream is attached yet */
GenKernel::GenKernel(const std::string &name) :
Kernel(name), insns(NULL), insnNum(0)
{}
/*! Release the instruction array (if any) */
GenKernel::~GenKernel(void) { GBE_SAFE_DELETE_ARRAY(insns); }
/*! Raw view of the instruction stream. NULL as long as no code is set */
const char *GenKernel::getCode(void) const { return (const char*) insns; }
/*! Attach an instruction stream of size bytes. The pointer is stored as is
 *  (no copy) and a size that is not a multiple of sizeof(GenInstruction) is
 *  rounded down. A previously attached stream is not released here.
 *  NOTE(review): the destructor releases insns with GBE_SAFE_DELETE_ARRAY, so
 *  the buffer presumably has to come from the matching array allocator --
 *  confirm with the callers
 */
const void GenKernel::setCode(const char * ins, size_t size) {
insns = (GenInstruction *)ins;
insnNum = size / sizeof(GenInstruction);
}
/*! Size of the instruction stream in bytes */
size_t GenKernel::getCodeSize(void) const { return insnNum * sizeof(GenInstruction); }
/*! Print the generic kernel status followed by the disassembly of every
 *  instruction of the kernel.
 *  \param indent forwarded to Kernel::printStatus
 *  \param outs stream that receives the text
 *
 *  gen_disasm only knows how to write to a FILE*. Its output is therefore
 *  captured by disassembling into a stream opened on /dev/null whose stdio
 *  buffer is a local array, which is then copied to outs.
 */
void GenKernel::printStatus(int indent, std::ostream& outs) {
  Kernel::printStatus(indent, outs);
  FILE *f = fopen("/dev/null", "w");
  if (f == NULL) {
    // Without the scratch stream there is nothing to capture the text with
    outs << "could not open /dev/null !\n";
    return;
  }
  const size_t bufSize = 4096;
  // One extra byte that stdio never sees: it keeps the captured text NUL
  // terminated even when an instruction fills the whole buffer
  char *buf = new char[bufSize + 1];
  for (uint32_t i = 0; i < insnNum; i++) {
    // stdio does not terminate what it buffers: clear the array so that the
    // text of a previous (longer) instruction cannot leak into this one
    std::memset(buf, 0, bufSize + 1);
    setbuffer(f, buf, bufSize);
    gen_disasm(f, insns+i);
    outs << buf;
    fflush(f);
    // Detach the buffer so that the next iteration restarts at its beginning
    setbuffer(f, NULL, 0);
  }
  delete [] buf;
  fclose(f);
}
/*! Nothing to set up beyond the base class */
GenProgram::GenProgram(void) {}
/*! Nothing to release beyond the base class */
GenProgram::~GenProgram(void) {}
/*! We must avoid spilling at all cost with Gen. The strategies are tried in
 *  this order by GenProgram::compileKernel until one of them produces a
 *  kernel: SIMD16 first, then SIMD8, each one first without and then with
 *  the register pressure limitation
 */
static const struct CodeGenStrategy {
uint32_t simdWidth;
bool limitRegisterPressure;
} codeGenStrategy[] = {
{16,false},
{16,true},
{8,false},
{8,true},
};
/*! Compile the function called name of the given unit into a Gen kernel.
 *  The entries of codeGenStrategy are tried in order until one succeeds.
 *  \return the compiled kernel, or NULL (after the assertion) when every
 *          strategy failed
 */
Kernel *GenProgram::compileKernel(const ir::Unit &unit, const std::string &name) {
  // Be careful when the simdWidth is forced by the programmer. We can see it
  // when the function already provides the simd width we need to use (i.e.
  // non zero)
  const ir::Function *fn = unit.getFunction(name);
  const uint32_t strategyNum = sizeof(codeGenStrategy) / sizeof(codeGenStrategy[0]);
  const uint32_t forcedWidth = fn->getSimdWidth();
  // The table holds two entries per SIMD width: SIMD16 at [0,1], SIMD8 at
  // [2,3]. A forced width restricts the search to its own pair. (The end of
  // the range used to be 2 whatever the forced width, so that a forced SIMD8
  // kernel started at 2, tried nothing and always failed to compile.)
  uint32_t codeGen = forcedWidth == 8 ? 2 : 0;
  const uint32_t codeGenNum = forcedWidth != 0 ? codeGen + 2 : strategyNum;
  Kernel *kernel = NULL;
  // Stop when compilation is successful
  for (; codeGen < codeGenNum; ++codeGen) {
    const uint32_t simdWidth = codeGenStrategy[codeGen].simdWidth;
    const bool limitRegisterPressure = codeGenStrategy[codeGen].limitRegisterPressure;
    // Force the SIMD width now and try to compile
    unit.getFunction(name)->setSimdWidth(simdWidth);
    Context *ctx = GBE_NEW(GenContext, unit, name, limitRegisterPressure);
    kernel = ctx->compileKernel();
    if (kernel != NULL) {
      // NOTE(review): ctx is not released on this path. Whether the kernel
      // still references data owned by the context is not visible here, so
      // the behavior is left untouched -- confirm and release it if possible
      break;
    }
    GBE_DELETE(ctx);
  }
  // XXX spill must be implemented
  GBE_ASSERTM(kernel != NULL, "Register spilling not supported yet!");
  return kernel;
}
/*! Create a program from a serialized binary.
 *  \param binary start of the serialized program
 *  \param size number of bytes available at binary
 *  \return the new program, or NULL when binary is NULL or cannot be
 *          deserialized
 */
static gbe_program genProgramNewFromBinary(const char *binary, size_t size) {
  using namespace gbe;
  // Building a std::string from a NULL pointer is undefined behavior
  if (binary == NULL)
    return NULL;
  const std::string binary_content(binary, size);
  GenProgram *program = GBE_NEW_NO_ARG(GenProgram);
  std::istringstream ifs(binary_content, std::istringstream::binary);
  if (!program->deserializeFromBin(ifs)) {
    // Allocated with GBE_NEW_NO_ARG: release it with the matching GBE_DELETE,
    // as genProgramNewFromLLVM does
    GBE_DELETE(program);
    return NULL;
  }
  //program->printStatus(0, std::cout);
  return reinterpret_cast<gbe_program>(program);
}
/*! Create a program by compiling an LLVM file.
 *  \param fileName LLVM file to build the program from
 *  \param stringSize capacity in bytes of the err buffer
 *  \param err optional buffer that receives the (possibly truncated and
 *         always NUL terminated) error message when the build fails
 *  \param errSize optional; receives the full, untruncated length of the
 *         error message when the build fails
 *  \return the new program, or NULL when the build failed
 */
static gbe_program genProgramNewFromLLVM(const char *fileName,
                                         size_t stringSize,
                                         char *err,
                                         size_t *errSize)
{
  using namespace gbe;
  GenProgram *program = GBE_NEW_NO_ARG(GenProgram);
  std::string error;
  // Try to compile the program
  if (program->buildFromLLVMFile(fileName, error) == false) {
    if (err != NULL && errSize != NULL && stringSize > 0u) {
      // Keep one byte for the terminator
      const size_t msgSize = std::min(error.size(), stringSize-1u);
      std::memcpy(err, error.c_str(), msgSize);
      // memcpy does not copy the terminator of a truncated (or complete)
      // message: write it explicitly so that err is a valid C string
      err[msgSize] = '\0';
      *errSize = error.size();
    }
    GBE_DELETE(program);
    return NULL;
  }
  // Everything run fine
  return (gbe_program) program;
}
} /* namespace gbe */
/*! Install the Gen implementations of the program creation callbacks (from a
 *  binary and from an LLVM file)
 */
void genSetupCallBacks(void)
{
gbe_program_new_from_binary = gbe::genProgramNewFromBinary;
gbe_program_new_from_llvm = gbe::genProgramNewFromLLVM;
}
/*! Process wide semaphore declared in gen_program.h. Judging by its name it
 *  serializes the accesses to LLVM -- its users are not visible in this file
 */
sem_t llvm_semaphore;
/*! Initialize llvm_semaphore as a semaphore private to the process
 *  (pshared == 0) with an initial value of 1.
 *  NOTE(review): the return value of sem_init is ignored
 */
void genSetupLLVMSemaphore(void)
{
sem_init(&llvm_semaphore, 0, 1);
}
Release_v0.3/backend/src/backend/gen_program.h 0000664 0000000 0000000 00000002332 12231421770 0021421 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file gen_program.h
* \author Benjamin Segovia
*
* C-like interface for the gen kernels and programs
*/
#ifndef __GBE_GEN_PROGRAM_H__
#define __GBE_GEN_PROGRAM_H__
#include
#include
#include
/*! This will make the compiler output Gen ISA code */
extern void genSetupCallBacks(void);
/*! Semaphore defined in gen_program.cpp and initialized by
 *  genSetupLLVMSemaphore
 */
extern sem_t llvm_semaphore;
/*! Initialize llvm_semaphore. To be called before the semaphore is used */
extern void genSetupLLVMSemaphore(void);
#endif /* __GBE_GEN_PROGRAM_H__ */
Release_v0.3/backend/src/backend/gen_program.hpp 0000664 0000000 0000000 00000004556 12231421770 0021773 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file gen_program.hpp
* \author Benjamin Segovia
*/
#ifndef __GBE_GEN_PROGRAM_HPP__
#define __GBE_GEN_PROGRAM_HPP__
#include "backend/program.h"
#include "backend/program.hpp"
// Gen ISA instruction
struct GenInstruction;
namespace gbe
{
/*! Describe a compiled kernel: a Kernel that carries a Gen ISA instruction
 *  stream
 */
class GenKernel : public Kernel
{
public:
/*! Create an empty kernel with the given name */
GenKernel(const std::string &name);
/*! Destroy it */
virtual ~GenKernel(void);
/*! Implements base class: get the instruction stream as raw bytes */
virtual const char *getCode(void) const;
/*! Set the instruction stream. size is given in bytes */
virtual const void setCode(const char *, size_t size);
/*! Implements get the code size (in bytes) */
virtual size_t getCodeSize(void) const;
/*! Implements printStatus */
virtual void printStatus(int indent, std::ostream& outs);
GenInstruction *insns; //!< Instruction stream
uint32_t insnNum; //!< Number of instructions
GBE_CLASS(GenKernel); //!< Use custom allocators
};
/*! Describe a compiled program: a Program whose kernels are GenKernel */
class GenProgram : public Program
{
public:
/*! Create an empty program */
GenProgram(void);
/*! Destroy the program */
virtual ~GenProgram(void);
/*! Implements base class: compile the function called name of the unit */
virtual Kernel *compileKernel(const ir::Unit &unit, const std::string &name);
/*! Allocate an empty kernel (a GenKernel with the given name) */
virtual Kernel *allocateKernel(const std::string &name) {
return GBE_NEW(GenKernel, name);
}
/*! Use custom allocators */
GBE_CLASS(GenProgram);
};
} /* namespace gbe */
#endif /* __GBE_GEN_PROGRAM_HPP__ */
Release_v0.3/backend/src/backend/gen_reg_allocation.cpp 0000664 0000000 0000000 00000070215 12231421770 0023274 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file gen_reg_allocation.cpp
* \author Benjamin Segovia
*/
#include "ir/profile.hpp"
#include "ir/function.hpp"
#include "backend/gen_insn_selection.hpp"
#include "backend/gen_reg_allocation.hpp"
#include "backend/gen_register.hpp"
#include "backend/program.hpp"
#include "sys/exception.hpp"
#include
#include
#define RESERVED_REG_NUM_FOR_SPILL 6
namespace gbe
{
/////////////////////////////////////////////////////////////////////////////
// Register allocator internal implementation
/////////////////////////////////////////////////////////////////////////////
/*! Provides the location of a register in a vector */
typedef std::pair VectorLocation;
/*! Implements the register allocation (private part of GenRegAllocator) */
class GenRegAllocator::Opaque
{
public:
  /*! Initialize the register allocator */
  Opaque(GenContext &ctx);
  /*! Release all taken resources */
  ~Opaque(void);
  /*! Perform the register allocation. Return true if success */
  bool allocate(Selection &selection);
  /*! Return the Gen register from the selection register */
  GenRegister genReg(const GenRegister &reg);
  /*! Output the register allocation */
  void outputAllocation(void);
private:
  /*! Expire one GRF interval. Return true if one was successfully expired */
  bool expireGRF(const GenRegInterval &limit);
  /*! Expire a flag register. Return true if one was successfully expired */
  bool expireFlag(const GenRegInterval &limit);
  /*! Allocate the virtual boolean (== flags) registers */
  void allocateFlags(Selection &selection);
  /*! Allocate the GRF registers. Return true if success */
  bool allocateGRFs(Selection &selection);
  /*! Create gen registers for all preallocated curbe registers. */
  void allocatePayloadRegs(void);
  /*! Create a Gen register from a register set in the payload */
  void allocatePayloadReg(ir::Register, uint32_t offset, uint32_t subOffset = 0);
  /*! Allocate the vectors detected in the instruction selection pass */
  void allocateVector(Selection &selection);
  /*! Allocate the given interval. Return true if success */
  bool createGenReg(const GenRegInterval &interval);
  /*! Indicate if the registers are already allocated in vectors */
  bool isAllocated(const SelectionVector *vector) const;
  /*! Reallocate registers if needed to make the registers in the vector
   *  contiguous in memory
   */
  void coalesce(Selection &selection, SelectionVector *vector);
  /*! The context owns the register allocator */
  GenContext &ctx;
  /*! Map virtual registers to offset in the (physical) register file */
  map<ir::Register, uint32_t> RA;
  /*! Provides the position of each register in a vector */
  map<ir::Register, VectorLocation> vectorMap;
  /*! All vectors used in the selection */
  vector<SelectionVector*> vectors;
  /*! The set of booleans that will go to GRF (cannot be kept into flags) */
  set<ir::Register> grfBooleans;
  /*! All the register intervals (indexed by virtual register) */
  vector<GenRegInterval> intervals;
  /*! Intervals sorting based on starting point positions */
  vector<GenRegInterval*> starting;
  /*! Intervals sorting based on ending point positions */
  vector<GenRegInterval*> ending;
  /*! Registers that are spilled */
  set<ir::Register> spilled;
  /*! First register (in GEN_REG_SIZE units) of the block reserved for register
   *  spill/reload. 0 means that spilling is not available
   */
  uint32_t reservedReg;
  /*! Index in `ending` of the next interval to expire */
  uint32_t expiringID;
  /*! Use custom allocator */
  GBE_CLASS(Opaque);
};
// Size of one register element for each register family. Both tables are
// indexed with an ir::RegisterFamily value: the "vector" table is used for
// per-lane registers (the size is then multiplied by the SIMD width) and the
// "scalar" table for scalar / boolean registers.
// NOTE(review): the values look like bytes and the entries like
// bool/byte/word/dword/qword -- confirm against ir::RegisterFamily.
// Note that byte vector registers use two bytes per byte (and can be
// interleaved)
static const size_t familyVectorSize[] = {2,2,2,4,8};
static const size_t familyScalarSize[] = {2,1,2,4,8};
/*! Live interval of one virtual register for the linear scan allocator: it
 *  records the first and the last instruction where the register is alive
 */
struct GenRegInterval {
  /*! Build an empty interval for `reg`: minID > maxID until an instruction
   *  that uses the register is recorded. The constructor is deliberately not
   *  explicit since intervals are built directly from ir::Register values
   */
  INLINE GenRegInterval(ir::Register reg) :
    reg(reg),
    minID(INT_MAX),
    maxID(-INT_MAX)
  {}
  /*! (virtual) register of the interval */
  ir::Register reg;
  /*! Starting point */
  int32_t minID;
  /*! Ending point */
  int32_t maxID;
};
/*! Bind the allocator to its context. The scalar members are given a defined
 *  value here (no spill registers, nothing expired yet); allocate() computes
 *  their actual values
 */
GenRegAllocator::Opaque::Opaque(GenContext &ctx) :
  ctx(ctx), reservedReg(0), expiringID(0) {}
/*! Nothing to release explicitly: all members clean up after themselves */
GenRegAllocator::Opaque::~Opaque(void) {}
/*! Bind `reg` to the payload location `offset + subOffset` and make its
 *  interval start (and, so far, end) at instruction 0
 */
void GenRegAllocator::Opaque::allocatePayloadReg(ir::Register reg,
                                                 uint32_t offset,
                                                 uint32_t subOffset)
{
  // The payload is expected to start at or after offset GEN_REG_SIZE
  assert(offset >= GEN_REG_SIZE);
  const uint32_t payloadOffset = offset + subOffset;
  RA.insert(std::make_pair(reg, payloadOffset));
  // blockip must sit on a register boundary
  GBE_ASSERT(reg != ir::ocl::blockip || (payloadOffset % GEN_REG_SIZE == 0));
  GenRegInterval &interval = this->intervals[reg];
  interval.minID = 0;
  interval.maxID = 0;
}
/*! Give a location to every register that is pre-allocated in the curbe, then
 *  to every pushed register (i.e. structure kernel arguments)
 */
INLINE void GenRegAllocator::Opaque::allocatePayloadRegs(void) {
  using namespace ir;
  // Curbe registers: only those whose register value is below 0x8000
  for (auto &curbe : this->ctx.curbeRegs) {
    if (curbe.first.value() >= 0x8000)
      continue;
    allocatePayloadReg(curbe.first, curbe.second);
  }
  // Pushed registers: they live at a sub-offset of the argument they come from
  const Function &fn = ctx.getFunction();
  GBE_ASSERT(fn.getProfile() == PROFILE_OCL);
  const Function::PushMap &pushMap = fn.getPushMap();
  // The push map is walked from its last entry to its first one
  for (auto pushed = pushMap.rbegin(); pushed != pushMap.rend(); ++pushed) {
    const uint32_t argID = pushed->second.argID;
    const FunctionArgument arg = fn.getArg(argID);
    const uint32_t subOffset = pushed->second.offset;
    const Register reg = pushed->second.getRegister();
    auto argLocation = this->ctx.curbeRegs.find(arg.reg);
    assert(argLocation != ctx.curbeRegs.end());
    allocatePayloadReg(reg, argLocation->second, subOffset);
    ctx.splitBlock(argLocation->second, subOffset);
  }
}
/*! Allocate a GRF location for the register of `interval`, expiring older
 *  intervals while the register file is full. Return false when nothing more
 *  can be expired, true when the register has a location
 */
bool GenRegAllocator::Opaque::createGenReg(const GenRegInterval &interval) {
  using namespace ir;
  const ir::Register reg = interval.reg;
  const uint32_t simdWidth = ctx.getSimdWidth();
  // Nothing to do when the register already has a location
  if (RA.contains(reg) == true)
    return true;
  GBE_ASSERT(ctx.isScalarReg(reg) == false);
  const bool isScalar = ctx.sel->isScalarOrBool(reg);
  const RegisterData regData = ctx.sel->getRegisterData(reg);
  const RegisterFamily family = regData.family;
  // Scalars take one element, the other registers one element per lane
  uint32_t typeSize, regSize;
  if (isScalar) {
    typeSize = familyScalarSize[family];
    regSize = typeSize;
  } else {
    typeSize = familyVectorSize[family];
    regSize = simdWidth*typeSize;
  }
  // The register is aligned on its own size. An offset of 0 means that the
  // allocation failed
  uint32_t grfOffset = ctx.allocate(regSize, regSize);
  while (grfOffset == 0) {
    if (UNLIKELY(this->expireGRF(interval) == false))
      return false;
    grfOffset = ctx.allocate(regSize, regSize);
  }
  GBE_ASSERTM(grfOffset != 0, "Unable to register allocate");
  RA.insert(std::make_pair(reg, grfOffset));
  return true;
}
bool GenRegAllocator::Opaque::isAllocated(const SelectionVector *vector) const {
const ir::Register first = vector->reg[0].reg();
const auto it = vectorMap.find(first);
// If the first register is not allocated we are done
if (it == vectorMap.end())
return false;
// If there are more left registers than in the found vector, there are
// still registers to allocate
const SelectionVector *other = it->second.first;
const uint32_t otherFirst = it->second.second;
const uint32_t leftNum = other->regNum - otherFirst;
if (leftNum < vector->regNum)
return false;
// Now check that all the registers in the already allocated vector match
// the current vector
for (uint32_t regID = 1; regID < vector->regNum; ++regID) {
const ir::Register from = vector->reg[regID].reg();
const ir::Register to = other->reg[regID + otherFirst].reg();
if (from != to)
return false;
}
return true;
}
/*! Record the position of every register of `vector` in vectorMap. A register
 *  that cannot simply stay in this vector is first replaced in the instruction
 *  by a temporary (which requires a MOV)
 */
void GenRegAllocator::Opaque::coalesce(Selection &selection, SelectionVector *vector) {
  for (uint32_t regID = 0; regID < vector->regNum; ++regID) {
    const ir::Register reg = vector->reg[regID].reg();
    const auto it = this->vectorMap.find(reg);
    // The register can stay in this vector when it is not already in a vector
    // and is neither scalar/boolean nor special. Note that local IDs are
    // *non-scalar* special registers but will require a MOV anyway since
    // pre-allocated in the CURBE
    const bool canStay = it == vectorMap.end() &&
                         ctx.sel->isScalarOrBool(reg) == false &&
                         ctx.isSpecialReg(reg) == false;
    ir::Register owner = reg;
    // Otherwise we need to move it to a temporary register.
    // TODO: we can do better than that if we analyze the liveness of the
    // already allocated registers in the vector. If there is no interference
    // and the order is maintained, we can reuse the previous vector and avoid
    // the MOVs
    if (!canStay) {
      if (vector->isSrc)
        owner = selection.replaceSrc(vector->insn, regID);
      else
        owner = selection.replaceDst(vector->insn, regID);
    }
    const VectorLocation location = std::make_pair(vector, regID);
    this->vectorMap.insert(std::make_pair(owner, location));
  }
}
/*! Ordering used to sort the selection vectors by decreasing number of
 *  registers: true when v0 holds strictly more registers than v1
 */
inline bool cmp(const SelectionVector *v0, const SelectionVector *v1) {
  return v1->regNum < v0->regNum;
}
/*! Gather all the vectors of the selection, sort them from the largest to the
 *  smallest and coalesce each vector that is not already covered by a previous
 *  one
 */
void GenRegAllocator::Opaque::allocateVector(Selection &selection) {
  const uint32_t vectorNum = selection.getVectorNum();
  this->vectors.resize(vectorNum);
  // Collect a pointer to every vector of every block
  uint32_t collected = 0;
  for (auto &block : *selection.blockList) {
    for (auto &v : block.vectorList) {
      this->vectors[collected] = &v;
      ++collected;
    }
  }
  GBE_ASSERT(collected == vectorNum);
  // Heuristic (really simple...): handle the vectors with the most registers
  // first
  std::sort(this->vectors.begin(), this->vectors.end(), cmp);
  // Insert MOVs when this is required
  for (SelectionVector *vector : this->vectors) {
    if (this->isAllocated(vector) == false)
      this->coalesce(selection, vector);
  }
}
template
inline bool cmp(const GenRegInterval *i0, const GenRegInterval *i1) {
return sortStartingPoint ? i0->minID < i1->minID : i0->maxID < i1->maxID;
}
bool GenRegAllocator::Opaque::expireGRF(const GenRegInterval &limit) {
while (this->expiringID != ending.size()) {
const GenRegInterval *toExpire = this->ending[this->expiringID];
const ir::Register reg = toExpire->reg;
// Dead code produced by the insn selection -> we skip it
if (toExpire->minID > toExpire->maxID) {
this->expiringID++;
continue;
}
//ignore register that already spilled
if(spilled.contains(reg)) {
this->expiringID++;
continue;
}
// Ignore booleans that were allocated with flags
// if (ctx.getRegisterFamily(reg) == ir::FAMILY_BOOL && !grfBooleans.contains(reg)) {
if (ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL) {
this->expiringID++;
continue;
}
if (toExpire->maxID >= limit.minID)
return false;
auto it = RA.find(reg);
GBE_ASSERT(it != RA.end());
// offset less than 32 means it is not managed by our reg allocator.
if (it->second < 32) {
this->expiringID++;
continue;
}
// Case 1 - it does not belong to a vector. Just remove it
ctx.deallocate(it->second);
this->expiringID++;
return true;
}
// We were not able to expire anything
return false;
}
/*! Allocate the virtual boolean registers. A linear scan assigns the two
 *  sub-registers of flag f1 to the booleans; a boolean that gets no flag is
 *  added to grfBooleans. The instructions of the selection are then patched to
 *  use the chosen flags, going through f0.1 for the booleans kept in GRF
 */
void GenRegAllocator::Opaque::allocateFlags(Selection &selection) {
  // Store the registers allocated in the map (boolean register -> sub-flag)
  map<ir::Register, uint32_t> allocatedFlags;
  // Last boolean that was given a free flag: this is the spill candidate
  GenRegInterval spill = ir::Register(ir::RegisterFile::MAX_INDEX);
  // we have two flags we use for booleans f1.0 and f1.1
  const uint32_t flagNum = 2;
  uint32_t freeFlags[] = {0,1};
  uint32_t freeNum = flagNum;
  // Perform the linear scan allocator on the flag registers only. We only use
  // two flags registers for the booleans right now: f1.0 and f1.1
  const uint32_t regNum = ctx.sel->getRegNum();
  uint32_t endID = 0; // interval to expire
  for (uint32_t startID = 0; startID < regNum; ++startID) {
    const GenRegInterval &interval = *this->starting[startID];
    const ir::Register reg = interval.reg;
    if (ctx.sel->getRegisterFamily(reg) != ir::FAMILY_BOOL)
      continue; // Not a flag. We don't care
    if (grfBooleans.contains(reg))
      continue; // Cannot use a flag register
    if (interval.maxID == -INT_MAX)
      continue; // Unused register
    if (freeNum != 0) {
      spill = interval;
      allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum]));
    }
    else {
      // Try to expire one register
      while (endID != ending.size()) {
        const GenRegInterval *toExpire = this->ending[endID];
        const ir::Register expiredReg = toExpire->reg;
        // Dead code produced by the insn selection -> we skip it
        if (toExpire->minID > toExpire->maxID) {
          endID++;
          continue;
        }
        // We cannot expire this interval and the next ones
        if (toExpire->maxID >= interval.minID)
          break;
        // Must be a boolean allocated with a flag register
        if (ctx.sel->getRegisterFamily(expiredReg) != ir::FAMILY_BOOL ||
            grfBooleans.contains(expiredReg)) {
          endID++;
          continue;
        }
        // We reuse a flag from a previous interval (the oldest one)
        auto it = allocatedFlags.find(expiredReg);
        GBE_ASSERT(it != allocatedFlags.end());
        freeFlags[freeNum++] = it->second;
        endID++;
        break;
      }
      // We need to spill one of the previous boolean values
      if (freeNum == 0) {
        GBE_ASSERT(uint16_t(spill.reg) != ir::RegisterFile::MAX_INDEX);
        // We spill the last inserted boolean and use its flag instead for
        // this one
        if (spill.maxID > interval.maxID) {
          auto it = allocatedFlags.find(spill.reg);
          GBE_ASSERT(it != allocatedFlags.end());
          allocatedFlags.insert(std::make_pair(reg, it->second));
          allocatedFlags.erase(spill.reg);
          grfBooleans.insert(spill.reg);
          spill = interval;
        }
        // We will use a GRF for the current register
        else
          grfBooleans.insert(reg);
      }
      else
        allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum]));
    }
  }
  // Now, we traverse all the selection instructions and we patch them to make
  // them use flag registers
  for (auto &block : *selection.blockList)
    for (auto &insn : block.insnList) {
      const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
      // Patch the source booleans
      for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
        const GenRegister selReg = insn.src(srcID);
        const ir::Register reg = selReg.reg();
        if (selReg.physical || ctx.sel->getRegisterFamily(reg) != ir::FAMILY_BOOL)
          continue;
        auto it = allocatedFlags.find(reg);
        if (it == allocatedFlags.end())
          continue;
        // Use a flag register for it now
        insn.src(srcID) = GenRegister::flag(1,it->second);
      }
      // Patch the destination booleans
      for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
        const GenRegister selReg = insn.dst(dstID);
        const ir::Register reg = selReg.reg();
        if (selReg.physical || ctx.sel->getRegisterFamily(reg) != ir::FAMILY_BOOL)
          continue;
        auto it = allocatedFlags.find(reg);
        if (it == allocatedFlags.end())
          continue;
        // Use a flag register for it now
        insn.dst(dstID) = GenRegister::flag(1,it->second);
      }
      // Patch the predicate now. Note that only compares actually modify it (it
      // is called a "conditional modifier"). The other instructions just read
      // it
      if (insn.state.physicalFlag == 0) {
        auto it = allocatedFlags.find(ir::Register(insn.state.flagIndex));
        // Just patch it if we can use a flag directly
        if (it != allocatedFlags.end()) {
          insn.state.flag = 1;
          insn.state.subFlag = it->second;
          insn.state.physicalFlag = 1;
        }
        // When we let the boolean in a GRF, use f0.1 as a temporary
        else {
          // Mov the GRF to the flag such that the flag can be read
          SelectionInstruction *mov0 = selection.create(SEL_OP_MOV,1,1);
          mov0->state = GenInstructionState(1);
          mov0->state.predicate = GEN_PREDICATE_NONE;
          mov0->state.noMask = 1;
          mov0->src(0) = GenRegister::uw1grf(ir::Register(insn.state.flagIndex));
          mov0->dst(0) = GenRegister::flag(0,1);
          // Do not prepend if the flag is not read (== used only as a
          // conditional modifier)
          if (insn.state.predicate != GEN_PREDICATE_NONE)
            insn.prepend(*mov0);
          // We can use f0.1 (our "backdoor" flag)
          insn.state.flag = 0;
          insn.state.subFlag = 1;
          insn.state.physicalFlag = 1;
          // Compare instructions update the flags so we must copy it back to
          // the GRF
          if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP) {
            SelectionInstruction *mov1 = selection.create(SEL_OP_MOV,1,1);
            mov1->state = mov0->state;
            mov1->dst(0) = mov0->src(0);
            mov1->src(0) = mov0->dst(0);
            insn.append(*mov1);
          }
        }
      }
    }
}
/*! Linear scan over the intervals sorted by starting point: give a GRF
 * location to every used register that has none yet. Registers that belong
 * to a vector are allocated together as one contiguous block. When no space
 * can be found the register (or the whole vector) is spilled, or false is
 * returned if no register was reserved for spilling (reservedReg == 0).
 * Return true otherwise
 */
bool GenRegAllocator::Opaque::allocateGRFs(Selection &selection) {
// Perform the linear scan allocator
const uint32_t regNum = ctx.sel->getRegNum();
for (uint32_t startID = 0; startID < regNum; ++startID) {
const GenRegInterval &interval = *this->starting[startID];
const ir::Register reg = interval.reg;
if (interval.maxID == -INT_MAX)
continue; // Unused register
if (RA.contains(reg))
continue; // already allocated
// Case 1: the register belongs to a vector, allocate all the registers in
// one piece
auto it = vectorMap.find(reg);
if (it != vectorMap.end()) {
const SelectionVector *vector = it->second.first;
// The registers of a vector are spilled together: if the first one is
// spilled, all of them are
if(spilled.contains(vector->reg[0].reg()))
continue;
const uint32_t simdWidth = ctx.getSimdWidth();
const ir::RegisterData regData = ctx.sel->getRegisterData(reg);
const ir::RegisterFamily family = regData.family;
const uint32_t typeSize = familyVectorSize[family];
// Each register of the vector takes simdWidth elements; the block is aligned
// on the size of one register
const uint32_t alignment = simdWidth*typeSize;
const uint32_t size = vector->regNum * alignment;
uint32_t grfOffset;
// An offset of 0 means that the allocation failed: expire intervals until it
// succeeds or nothing is left to expire
while ((grfOffset = ctx.allocate(size, alignment)) == 0) {
const bool success = this->expireGRF(interval);
if (success == false) {
// if no spill support, just return false, else simply spill the register
if(reservedReg == 0) return false;
break;
}
}
if(grfOffset == 0) {
// Spill all the registers in the SelectionVector.
// The tricky part is that reservedReg+1 is used as the scratch write
// payload, so the first register must be written to scratch memory first.
// spillReg() just appends the scratch write insn after the def, so to spill
// the first register first, spillReg() must be called last for
// vector->reg[0]: hence the loop in decreasing order
GBE_ASSERT(vector->regNum < RESERVED_REG_NUM_FOR_SPILL);
for(int i = vector->regNum-1; i >= 0; i--) {
spilled.insert(vector->reg[i].reg());
selection.spillReg(vector->reg[i].reg(), reservedReg);
}
continue;
}
// Lay out the registers of the vector one after the other in the block
for (uint32_t regID = 0; regID < vector->regNum; ++regID) {
const ir::Register reg = vector->reg[regID].reg();
GBE_ASSERT(RA.contains(reg) == false
&& ctx.sel->getRegisterData(reg).family == family);
RA.insert(std::make_pair(reg, grfOffset + alignment * regID));
ctx.splitBlock(grfOffset, alignment * regID); //splitBlock will not split if regID == 0
}
}
// Case 2: This is a regular scalar register, allocate it alone
else if (this->createGenReg(interval) == false) {
// Allocation failed: spill the register if spilling is available
if(reservedReg == 0) return false;
spilled.insert(reg);
selection.spillReg(reg, reservedReg);
}
}
return true;
}
/*! Entry point of the register allocation. In order:
 *  - reserve the registers used for spilling (SIMD8 only),
 *  - coalesce the vectors found by the instruction selection,
 *  - give a location to the payload registers and to the r0 sub-registers,
 *  - compute the live interval of every register,
 *  - allocate the flags, then the GRFs.
 *  Return true if success (i.e. the result of allocateGRFs)
 */
INLINE bool GenRegAllocator::Opaque::allocate(Selection &selection) {
  using namespace ir;
  // Spilling is only supported in SIMD8. reservedReg is kept in GEN_REG_SIZE
  // units and is 0 when no register is reserved
  if (ctx.getSimdWidth() == 8) {
    reservedReg = ctx.allocate(RESERVED_REG_NUM_FOR_SPILL * GEN_REG_SIZE, GEN_REG_SIZE);
    reservedReg /= GEN_REG_SIZE;
  } else {
    reservedReg = 0;
  }
  // Allocate all the vectors first since they need to be contiguous
  this->allocateVector(selection);
  // schedulePreRegAllocation(ctx, selection);
  // Now start the linear scan allocation: one interval per virtual register.
  // This is done after allocateVector since coalescing may replace registers
  // in the instructions
  for (uint32_t regID = 0; regID < ctx.sel->getRegNum(); ++regID)
    this->intervals.push_back(ir::Register(regID));
  // Allocate the special registers (only those which are actually used)
  this->allocatePayloadRegs();
  // Group and barrier IDs are always allocated by the hardware in r0
  RA.insert(std::make_pair(ocl::groupid0, 1*sizeof(float))); // r0.1
  RA.insert(std::make_pair(ocl::groupid1, 6*sizeof(float))); // r0.6
  RA.insert(std::make_pair(ocl::groupid2, 7*sizeof(float))); // r0.7
  RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2
  // block IP used to handle the mask in SW is always allocated
  // Compute the intervals
  int32_t insnID = 0;
  for (auto &block : *selection.blockList) {
    int32_t lastID = insnID;
    // Update the intervals of each used register. Note that we do not
    // register allocate R0, so we skip all sub-registers in r0
    for (auto &insn : block.insnList) {
      const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
      for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
        const GenRegister &selReg = insn.src(srcID);
        const ir::Register reg = selReg.reg();
        if (selReg.file != GEN_GENERAL_REGISTER_FILE ||
            reg == ir::ocl::barrierid ||
            reg == ir::ocl::groupid0 ||
            reg == ir::ocl::groupid1 ||
            reg == ir::ocl::groupid2)
          continue;
        this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnID);
        this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID);
      }
      for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
        const GenRegister &selReg = insn.dst(dstID);
        const ir::Register reg = selReg.reg();
        if (selReg.file != GEN_GENERAL_REGISTER_FILE ||
            reg == ir::ocl::barrierid ||
            reg == ir::ocl::groupid0 ||
            reg == ir::ocl::groupid1 ||
            reg == ir::ocl::groupid2)
          continue;
        this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnID);
        this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID);
      }
      // Flag registers can only go to src[0]: a boolean used as src[1] of a
      // logical operation must therefore live in a GRF
      const SelectionOpcode opcode = SelectionOpcode(insn.opcode);
      if (opcode == SEL_OP_AND || opcode == SEL_OP_OR || opcode == SEL_OP_XOR
          || opcode == SEL_OP_I64AND || opcode == SEL_OP_I64OR || opcode == SEL_OP_I64XOR) {
        if (insn.src(1).physical == 0) {
          const ir::Register reg = insn.src(1).reg();
          if (ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL)
            grfBooleans.insert(reg);
        }
      }
      // OK, a flag is used as a predicate or a conditional modifier
      if (insn.state.physicalFlag == 0) {
        const ir::Register reg = ir::Register(insn.state.flagIndex);
        this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnID);
        this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID);
      }
      lastID = insnID;
      insnID++;
    }
    // All registers alive at the end of the block must have their intervals
    // updated as well
    const ir::BasicBlock *bb = block.bb;
    const ir::Liveness::LiveOut &liveOut = ctx.getLiveOut(bb);
    for (auto reg : liveOut) {
      this->intervals[reg].minID = std::min(this->intervals[reg].minID, lastID);
      this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, lastID);
    }
  }
  // Sort both intervals in starting point and ending point increasing orders
  const uint32_t regNum = ctx.sel->getRegNum();
  this->starting.resize(regNum);
  this->ending.resize(regNum);
  for (uint32_t regID = 0; regID < regNum; ++regID)
    this->starting[regID] = this->ending[regID] = &intervals[regID];
  std::sort(this->starting.begin(), this->starting.end(), cmp<true>);
  std::sort(this->ending.begin(), this->ending.end(), cmp<false>);
  // Skip the registers that are never used: they come first in `ending` since
  // their ending point is -INT_MAX
  this->expiringID = 0;
  while (this->expiringID < regNum) {
    const GenRegInterval *interval = ending[this->expiringID];
    if (interval->maxID == -INT_MAX)
      this->expiringID++;
    else
      break;
  }
  // First we try to put all booleans registers into flags
  this->allocateFlags(selection);
  // Allocate all the GRFs now (regular register and boolean that are not in
  // flag registers)
  return this->allocateGRFs(selection);
}
/*! Print on std::cout the location of every allocated register (as
 *  g<register>.<sub-register>, counted in 4-byte units with 8 units per
 *  register), followed by the list of the spilled registers
 */
INLINE void GenRegAllocator::Opaque::outputAllocation(void) {
  std::cout << "## register allocation ##" << std::endl;
  for(auto &i : RA) {
    int vReg = (int)i.first;
    int offst = (int)i.second / sizeof(float);
    int reg = offst / 8;
    int subreg = offst % 8;
    std::cout << "%" << vReg << " g" << reg << "." << subreg << "D" << std::endl;
  }
  std::cout << "## spilled registers:" << std::endl;
  // Range-based loop (as for RA above): no need to spell the iterator type
  for(const auto &spilledReg : spilled)
    std::cout << (int)spilledReg << std::endl;
  std::cout << std::endl;
}
/*! Return a copy of `src` turned into a physical register located at byte
 *  offset `grfOffset`: the offset is split into a register number and an
 *  offset inside that register
 */
INLINE GenRegister setGenReg(const GenRegister &src, uint32_t grfOffset) {
  GenRegister physicalReg(src);
  physicalReg.nr = grfOffset / GEN_REG_SIZE;
  physicalReg.subnr = grfOffset % GEN_REG_SIZE;
  physicalReg.physical = 1;
  return physicalReg;
}
/*! Translate a selection register into the Gen register it was allocated to.
 *  Registers that are not in the general register file, and registers that
 *  are already physical, are returned unchanged
 */
INLINE GenRegister GenRegAllocator::Opaque::genReg(const GenRegister &reg) {
  if (reg.file != GEN_GENERAL_REGISTER_FILE)
    return reg;
  if (reg.physical == 1)
    return reg;
  // A virtual GRF must have been given a location by allocate()
  GBE_ASSERT(RA.contains(reg.reg()) != false);
  const uint32_t grfOffset = RA.find(reg.reg())->second;
  // subnr is only added to the location when it is flagged as physical
  const uint32_t suboffset = reg.subphysical ? reg.subnr : 0;
  const GenRegister dst = setGenReg(reg, grfOffset + suboffset);
  if (reg.quarter != 0)
    return GenRegister::Qn(dst, reg.quarter);
  return dst;
}
/////////////////////////////////////////////////////////////////////////////
// Register allocator public implementation
/////////////////////////////////////////////////////////////////////////////
/*! Create the private implementation bound to `ctx` */
GenRegAllocator::GenRegAllocator(GenContext &ctx) {
  this->opaque = GBE_NEW(GenRegAllocator::Opaque, ctx);
}
/*! Destroy the private implementation */
GenRegAllocator::~GenRegAllocator(void) {
  GBE_DELETE(this->opaque);
}
/*! Forward to the implementation. Return true if the allocation succeeded */
bool GenRegAllocator::allocate(Selection &selection) {
  return this->opaque->allocate(selection);
}
/*! Forward to the implementation: virtual to physical register translation */
GenRegister GenRegAllocator::genReg(const GenRegister &reg) {
  return this->opaque->genReg(reg);
}
/*! Forward to the implementation: print the register allocation */
void GenRegAllocator::outputAllocation(void) {
  this->opaque->outputAllocation();
}
} /* namespace gbe */
Release_v0.3/backend/src/backend/gen_reg_allocation.hpp 0000664 0000000 0000000 00000004045 12231421770 0023277 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file gen_reg_allocation.hpp
* \author Benjamin Segovia
*/
#ifndef __GBE_GEN_REG_ALLOCATION_HPP__
#define __GBE_GEN_REG_ALLOCATION_HPP__
#include "ir/register.hpp"
#include "backend/gen_register.hpp"
namespace gbe
{
class Selection; // Pre-register allocation code generation
class GenRegister; // Pre-register allocation Gen register
struct GenRegInterval; // Liveness interval for each register
class GenContext; // Gen specific context
/*! Register allocate (i.e. virtual to physical register mapping). The actual
 *  work is done by the private Opaque class defined in the cpp file.
 *  NOTE(review): the class owns `opaque` but declares no copy operation, so
 *  copying an allocator looks unsafe -- confirm it is never copied
 */
class GenRegAllocator
{
public:
  /*! Initialize the register allocator */
  GenRegAllocator(GenContext &ctx);
  /*! Release all taken resources */
  ~GenRegAllocator(void);
  /*! Perform the register allocation. Return true if success */
  bool allocate(Selection &selection);
  /*! Virtual to physical translation */
  GenRegister genReg(const GenRegister &reg);
  /*! Output the register allocation */
  void outputAllocation(void);
private:
  /*! Actual implementation of the register allocator (use Pimpl) */
  class Opaque;
  /*! Created and destroyed in cpp */
  Opaque *opaque;
  /*! Use custom allocator */
  GBE_CLASS(GenRegAllocator);
};
} /* namespace gbe */
#endif /* __GBE_GEN_REG_ALLOCATION_HPP__ */
Release_v0.3/backend/src/backend/gen_register.hpp 0000664 0000000 0000000 00000076537 12231421770 0022160 0 ustar 00root root 0000000 0000000 /*
* Copyright 2012 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell
*/
/**
* \file gen_register.hpp
* \author Benjamin Segovia
*/
#ifndef __GEN_REGISTER_HPP__
#define __GEN_REGISTER_HPP__
#include "backend/gen_defs.hpp"
#include "ir/register.hpp"
#include "sys/platform.hpp"
namespace gbe
{
/*! Size in bytes of one element of the given Gen type (GEN_TYPE_*). An unknown
 *  type triggers an assertion and yields 0
 */
INLINE int typeSize(uint32_t type) {
  int bytes = 0;
  switch(type) {
    // 64 bits types
    case GEN_TYPE_DF: case GEN_TYPE_UL: case GEN_TYPE_L:
      bytes = 8;
      break;
    // 32 bits types
    case GEN_TYPE_UD: case GEN_TYPE_D: case GEN_TYPE_F:
      bytes = 4;
      break;
    // 16 bits types
    case GEN_TYPE_UW: case GEN_TYPE_W:
      bytes = 2;
      break;
    // 8 bits types
    case GEN_TYPE_UB: case GEN_TYPE_B:
      bytes = 1;
      break;
    default:
      assert(0);
      break;
  }
  return bytes;
}
/*! Convert an encoded hstride into a number of elements: 0 stays 0 and the
 *  encodings 1..5 map to 1, 2, 4, 8 and 16. Any other value triggers an
 *  assertion and yields 0
 */
INLINE uint32_t stride(uint32_t stride) {
  if (stride == 0)
    return 0;
  // Encodings 1..5 are successive powers of two starting at 1
  if (stride <= 5)
    return 1u << (stride - 1);
  assert(0);
  return 0;
}
/*! State attached to an instruction (execution width, masking, predication,
 *  flag, saturation...). The flag can be either physical (i.e. a real Gen
 *  flag) or a virtual boolean register: the flag register allocation turns
 *  all virtual boolean registers into flag registers
 */
class GenInstructionState
{
public:
  /*! Default state for the given SIMD width: first quarter, no nibble
   *  control, accumulator write disabled, masking enabled, normal predicate
   *  (not inverted) on the physical flag 0.0 and no saturation
   */
  INLINE GenInstructionState(uint32_t simdWidth = 8) {
    // Flag selection
    this->physicalFlag = 1;
    this->flag = 0;
    this->subFlag = 0;
    this->flagIndex = 0;
    // Execution control
    this->execWidth = simdWidth;
    this->quarterControl = GEN_COMPRESSION_Q1;
    this->nibControl = 0;
    this->accWrEnable = 0;
    this->noMask = 0;
    // Predication and saturation
    this->predicate = GEN_PREDICATE_NORMAL;
    this->inversePredicate = 0;
    this->saturate = GEN_MATH_SATURATE_NONE;
  }
  uint32_t physicalFlag:1; //!< Physical or virtual flag register
  uint32_t flag:1; //!< Only if physical flag
  uint32_t subFlag:1; //!< Only if physical flag
  uint32_t flagIndex:16; //!< Only if virtual flag (index of the register)
  uint32_t execWidth:5; //!< Execution width (set from the SIMD width)
  uint32_t quarterControl:1; //!< Quarter selection (see chooseNib)
  uint32_t nibControl:1; //!< Nibble selection (see chooseNib)
  uint32_t accWrEnable:1;
  uint32_t noMask:1;
  uint32_t predicate:4;
  uint32_t inversePredicate:1;
  uint32_t saturate:1;
  /*! Select one nibble out of four (0 to 3): the high bit of `nib` goes to the
   *  quarter control and the low bit to the nibble control
   */
  void chooseNib(int nib) {
    if (nib >= 0 && nib <= 3) {
      quarterControl = nib / 2;
      nibControl = nib % 2;
    } else {
      NOT_IMPLEMENTED;
    }
  }
  /*! Use the physical flag register nr.subnr */
  void useFlag(int nr, int subnr) {
    this->flag = nr;
    this->subFlag = subnr;
    this->physicalFlag = 1;
  }
};
/*! This is a book-keeping structure used to encode both virtual and physical
 *  registers.
 *
 *  A register is either virtual (physical == 0, value.reg holds the IR
 *  register index) or physical (physical == 1, nr / subnr hold the hardware
 *  location, subnr being a byte offset). Immediates use the same structure
 *  with file == GEN_IMMEDIATE_VALUE and the payload stored in `value`.
 *
 *  All the static helpers take and return registers by value: they never
 *  modify their argument in place.
 */
class GenRegister
{
public:
  /*! Empty constructor. Every field is left uninitialized: callers must set
   *  all the fields they rely on before using the register.
   */
  INLINE GenRegister(void) {}
  /*! General constructor: builds a virtual register bound to the IR
   *  register `reg`, with the given type and region description.
   */
  INLINE GenRegister(uint32_t file,
                     ir::Register reg,
                     uint32_t type,
                     uint32_t vstride,
                     uint32_t width,
                     uint32_t hstride)
  {
    this->type = type;
    this->file = file;
    this->physical = 0;
    // Nothing is physical yet for a virtual register (previously left
    // uninitialized)
    this->subphysical = 0;
    this->value.reg = reg;
    this->negation = 0;
    this->absolute = 0;
    this->vstride = vstride;
    this->width = width;
    this->hstride = hstride;
    this->quarter = 0;
    this->nr = this->subnr = 0;
    this->address_mode = GEN_ADDRESS_DIRECT;
  }
  /*! For specific physical registers only. `subnr` is given in elements of
   *  `type` and is stored as a byte offset.
   */
  INLINE GenRegister(uint32_t file,
                     uint32_t nr,
                     uint32_t subnr,
                     uint32_t type,
                     uint32_t vstride,
                     uint32_t width,
                     uint32_t hstride)
  {
    this->type = type;
    this->file = file;
    this->nr = nr;
    this->physical = 1;
    // subnr is a real hardware sub-register here (previously left
    // uninitialized)
    this->subphysical = 1;
    this->subnr = subnr * typeSize(type);
    this->negation = 0;
    this->absolute = 0;
    this->vstride = vstride;
    this->width = width;
    this->hstride = hstride;
    this->quarter = 0;
    this->address_mode = GEN_ADDRESS_DIRECT;
  }
  /*! Return the IR virtual register */
  INLINE ir::Register reg(void) const { return ir::Register(value.reg); }
  /*! For immediates or virtual register */
  union {
    double df;
    float f;
    int32_t d;
    uint32_t ud;
    uint16_t reg;
    int64_t i64;
  } value;
  uint32_t nr:8;          //!< Just for some physical registers (acc, null)
  uint32_t subnr:8;       //!< Idem
  uint32_t physical:1;    //!< 1 if physical, 0 otherwise
  uint32_t subphysical:1; //!< 1 if subnr is physical, 0 otherwise
  uint32_t type:4;        //!< Gen type
  uint32_t file:2;        //!< Register file
  uint32_t negation:1;    //!< For source
  uint32_t absolute:1;    //!< For source
  uint32_t vstride:4;     //!< Vertical stride
  uint32_t width:3;       //!< Width
  uint32_t hstride:2;     //!< Horizontal stride
  uint32_t quarter:1;     //!< To choose which part we want (Q1 / Q2)
  uint32_t address_mode:1; //!< direct or indirect
  /*! Return a copy of reg moved by nr registers and subnr bytes */
  static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) {
    GenRegister r = reg;
    r.nr += nr;
    r.subnr += subnr;
    return r;
  }
  /*! True for a 64-bit integer (L / UL) living in the general register file */
  INLINE bool isint64(void) const {
    return (type == GEN_TYPE_UL || type == GEN_TYPE_L) &&
           file == GEN_GENERAL_REGISTER_FILE;
  }
  /*! True for a double immediate */
  INLINE bool isimmdf(void) const {
    return type == GEN_TYPE_DF && file == GEN_IMMEDIATE_VALUE;
  }
  /*! View of a 64-bit integer register shifted by 4 bytes relative to
   *  bottom_half()
   */
  INLINE GenRegister top_half(void) const {
    GenRegister r = bottom_half();
    r.subnr += 4;
    return r;
  }
  /*! 32-bit view (D for L, UD for UL) of a 64-bit integer register, using a
   *  horizontal stride of 2 so that every other dword is addressed
   */
  INLINE GenRegister bottom_half(void) const {
    GBE_ASSERT(isint64());
    GenRegister r = *this;
    r.type = type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D;
    r.hstride = GEN_HORIZONTAL_STRIDE_2;
    r.vstride = GEN_VERTICAL_STRIDE_16;
    return r;
  }
  /*! True for a signed integer (B / W / D / L) in the general register file */
  INLINE bool is_signed_int(void) const {
    const bool signedType = type == GEN_TYPE_B || type == GEN_TYPE_W ||
                            type == GEN_TYPE_D || type == GEN_TYPE_L;
    return signedType && file == GEN_GENERAL_REGISTER_FILE;
  }
  /*! True for a double living in the general register file */
  INLINE bool isdf(void) const {
    return type == GEN_TYPE_DF && file == GEN_GENERAL_REGISTER_FILE;
  }
  /*! Flag register number: the low 4 bits of nr (see flag()) */
  INLINE int flag_nr(void) const {
    return nr & 15;
  }
  /*! Flag sub-register number: subnr converted back from bytes to elements */
  INLINE int flag_subnr(void) const {
    return subnr / typeSize(type);
  }
  /*! Same register with a horizontal stride of 2 */
  static INLINE GenRegister h2(GenRegister reg) {
    GenRegister r = reg;
    r.hstride = GEN_HORIZONTAL_STRIDE_2;
    return r;
  }
  /*! Select quarter `quarter` of a virtual register. Scalar registers
   *  (horizontal stride 0) are returned unchanged.
   */
  static INLINE GenRegister QnVirtual(GenRegister reg, uint32_t quarter) {
    GBE_ASSERT(reg.physical == 0);
    if (reg.hstride == GEN_HORIZONTAL_STRIDE_0) // scalar register
      return reg;
    reg.quarter = quarter;
    return reg;
  }
  /*! Select quarter `quarter` of a physical register by advancing nr / subnr
   *  by 8 * quarter elements (taking the horizontal stride into account).
   *  Scalar registers (horizontal stride 0) are returned unchanged.
   */
  static INLINE GenRegister QnPhysical(GenRegister reg, uint32_t quarter) {
    GBE_ASSERT(reg.physical);
    if (reg.hstride == GEN_HORIZONTAL_STRIDE_0) // scalar register
      return reg;
    const uint32_t typeSz = typeSize(reg.type);
    const uint32_t horizontal = stride(reg.hstride);
    const uint32_t grfOffset = reg.nr*GEN_REG_SIZE + reg.subnr;
    const uint32_t nextOffset = grfOffset + 8*quarter*horizontal*typeSz;
    reg.nr = nextOffset / GEN_REG_SIZE;
    reg.subnr = (nextOffset % GEN_REG_SIZE);
    return reg;
  }
  /*! Select quarter `quarter` of a physical or virtual register */
  static INLINE GenRegister Qn(GenRegister reg, uint32_t quarter) {
    if (reg.physical)
      return QnPhysical(reg, quarter);
    else
      return QnVirtual(reg, quarter);
  }
  /* Virtual float vectors. Note that vec16 and vec8 describe the same
   * <8;8,1> region.
   */
  static INLINE GenRegister vec16(uint32_t file, ir::Register reg) {
    return GenRegister(file,
                       reg,
                       GEN_TYPE_F,
                       GEN_VERTICAL_STRIDE_8,
                       GEN_WIDTH_8,
                       GEN_HORIZONTAL_STRIDE_1);
  }
  static INLINE GenRegister vec8(uint32_t file, ir::Register reg) {
    return GenRegister(file,
                       reg,
                       GEN_TYPE_F,
                       GEN_VERTICAL_STRIDE_8,
                       GEN_WIDTH_8,
                       GEN_HORIZONTAL_STRIDE_1);
  }
  static INLINE GenRegister vec4(uint32_t file, ir::Register reg) {
    return GenRegister(file,
                       reg,
                       GEN_TYPE_F,
                       GEN_VERTICAL_STRIDE_4,
                       GEN_WIDTH_4,
                       GEN_HORIZONTAL_STRIDE_1);
  }
  static INLINE GenRegister vec2(uint32_t file, ir::Register reg) {
    return GenRegister(file,
                       reg,
                       GEN_TYPE_F,
                       GEN_VERTICAL_STRIDE_2,
                       GEN_WIDTH_2,
                       GEN_HORIZONTAL_STRIDE_1);
  }
  static INLINE GenRegister vec1(uint32_t file, ir::Register reg) {
    return GenRegister(file,
                       reg,
                       GEN_TYPE_F,
                       GEN_VERTICAL_STRIDE_0,
                       GEN_WIDTH_1,
                       GEN_HORIZONTAL_STRIDE_0);
  }
  /*! Same register seen with another type */
  static INLINE GenRegister retype(GenRegister reg, uint32_t type) {
    reg.type = type;
    return reg;
  }
  /* Typed virtual registers built on top of the float vectors */
  static INLINE GenRegister df16(uint32_t file, ir::Register reg) {
    return retype(vec16(file, reg), GEN_TYPE_DF);
  }
  static INLINE GenRegister df8(uint32_t file, ir::Register reg) {
    return retype(vec8(file, reg), GEN_TYPE_DF);
  }
  static INLINE GenRegister df1(uint32_t file, ir::Register reg) {
    return retype(vec1(file, reg), GEN_TYPE_DF);
  }
  static INLINE GenRegister ud16(uint32_t file, ir::Register reg) {
    return retype(vec16(file, reg), GEN_TYPE_UD);
  }
  static INLINE GenRegister ud8(uint32_t file, ir::Register reg) {
    return retype(vec8(file, reg), GEN_TYPE_UD);
  }
  static INLINE GenRegister ud1(uint32_t file, ir::Register reg) {
    return retype(vec1(file, reg), GEN_TYPE_UD);
  }
  static INLINE GenRegister d8(uint32_t file, ir::Register reg) {
    return retype(vec8(file, reg), GEN_TYPE_D);
  }
  static INLINE GenRegister uw16(uint32_t file, ir::Register reg) {
    return retype(vec16(file, reg), GEN_TYPE_UW);
  }
  static INLINE GenRegister uw8(uint32_t file, ir::Register reg) {
    return retype(vec8(file, reg), GEN_TYPE_UW);
  }
  static INLINE GenRegister uw1(uint32_t file, ir::Register reg) {
    return retype(vec1(file, reg), GEN_TYPE_UW);
  }
  static INLINE GenRegister ub16(uint32_t file, ir::Register reg) {
    return GenRegister(file,
                       reg,
                       GEN_TYPE_UB,
                       GEN_VERTICAL_STRIDE_16,
                       GEN_WIDTH_8,
                       GEN_HORIZONTAL_STRIDE_2);
  }
  static INLINE GenRegister ub8(uint32_t file, ir::Register reg) {
    return GenRegister(file,
                       reg,
                       GEN_TYPE_UB,
                       GEN_VERTICAL_STRIDE_16,
                       GEN_WIDTH_8,
                       GEN_HORIZONTAL_STRIDE_2);
  }
  static INLINE GenRegister ub1(uint32_t file, ir::Register reg) {
    return retype(vec1(file, reg), GEN_TYPE_UB);
  }
  static INLINE GenRegister unpacked_uw(ir::Register reg) {
    return GenRegister(GEN_GENERAL_REGISTER_FILE,
                       reg,
                       GEN_TYPE_UW,
                       GEN_VERTICAL_STRIDE_16,
                       GEN_WIDTH_8,
                       GEN_HORIZONTAL_STRIDE_2);
  }
  static INLINE GenRegister unpacked_ub(ir::Register reg) {
    return GenRegister(GEN_GENERAL_REGISTER_FILE,
                       reg,
                       GEN_TYPE_UB,
                       GEN_VERTICAL_STRIDE_32,
                       GEN_WIDTH_8,
                       GEN_HORIZONTAL_STRIDE_4);
  }
  /*! Scalar immediate of the given type. The payload is zeroed so that the
   *  bytes not written by the typed imm* helpers are deterministic.
   */
  static INLINE GenRegister imm(uint32_t type) {
    GenRegister immediate(GEN_IMMEDIATE_VALUE,
                          0,
                          0,
                          type,
                          GEN_VERTICAL_STRIDE_0,
                          GEN_WIDTH_1,
                          GEN_HORIZONTAL_STRIDE_0);
    immediate.value.i64 = 0;
    return immediate;
  }
  static INLINE GenRegister immint64(int64_t i) {
    GenRegister immediate = imm(GEN_TYPE_L);
    immediate.value.i64 = i;
    return immediate;
  }
  static INLINE GenRegister immdf(double df) {
    GenRegister immediate = imm(GEN_TYPE_DF);
    immediate.value.df = df;
    return immediate;
  }
  static INLINE GenRegister immf(float f) {
    GenRegister immediate = imm(GEN_TYPE_F);
    immediate.value.f = f;
    return immediate;
  }
  static INLINE GenRegister immd(int d) {
    GenRegister immediate = imm(GEN_TYPE_D);
    immediate.value.d = d;
    return immediate;
  }
  static INLINE GenRegister immud(uint32_t ud) {
    GenRegister immediate = imm(GEN_TYPE_UD);
    immediate.value.ud = ud;
    return immediate;
  }
  /*! Unsigned word immediate: the 16-bit pattern is replicated in both
   *  halves of the 32-bit payload. The computation is done on uint32_t to
   *  avoid shifting a promoted int into its sign bit.
   */
  static INLINE GenRegister immuw(uint16_t uw) {
    GenRegister immediate = imm(GEN_TYPE_UW);
    const uint32_t word = uw;
    immediate.value.ud = word | (word << 16);
    return immediate;
  }
  /*! Signed word immediate: the 16-bit pattern is replicated in both halves
   *  of the 32-bit payload, like immuw does. The former `w | (w << 16)` was
   *  computed on a sign-extended int: for a negative w the upper half was
   *  forced to 0xffff and the shift of a negative value was undefined.
   *  NOTE(review): assumes the encoder wants the same replication as for UW
   *  immediates -- confirm against the encoder.
   */
  static INLINE GenRegister immw(int16_t w) {
    GenRegister immediate = imm(GEN_TYPE_W);
    const uint32_t word = uint16_t(w);
    immediate.value.ud = word | (word << 16);
    return immediate;
  }
  /*! Packed vector immediate (type V) with a <0;8,1> region */
  static INLINE GenRegister immv(uint32_t v) {
    GenRegister immediate = imm(GEN_TYPE_V);
    immediate.vstride = GEN_VERTICAL_STRIDE_0;
    immediate.width = GEN_WIDTH_8;
    immediate.hstride = GEN_HORIZONTAL_STRIDE_1;
    immediate.value.ud = v;
    return immediate;
  }
  /*! Packed vector immediate (type VF) with a <0;4,1> region */
  static INLINE GenRegister immvf(uint32_t v) {
    GenRegister immediate = imm(GEN_TYPE_VF);
    immediate.vstride = GEN_VERTICAL_STRIDE_0;
    immediate.width = GEN_WIDTH_4;
    immediate.hstride = GEN_HORIZONTAL_STRIDE_1;
    immediate.value.ud = v;
    return immediate;
  }
  /*! Same as immvf with the four components given separately: v0 goes to
   *  the lowest byte, v3 to the highest one
   */
  static INLINE GenRegister immvf4(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3) {
    GenRegister immediate = imm(GEN_TYPE_VF);
    immediate.vstride = GEN_VERTICAL_STRIDE_0;
    immediate.width = GEN_WIDTH_4;
    immediate.hstride = GEN_HORIZONTAL_STRIDE_1;
    immediate.value.ud = ((v0 << 0) | (v1 << 8) | (v2 << 16) | (v3 << 24));
    return immediate;
  }
  /* Virtual registers in the general register file */
  static INLINE GenRegister f1grf(ir::Register reg) {
    return vec1(GEN_GENERAL_REGISTER_FILE, reg);
  }
  static INLINE GenRegister f2grf(ir::Register reg) {
    return vec2(GEN_GENERAL_REGISTER_FILE, reg);
  }
  static INLINE GenRegister f4grf(ir::Register reg) {
    return vec4(GEN_GENERAL_REGISTER_FILE, reg);
  }
  static INLINE GenRegister f8grf(ir::Register reg) {
    return vec8(GEN_GENERAL_REGISTER_FILE, reg);
  }
  static INLINE GenRegister f16grf(ir::Register reg) {
    return vec16(GEN_GENERAL_REGISTER_FILE, reg);
  }
  static INLINE GenRegister df1grf(ir::Register reg) {
    return df1(GEN_GENERAL_REGISTER_FILE, reg);
  }
  static INLINE GenRegister df8grf(ir::Register reg) {
    return df8(GEN_GENERAL_REGISTER_FILE, reg);
  }
  static INLINE GenRegister df16grf(ir::Register reg) {
    return df16(GEN_GENERAL_REGISTER_FILE, reg);
  }
  static INLINE GenRegister ud16grf(ir::Register reg) {
    return ud16(GEN_GENERAL_REGISTER_FILE, reg);
  }
  static INLINE GenRegister ud8grf(ir::Register reg) {
    return ud8(GEN_GENERAL_REGISTER_FILE, reg);
  }
  static INLINE GenRegister ud1grf(ir::Register reg) {
    return ud1(GEN_GENERAL_REGISTER_FILE, reg);
  }
  static INLINE GenRegister uw1grf(ir::Register reg) {
    return uw1(GEN_GENERAL_REGISTER_FILE, reg);
  }
  static INLINE GenRegister uw8grf(ir::Register reg) {
    return uw8(GEN_GENERAL_REGISTER_FILE, reg);
  }
  static INLINE GenRegister uw16grf(ir::Register reg) {
    return uw16(GEN_GENERAL_REGISTER_FILE, reg);
  }
  static INLINE GenRegister ub1grf(ir::Register reg) {
    return ub1(GEN_GENERAL_REGISTER_FILE, reg);
  }
  static INLINE GenRegister ub8grf(ir::Register reg) {
    return ub8(GEN_GENERAL_REGISTER_FILE, reg);
  }
  static INLINE GenRegister ub16grf(ir::Register reg) {
    return ub16(GEN_GENERAL_REGISTER_FILE, reg);
  }
  /* Architecture registers */
  static INLINE GenRegister null(void) {
    return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
                       GEN_ARF_NULL,
                       0,
                       GEN_TYPE_F,
                       GEN_VERTICAL_STRIDE_8,
                       GEN_WIDTH_8,
                       GEN_HORIZONTAL_STRIDE_1);
  }
  /*! True if reg is the null architecture register */
  static INLINE bool isNull(GenRegister reg) {
    return (reg.file == GEN_ARCHITECTURE_REGISTER_FILE
         && reg.nr == GEN_ARF_NULL);
  }
  static INLINE GenRegister acc(void) {
    return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
                       GEN_ARF_ACCUMULATOR,
                       0,
                       GEN_TYPE_F,
                       GEN_VERTICAL_STRIDE_8,
                       GEN_WIDTH_8,
                       GEN_HORIZONTAL_STRIDE_1);
  }
  static INLINE GenRegister ip(void) {
    return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
                       GEN_ARF_IP,
                       0,
                       GEN_TYPE_D,
                       GEN_VERTICAL_STRIDE_4,
                       GEN_WIDTH_1,
                       GEN_HORIZONTAL_STRIDE_0);
  }
  static INLINE GenRegister notification1(void) {
    return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
                       GEN_ARF_NOTIFICATION_COUNT,
                       0,
                       GEN_TYPE_UD,
                       GEN_VERTICAL_STRIDE_0,
                       GEN_WIDTH_1,
                       GEN_HORIZONTAL_STRIDE_0);
  }
  /*! Physical flag register nr.subnr (the flag number is or-ed into the ARF
   *  register number)
   */
  static INLINE GenRegister flag(uint32_t nr, uint32_t subnr) {
    return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
                       GEN_ARF_FLAG | nr,
                       subnr,
                       GEN_TYPE_UW,
                       GEN_VERTICAL_STRIDE_0,
                       GEN_WIDTH_1,
                       GEN_HORIZONTAL_STRIDE_0);
  }
  /*! Next register: the following nr for a physical register, the following
   *  quarter for a virtual one
   */
  static INLINE GenRegister next(GenRegister reg) {
    if (reg.physical)
      reg.nr++;
    else
      reg.quarter++;
    return reg;
  }
  /*! Build an indirectly addressed source. There is no IR register behind
   *  it, so it is flagged as physical and the fields the empty constructor
   *  leaves uninitialized (physical, subphysical, quarter, value) are set
   *  explicitly.
   */
  static INLINE GenRegister indirect(uint32_t type, uint32_t subnr, uint32_t width) {
    GenRegister reg;
    reg.type = type;
    reg.file = GEN_GENERAL_REGISTER_FILE;
    reg.address_mode = GEN_ADDRESS_REGISTER_INDIRECT_REGISTER;
    reg.physical = 1;
    reg.subphysical = 1;
    reg.value.i64 = 0;
    reg.width = width;
    reg.subnr = subnr;
    reg.nr = 0;
    reg.negation = 0;
    reg.absolute = 0;
    reg.vstride = 0;
    reg.hstride = 0;
    reg.quarter = 0;
    return reg;
  }
  /* Physical float vectors (subnr is given in elements) */
  static INLINE GenRegister vec16(uint32_t file, uint32_t nr, uint32_t subnr) {
    return GenRegister(file,
                       nr,
                       subnr,
                       GEN_TYPE_F,
                       GEN_VERTICAL_STRIDE_8,
                       GEN_WIDTH_8,
                       GEN_HORIZONTAL_STRIDE_1);
  }
  static INLINE GenRegister vec8(uint32_t file, uint32_t nr, uint32_t subnr) {
    return GenRegister(file,
                       nr,
                       subnr,
                       GEN_TYPE_F,
                       GEN_VERTICAL_STRIDE_8,
                       GEN_WIDTH_8,
                       GEN_HORIZONTAL_STRIDE_1);
  }
  static INLINE GenRegister vec4(uint32_t file, uint32_t nr, uint32_t subnr) {
    return GenRegister(file,
                       nr,
                       subnr,
                       GEN_TYPE_F,
                       GEN_VERTICAL_STRIDE_4,
                       GEN_WIDTH_4,
                       GEN_HORIZONTAL_STRIDE_1);
  }
  static INLINE GenRegister vec2(uint32_t file, uint32_t nr, uint32_t subnr) {
    return GenRegister(file,
                       nr,
                       subnr,
                       GEN_TYPE_F,
                       GEN_VERTICAL_STRIDE_2,
                       GEN_WIDTH_2,
                       GEN_HORIZONTAL_STRIDE_1);
  }
  static INLINE GenRegister vec1(uint32_t file, uint32_t nr, uint32_t subnr) {
    return GenRegister(file,
                       nr,
                       subnr,
                       GEN_TYPE_F,
                       GEN_VERTICAL_STRIDE_0,
                       GEN_WIDTH_1,
                       GEN_HORIZONTAL_STRIDE_0);
  }
  /*! Horizontal stride of reg in elements (strides 0, 1, 2 and 4 only) */
  static INLINE int hstride_size(GenRegister reg) {
    switch (reg.hstride) {
      case GEN_HORIZONTAL_STRIDE_0: return 0;
      case GEN_HORIZONTAL_STRIDE_1: return 1;
      case GEN_HORIZONTAL_STRIDE_2: return 2;
      case GEN_HORIZONTAL_STRIDE_4: return 4;
      default: NOT_IMPLEMENTED; return 0;
    }
  }
  /*! Move reg by delta elements of its own type, carrying whole registers
   *  from subnr into nr. Scalar registers (horizontal stride 0) are
   *  returned unchanged.
   */
  static INLINE GenRegister suboffset(GenRegister reg, uint32_t delta) {
    if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
      reg.subnr += delta * typeSize(reg.type);
      // Use the same register size constant as QnPhysical rather than a
      // literal. NOTE(review): assumes GEN_REG_SIZE is the 32 bytes that
      // used to be hard-coded here.
      reg.nr += reg.subnr / GEN_REG_SIZE;
      reg.subnr %= GEN_REG_SIZE;
    }
    return reg;
  }
  /* Typed physical registers */
  static INLINE GenRegister df16(uint32_t file, uint32_t nr, uint32_t subnr) {
    return retype(vec16(file, nr, subnr), GEN_TYPE_DF);
  }
  static INLINE GenRegister df8(uint32_t file, uint32_t nr, uint32_t subnr) {
    return retype(vec8(file, nr, subnr), GEN_TYPE_DF);
  }
  static INLINE GenRegister df1(uint32_t file, uint32_t nr, uint32_t subnr) {
    return retype(vec1(file, nr, subnr), GEN_TYPE_DF);
  }
  static INLINE GenRegister ud16(uint32_t file, uint32_t nr, uint32_t subnr) {
    return retype(vec16(file, nr, subnr), GEN_TYPE_UD);
  }
  static INLINE GenRegister ud8(uint32_t file, uint32_t nr, uint32_t subnr) {
    return retype(vec8(file, nr, subnr), GEN_TYPE_UD);
  }
  static INLINE GenRegister ud1(uint32_t file, uint32_t nr, uint32_t subnr) {
    return retype(vec1(file, nr, subnr), GEN_TYPE_UD);
  }
  static INLINE GenRegister d8(uint32_t file, uint32_t nr, uint32_t subnr) {
    return retype(vec8(file, nr, subnr), GEN_TYPE_D);
  }
  /* The word / byte variants below go through suboffset so that subnr is
   * scaled by the final type size and not by the float size used by vecN.
   */
  static INLINE GenRegister uw16(uint32_t file, uint32_t nr, uint32_t subnr) {
    return suboffset(retype(vec16(file, nr, 0), GEN_TYPE_UW), subnr);
  }
  static INLINE GenRegister uw8(uint32_t file, uint32_t nr, uint32_t subnr) {
    return suboffset(retype(vec8(file, nr, 0), GEN_TYPE_UW), subnr);
  }
  static INLINE GenRegister uw1(uint32_t file, uint32_t nr, uint32_t subnr) {
    return suboffset(retype(vec1(file, nr, 0), GEN_TYPE_UW), subnr);
  }
  static INLINE GenRegister ub16(uint32_t file, uint32_t nr, uint32_t subnr) {
    return GenRegister(file,
                       nr,
                       subnr,
                       GEN_TYPE_UB,
                       GEN_VERTICAL_STRIDE_16,
                       GEN_WIDTH_8,
                       GEN_HORIZONTAL_STRIDE_2);
  }
  static INLINE GenRegister ub8(uint32_t file, uint32_t nr, uint32_t subnr) {
    return GenRegister(file,
                       nr,
                       subnr,
                       GEN_TYPE_UB,
                       GEN_VERTICAL_STRIDE_16,
                       GEN_WIDTH_8,
                       GEN_HORIZONTAL_STRIDE_2);
  }
  static INLINE GenRegister ub1(uint32_t file, uint32_t nr, uint32_t subnr) {
    return suboffset(retype(vec1(file, nr, 0), GEN_TYPE_UB), subnr);
  }
  /* Physical registers in the general register file */
  static INLINE GenRegister f1grf(uint32_t nr, uint32_t subnr) {
    return vec1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
  }
  static INLINE GenRegister f2grf(uint32_t nr, uint32_t subnr) {
    return vec2(GEN_GENERAL_REGISTER_FILE, nr, subnr);
  }
  static INLINE GenRegister f4grf(uint32_t nr, uint32_t subnr) {
    return vec4(GEN_GENERAL_REGISTER_FILE, nr, subnr);
  }
  static INLINE GenRegister f8grf(uint32_t nr, uint32_t subnr) {
    return vec8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
  }
  static INLINE GenRegister f16grf(uint32_t nr, uint32_t subnr) {
    return vec16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
  }
  static INLINE GenRegister df16grf(uint32_t nr, uint32_t subnr) {
    return df16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
  }
  static INLINE GenRegister df8grf(uint32_t nr, uint32_t subnr) {
    return df8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
  }
  static INLINE GenRegister df1grf(uint32_t nr, uint32_t subnr) {
    return df1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
  }
  static INLINE GenRegister ud16grf(uint32_t nr, uint32_t subnr) {
    return ud16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
  }
  static INLINE GenRegister ud8grf(uint32_t nr, uint32_t subnr) {
    return ud8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
  }
  static INLINE GenRegister ud1grf(uint32_t nr, uint32_t subnr) {
    return ud1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
  }
  static INLINE GenRegister ud1arf(uint32_t nr, uint32_t subnr) {
    return ud1(GEN_ARCHITECTURE_REGISTER_FILE, nr, subnr);
  }
  static INLINE GenRegister uw1grf(uint32_t nr, uint32_t subnr) {
    return uw1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
  }
  static INLINE GenRegister uw8grf(uint32_t nr, uint32_t subnr) {
    return uw8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
  }
  static INLINE GenRegister uw16grf(uint32_t nr, uint32_t subnr) {
    return uw16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
  }
  static INLINE GenRegister ub1grf(uint32_t nr, uint32_t subnr) {
    return ub1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
  }
  static INLINE GenRegister ub8grf(uint32_t nr, uint32_t subnr) {
    return ub8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
  }
  static INLINE GenRegister ub16grf(uint32_t nr, uint32_t subnr) {
    return ub16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
  }
  static INLINE GenRegister unpacked_uw(uint32_t nr, uint32_t subnr) {
    return GenRegister(GEN_GENERAL_REGISTER_FILE,
                       nr,
                       subnr,
                       GEN_TYPE_UW,
                       GEN_VERTICAL_STRIDE_16,
                       GEN_WIDTH_8,
                       GEN_HORIZONTAL_STRIDE_2);
  }
  static INLINE GenRegister packed_ud(uint32_t nr, uint32_t subnr) {
    return GenRegister(GEN_GENERAL_REGISTER_FILE,
                       nr,
                       subnr,
                       GEN_TYPE_UD,
                       GEN_VERTICAL_STRIDE_8,
                       GEN_WIDTH_4,
                       GEN_HORIZONTAL_STRIDE_1);
  }
  static INLINE GenRegister unpacked_ud(uint32_t nr, uint32_t subnr) {
    return GenRegister(GEN_GENERAL_REGISTER_FILE,
                       nr,
                       subnr,
                       GEN_TYPE_UD,
                       GEN_VERTICAL_STRIDE_8,
                       GEN_WIDTH_4,
                       GEN_HORIZONTAL_STRIDE_2);
  }
  static INLINE GenRegister mask(uint32_t subnr) {
    return uw1(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_MASK, subnr);
  }
  static INLINE GenRegister addr1(uint32_t subnr) {
    return uw1(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_ADDRESS, subnr);
  }
  static INLINE GenRegister addr8(uint32_t subnr) {
    return uw8(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_ADDRESS, subnr);
  }
  /*! Negate a source. Registers get their negation modifier toggled;
   *  immediates (F, UD, D, UW and W only) have their payload negated.
   */
  static INLINE GenRegister negate(GenRegister reg) {
    if (reg.file != GEN_IMMEDIATE_VALUE)
      reg.negation ^= 1;
    else {
      if (reg.type == GEN_TYPE_F)
        reg.value.f = -reg.value.f;
      else if (reg.type == GEN_TYPE_UD)
        reg.value.ud = -reg.value.ud;
      else if (reg.type == GEN_TYPE_D)
        reg.value.d = -reg.value.d;
      else if (reg.type == GEN_TYPE_UW) {
        // Only the low word carries the value: rebuild the immediate from it
        const uint16_t uw = reg.value.ud & 0xffff;
        reg = GenRegister::immuw(-uw);
      } else if (reg.type == GEN_TYPE_W) {
        const uint16_t uw = reg.value.ud & 0xffff;
        reg = GenRegister::immw(-(int16_t)uw);
      } else
        NOT_SUPPORTED;
    }
    return reg;
  }
  /*! Absolute value of a source: sets the absolute modifier and clears the
   *  negation one
   */
  static INLINE GenRegister abs(GenRegister reg) {
    reg.absolute = 1;
    reg.negation = 0;
    return reg;
  }
  /*! Generate register encoding with run-time simdWidth. NAME forwards its
   *  arguments to SIMD16, SIMD8 or SIMD1 according to simdWidth; any other
   *  width is not implemented and falls back to SIMD1.
   */
#define DECL_REG_ENCODER(NAME, SIMD16, SIMD8, SIMD1) \
  template <typename... Args> \
  static INLINE GenRegister NAME(uint32_t simdWidth, Args... values) { \
    if (simdWidth == 16) \
      return SIMD16(values...); \
    else if (simdWidth == 8) \
      return SIMD8(values...); \
    else if (simdWidth == 1) \
      return SIMD1(values...); \
    else { \
      NOT_IMPLEMENTED; \
      return SIMD1(values...); \
    } \
  }
  DECL_REG_ENCODER(dfxgrf, df16grf, df8grf, df1grf);
  DECL_REG_ENCODER(fxgrf, f16grf, f8grf, f1grf);
  DECL_REG_ENCODER(uwxgrf, uw16grf, uw8grf, uw1grf);
  DECL_REG_ENCODER(udxgrf, ud16grf, ud8grf, ud1grf);
#undef DECL_REG_ENCODER
};
} /* namespace gbe */
#endif /* __GEN_REGISTER_HPP__ */
Release_v0.3/backend/src/backend/program.cpp 0000664 0000000 0000000 00000070445 12231421770 0021135 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file callback interface for the compiler
* \author Benjamin Segovia
*/
#include "program.h"
#include "program.hpp"
#include "gen_program.h"
#include "sys/platform.hpp"
#include "sys/cvar.hpp"
#include "ir/liveness.hpp"
#include "ir/value.hpp"
#include "ir/unit.hpp"
#include "llvm/llvm_to_gen.hpp"
#include "llvm/Config/config.h"
#include
#include
#include
#include
#include
#include
#include
/* Not defined for LLVM 3.0 */
#if !defined(LLVM_VERSION_MAJOR)
#define LLVM_VERSION_MAJOR 3
#endif /* !defined(LLVM_VERSION_MAJOR) */
/* Not defined for LLVM 3.0 */
#if !defined(LLVM_VERSION_MINOR)
#define LLVM_VERSION_MINOR 0
#endif /* !defined(LLVM_VERSION_MINOR) */
#include
#include
#include
#if LLVM_VERSION_MINOR <= 1
#include
#else
#include
#endif /* LLVM_VERSION_MINOR <= 1 */
#include
#include
#include
#include
#include
#if LLVM_VERSION_MINOR <= 2
#include
#else
#include
#endif /* LLVM_VERSION_MINOR <= 2 */
#include
#include
#include "src/GBEConfig.h"
namespace gbe {
/*! Create an empty kernel called `name`: no argument, no code, no sampler
 *  or image set and every size set to zero.
 */
Kernel::Kernel(const std::string &name) :
  name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), useSLM(false), slmSize(0), ctx(NULL), samplerSet(NULL), imageSet(NULL)
{
  // simdWidth and scratchSize are read back by serializeToBin / printStatus:
  // give them a defined value instead of leaving them indeterminate. They
  // are assigned here rather than in the initializer list because their
  // declaration order is not visible from this file.
  simdWidth = 0;
  scratchSize = 0;
}
/*! Release everything the kernel owns: the compilation context, the sampler
 *  and image sets (when present) and the argument array.
 */
Kernel::~Kernel(void) {
  if (ctx != NULL) {
    GBE_DELETE(ctx);
  }
  if (samplerSet != NULL) {
    GBE_DELETE(samplerSet);
  }
  if (imageSet != NULL) {
    GBE_DELETE(imageSet);
  }
  GBE_SAFE_DELETE_ARRAY(args);
}
/*! Look up the curbe offset of the patch identified by (type, subType).
 *  The lookup is a binary search, so `patches` has to be sorted with the
 *  PatchInfo ordering. Returns -1 when there is no such patch.
 */
int32_t Kernel::getCurbeOffset(gbe_curbe_type type, uint32_t subType) const {
  const PatchInfo key(type, subType);
  const auto pos = std::lower_bound(patches.begin(), patches.end(), key);
  // lower_bound gives the first patch that is not less than the key: it is
  // a match only if the key is not less than it either.
  if (pos != patches.end() && !(key < *pos))
    return pos->offset;
  return -1;
}
/*! Create an empty program, with no constant set yet */
Program::Program(void)
  : constantSet(NULL)
{
}
/*! Destroy every kernel owned by the program, then its constant set */
Program::~Program(void) {
  for (auto &entry : kernels) {
    GBE_DELETE(entry.second);
  }
  if (constantSet != NULL) {
    delete constantSet;
  }
}
BVAR(OCL_OUTPUT_GEN_IR, false);
bool Program::buildFromLLVMFile(const char *fileName, std::string &error) {
ir::Unit unit;
if (llvmToGen(unit, fileName) == false) {
error = std::string(fileName) + " not found";
return false;
}
this->buildFromUnit(unit, error);
return true;
}
/*! Compile every function of `unit` into a kernel owned by the program.
 *  The constant set of the unit is copied into the program first, and the
 *  unit is dumped on std::cout when OCL_OUTPUT_GEN_IR is set.
 *  \param unit  IR unit to compile
 *  \param error receives a message when a kernel cannot be compiled
 *  \return false if compileKernel yields no kernel for one of the functions
 *          (the kernels compiled before it stay in the program), true
 *          otherwise, including when the unit has no function
 */
bool Program::buildFromUnit(const ir::Unit &unit, std::string &error) {
  constantSet = new ir::ConstantSet(unit.getConstantSet());
  const auto &set = unit.getFunctionSet();
  const uint32_t kernelNum = set.size();
  if (OCL_OUTPUT_GEN_IR) std::cout << unit;
  if (kernelNum == 0) return true;
  for (const auto &pair : set) {
    const std::string &name = pair.first;
    Kernel *kernel = this->compileKernel(unit, name);
    // Do not dereference a missing kernel: report the failure to the caller
    if (kernel == NULL) {
      error = std::string("failed to compile kernel ") + name;
      return false;
    }
    kernel->setSamplerSet(pair.second->getSamplerSet());
    kernel->setImageSet(pair.second->getImageSet());
    kernels.insert(std::make_pair(name, kernel));
  }
  return true;
}
#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
/*! Write the program to `outs`. The stream layout is, in order: the begin
 *  magic, a flag telling whether a constant set follows, the constant set
 *  itself (if any), the number of kernels, every kernel, the end magic and
 *  finally the number of bytes written before that last field.
 *  \return the total number of bytes written, or 0 if the constant set or
 *          one of the kernels fails to serialize
 */
size_t Program::serializeToBin(std::ostream& outs) {
  size_t ret_size = 0;
  size_t ker_num = kernels.size();
  int has_constset = constantSet ? 1 : 0;

  OUT_UPDATE_SZ(magic_begin);

  // The presence flag is always written; the payload only when there is one
  OUT_UPDATE_SZ(has_constset);
  if (constantSet) {
    const size_t written = constantSet->serializeToBin(outs);
    if (written == 0)
      return 0;
    ret_size += written;
  }

  OUT_UPDATE_SZ(ker_num);
  for (const auto &entry : kernels) {
    const size_t written = entry.second->serializeToBin(outs);
    if (written == 0)
      return 0;
    ret_size += written;
  }

  OUT_UPDATE_SZ(magic_end);
  OUT_UPDATE_SZ(ret_size);
  return ret_size;
}
size_t Program::deserializeFromBin(std::istream& ins) {
size_t total_size = 0;
int has_constset = 0;
size_t ker_num;
uint32_t magic;
IN_UPDATE_SZ(magic);
if (magic != magic_begin)
return 0;
IN_UPDATE_SZ(has_constset);
if(has_constset) {
constantSet = new ir::ConstantSet;
size_t sz = constantSet->deserializeFromBin(ins);
if (sz == 0) {
return 0;
}
total_size += sz;
}
IN_UPDATE_SZ(ker_num);
for (size_t i = 0; i < ker_num; i++) {
size_t ker_serial_sz;
std::string ker_name; // Just a empty name here.
Kernel* ker = allocateKernel(ker_name);
if(!(ker_serial_sz = ker->deserializeFromBin(ins)))
return 0;
kernels.insert(std::make_pair(ker->getName(), ker));
total_size += ker_serial_sz;
}
IN_UPDATE_SZ(magic);
if (magic != magic_end)
return 0;
size_t total_bytes;
IN_UPDATE_SZ(total_bytes);
if (total_bytes + sizeof(total_size) != total_size)
return 0;
return total_size;
}
size_t Kernel::serializeToBin(std::ostream& outs) {
unsigned int i;
size_t ret_size = 0;
int has_samplerset = 0;
int has_imageset = 0;
OUT_UPDATE_SZ(magic_begin);
OUT_UPDATE_SZ(name.size());
outs.write(name.c_str(), name.size());
ret_size += sizeof(char)*name.size();
OUT_UPDATE_SZ(argNum);
for (i = 0; i < argNum; i++) {
KernelArgument& arg = args[i];
OUT_UPDATE_SZ(arg.type);
OUT_UPDATE_SZ(arg.size);
OUT_UPDATE_SZ(arg.bufSize);
}
OUT_UPDATE_SZ(patches.size());
for (auto patch : patches) {
unsigned int tmp;
tmp = patch.type;
OUT_UPDATE_SZ(tmp);
tmp = patch.subType;
OUT_UPDATE_SZ(tmp);
tmp = patch.offset;
OUT_UPDATE_SZ(tmp);
}
OUT_UPDATE_SZ(curbeSize);
OUT_UPDATE_SZ(simdWidth);
OUT_UPDATE_SZ(stackSize);
OUT_UPDATE_SZ(scratchSize);
OUT_UPDATE_SZ(useSLM);
OUT_UPDATE_SZ(slmSize);
/* samplers. */
if (samplerSet) {
has_samplerset = 1;
OUT_UPDATE_SZ(has_samplerset);
size_t sz = samplerSet->serializeToBin(outs);
if (!sz)
return 0;
ret_size += sz;
} else {
OUT_UPDATE_SZ(has_samplerset);
}
/* images. */
if (imageSet) {
has_imageset = 1;
OUT_UPDATE_SZ(has_imageset);
size_t sz = imageSet->serializeToBin(outs);
if (!sz)
return 0;
ret_size += sz;
} else {
OUT_UPDATE_SZ(has_imageset);
}
/* Code. */
const char * code = getCode();
OUT_UPDATE_SZ(getCodeSize());
outs.write(code, getCodeSize()*sizeof(char));
ret_size += getCodeSize()*sizeof(char);
OUT_UPDATE_SZ(magic_end);
OUT_UPDATE_SZ(ret_size);
return ret_size;
}
size_t Kernel::deserializeFromBin(std::istream& ins) {
size_t total_size = 0;
int has_samplerset = 0;
int has_imageset = 0;
size_t code_size = 0;
uint32_t magic = 0;
size_t patch_num = 0;
IN_UPDATE_SZ(magic);
if (magic != magic_begin)
return 0;
size_t name_len;
IN_UPDATE_SZ(name_len);
char* c_name = new char[name_len+1];
ins.read(c_name, name_len*sizeof(char));
total_size += sizeof(char)*name_len;
c_name[name_len] = 0;
name = c_name;
delete[] c_name;
IN_UPDATE_SZ(argNum);
args = GBE_NEW_ARRAY_NO_ARG(KernelArgument, argNum);
for (uint32_t i = 0; i < argNum; i++) {
KernelArgument& arg = args[i];
IN_UPDATE_SZ(arg.type);
IN_UPDATE_SZ(arg.size);
IN_UPDATE_SZ(arg.bufSize);
}
IN_UPDATE_SZ(patch_num);
for (uint32_t i = 0; i < patch_num; i++) {
unsigned int tmp;
PatchInfo patch;
IN_UPDATE_SZ(tmp);
patch.type = tmp;
IN_UPDATE_SZ(tmp);
patch.subType = tmp;
IN_UPDATE_SZ(tmp);
patch.offset = tmp;
patches.push_back(patch);
}
IN_UPDATE_SZ(curbeSize);
IN_UPDATE_SZ(simdWidth);
IN_UPDATE_SZ(stackSize);
IN_UPDATE_SZ(scratchSize);
IN_UPDATE_SZ(useSLM);
IN_UPDATE_SZ(slmSize);
IN_UPDATE_SZ(has_samplerset);
if (has_samplerset) {
samplerSet = GBE_NEW(ir::SamplerSet);
size_t sz = samplerSet->deserializeFromBin(ins);
if (sz == 0) {
return 0;
}
total_size += sz;
}
IN_UPDATE_SZ(has_imageset);
if (has_imageset) {
imageSet = GBE_NEW(ir::ImageSet);
size_t sz = imageSet->deserializeFromBin(ins);
if (sz == 0) {
return 0;
}
total_size += sz;
}
IN_UPDATE_SZ(code_size);
if (code_size) {
char* code = GBE_NEW_ARRAY_NO_ARG(char, code_size);
ins.read(code, code_size*sizeof(char));
total_size += sizeof(char)*code_size;
setCode(code, code_size);
}
IN_UPDATE_SZ(magic);
if (magic != magic_end)
return 0;
size_t total_bytes;
IN_UPDATE_SZ(total_bytes);
if (total_bytes + sizeof(total_size) != total_size)
return 0;
return total_size;
}
#undef OUT_UPDATE_SZ
#undef IN_UPDATE_SZ
/*! Dump a description of the program on `outs`: a begin banner, the constant
 *  set (if any), every kernel and an end banner. The banners are indented by
 *  `indent`, the content by four more columns.
 */
void Program::printStatus(int indent, std::ostream& outs) {
  const std::string margin = indent_to_str(indent);
  const int inner = indent + 4;

  outs << margin << "=============== Begin Program ===============" << "\n";
  if (constantSet != NULL)
    constantSet->printStatus(inner, outs);
  for (const auto &entry : kernels)
    entry.second->printStatus(inner, outs);
  outs << margin << "================ End Program ================" << "\n";
}
void Kernel::printStatus(int indent, std::ostream& outs) {
using namespace std;
string spaces = indent_to_str(indent);
string spaces_nl = indent_to_str(indent + 4);
int num;
outs << spaces << "+++++++++++ Begin Kernel +++++++++++" << "\n";
outs << spaces_nl << "Kernel Name: " << name << "\n";
outs << spaces_nl << " curbeSize: " << curbeSize << "\n";
outs << spaces_nl << " simdWidth: " << simdWidth << "\n";
outs << spaces_nl << " stackSize: " << stackSize << "\n";
outs << spaces_nl << " scratchSize: " << scratchSize << "\n";
outs << spaces_nl << " useSLM: " << useSLM << "\n";
outs << spaces_nl << " slmSize: " << slmSize << "\n";
outs << spaces_nl << " Argument Number is " << argNum << "\n";
for (uint32_t i = 0; i < argNum; i++) {
KernelArgument& arg = args[i];
outs << spaces_nl << " Arg " << i << ":\n";
outs << spaces_nl << " type value: "<< arg.type << "\n";
outs << spaces_nl << " size: "<< arg.size << "\n";
outs << spaces_nl << " bufSize: "<< arg.bufSize << "\n";
}
outs << spaces_nl << " Patches Number is " << patches.size() << "\n";
num = 0;
for (auto patch : patches) {
num++;
outs << spaces_nl << " patch " << num << ":\n";
outs << spaces_nl << " type value: "<< patch.type << "\n";
outs << spaces_nl << " subtype value: "<< patch.subType << "\n";
outs << spaces_nl << " offset: "<< patch.offset << "\n";
}
if (samplerSet) {
samplerSet->printStatus(indent + 4, outs);
}
if (imageSet) {
imageSet->printStatus(indent + 4, outs);
}
outs << spaces << "++++++++++++ End Kernel ++++++++++++" << "\n";
}
/*********************** End of Program class member function *************************/
/*! Destroy the program behind the opaque handle `gbeProgram` */
static void programDelete(gbe_program gbeProgram) {
  // The handle is the gbe::Program itself seen through an opaque type
  gbe::Program *toDelete = (gbe::Program*)(gbeProgram);
  GBE_SAFE_DELETE(toDelete);
}
static void buildModuleFromSource(const char* input, const char* output, std::string options) {
// Arguments to pass to the clang frontend
vector args;
bool bOpt = true;
bool bFastMath = false;
vector useless; //hold substrings to avoid c_str free
size_t start = 0, end = 0;
/* clang unsupport options:
-cl-denorms-are-zero, -cl-strict-aliasing
-cl-no-signed-zeros, -cl-fp32-correctly-rounded-divide-sqrt
all support options, refer to clang/include/clang/Driver/Options.inc
Maybe can filter these options to avoid warning
*/
while (end != std::string::npos) {
end = options.find(' ', start);
std::string str = options.substr(start, end - start);
start = end + 1;
if(str.size() == 0)
continue;
if(str == "-cl-opt-disable") bOpt = false;
if(str == "-cl-fast-relaxed-math") bFastMath = true;
useless.push_back(str);
args.push_back(str.c_str());
}
args.push_back("-mllvm");
args.push_back("-inline-threshold=200000");
#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
args.push_back("-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND");
#endif
args.push_back("-emit-llvm");
// XXX we haven't implement those builtin functions,
// so disable it currently.
args.push_back("-fno-builtin");
if(bOpt)
args.push_back("-O2");
if(bFastMath)
args.push_back("-D __FAST_RELAXED_MATH__=1");
#if LLVM_VERSION_MINOR <= 2
args.push_back("-triple");
args.push_back("nvptx");
#else
args.push_back("-x");
args.push_back("cl");
args.push_back("-triple");
args.push_back("spir");
#endif /* LLVM_VERSION_MINOR <= 2 */
args.push_back(input);
// The compiler invocation needs a DiagnosticsEngine so it can report problems
#if LLVM_VERSION_MINOR <= 1
args.push_back("-triple");
args.push_back("ptx32");
clang::TextDiagnosticPrinter *DiagClient =
new clang::TextDiagnosticPrinter(llvm::errs(), clang::DiagnosticOptions());
llvm::IntrusiveRefCntPtr DiagID(new clang::DiagnosticIDs());
clang::DiagnosticsEngine Diags(DiagID, DiagClient);
#else
args.push_back("-ffp-contract=off");
llvm::IntrusiveRefCntPtr DiagOpts = new clang::DiagnosticOptions();
clang::TextDiagnosticPrinter *DiagClient =
new clang::TextDiagnosticPrinter(llvm::errs(), &*DiagOpts);
llvm::IntrusiveRefCntPtr DiagID(new clang::DiagnosticIDs());
clang::DiagnosticsEngine Diags(DiagID, &*DiagOpts, DiagClient);
#endif /* LLVM_VERSION_MINOR <= 1 */
// Create the compiler invocation
llvm::OwningPtr CI(new clang::CompilerInvocation);
clang::CompilerInvocation::CreateFromArgs(*CI,
&args[0],
&args[0] + args.size(),
Diags);
// Create the compiler instance
clang::CompilerInstance Clang;
Clang.setInvocation(CI.take());
// Get ready to report problems
#if LLVM_VERSION_MINOR <= 2
Clang.createDiagnostics(args.size(), &args[0]);
#else
Clang.createDiagnostics();
#endif /* LLVM_VERSION_MINOR <= 2 */
if (!Clang.hasDiagnostics())
return;
// Set Language
clang::LangOptions & lang_opts = Clang.getLangOpts();
lang_opts.OpenCL = 1;
//llvm flags need command line parsing to take effect
if (!Clang.getFrontendOpts().LLVMArgs.empty()) {
unsigned NumArgs = Clang.getFrontendOpts().LLVMArgs.size();
const char **Args = new const char*[NumArgs + 2];
Args[0] = "clang (LLVM option parsing)";
for (unsigned i = 0; i != NumArgs; ++i){
Args[i + 1] = Clang.getFrontendOpts().LLVMArgs[i].c_str();
}
Args[NumArgs + 1] = 0;
llvm::cl::ParseCommandLineOptions(NumArgs + 1, Args);
delete [] Args;
}
// Create an action and make the compiler instance carry it out
llvm::OwningPtr Act(new clang::EmitLLVMOnlyAction());
sem_wait(&llvm_semaphore);
auto retVal = Clang.ExecuteAction(*Act);
sem_post(&llvm_semaphore);
if (!retVal)
return;
llvm::Module *module = Act->takeModule();
std::string ErrorInfo;
#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR > 3)
auto mode = llvm::sys::fs::F_Binary;
#else
auto mode = llvm::raw_fd_ostream::F_Binary;
#endif
llvm::raw_fd_ostream OS(output, ErrorInfo, mode);
//still write to temp file for code simply, otherwise need add another function.
//because gbe_program_new_from_llvm also be used by cl_program_create_from_llvm, can't be removed
//TODO: Pass module to llvmToGen, if use module, should return Act and use OwningPtr out of this funciton
llvm::WriteBitcodeToFile(module, OS);
OS.close();
}
/*! Text of the OpenCL standard library, defined in another translation unit.
 *  programNewFromSource writes it in front of the user source when no
 *  pre-compiled header is used.
 */
extern std::string ocl_stdlib_str;
/*! Switch read by programNewFromSource: when it evaluates to false the
 *  pre-compiled header is never used. NOTE(review): BVAR is a project macro;
 *  presumably it declares a boolean knob defaulting to true -- confirm.
 */
BVAR(OCL_USE_PCH, true);
/*! Build a program from OpenCL C source.
 *
 *  The source is written to a temporary .cl file (preceded by the OpenCL
 *  standard library text unless a pre-compiled header is used), compiled to a
 *  temporary bitcode file by buildModuleFromSource and finally turned into a
 *  program by gbe_program_new_from_llvm. Both temporary files are removed.
 *
 *  \param source     zero terminated source text (its length is taken with strlen)
 *  \param stringSize forwarded unchanged to gbe_program_new_from_llvm
 *  \param options    build options or NULL; any non-NULL value disables the
 *                    pre-compiled header
 *  \param err        forwarded unchanged to gbe_program_new_from_llvm
 *  \param errSize    forwarded unchanged to gbe_program_new_from_llvm
 *  \return whatever gbe_program_new_from_llvm returns
 */
static gbe_program programNewFromSource(const char *source,
                                        size_t stringSize,
                                        const char *options,
                                        char *err,
                                        size_t *errSize)
{
  char clStr[L_tmpnam+1], llStr[L_tmpnam+1];
  // tmpnam_r may return NULL; building a std::string from NULL is undefined
  const char *clBase = tmpnam_r(clStr); /* unsafe! */
  const char *llBase = tmpnam_r(llStr); /* unsafe! */
  FATAL_IF(clBase == NULL || llBase == NULL, "Failed to generate a temporary file name");
  const std::string clName = std::string(clBase) + ".cl";
  const std::string llName = std::string(llBase) + ".ll";
  std::string pchHeaderName;
  std::string clOpt;
  FILE *clFile = fopen(clName.c_str(), "w");
  FATAL_IF(clFile == NULL, "Failed to open temporary file");
  bool usePCH = false;
  if(options)
    clOpt += options;
  if (options || !OCL_USE_PCH) {
    /* Some build options may make the prebuilt pch header file
       incompatible with the XXX.cl source. Everything is rebuilt here. */
    usePCH = false;
  } else {
    // PCH_OBJECT_DIR is a ';' separated list; use the first readable entry
    std::string dirs = PCH_OBJECT_DIR;
    std::istringstream idirs(dirs);
    while (getline(idirs, pchHeaderName, ';')) {
      if(access(pchHeaderName.c_str(), R_OK) == 0) {
        usePCH = true;
        break;
      }
    }
  }
  if (usePCH) {
    clOpt += " -include-pch ";
    clOpt += pchHeaderName;
    clOpt += " ";
  } else
    fwrite(ocl_stdlib_str.c_str(), strlen(ocl_stdlib_str.c_str()), 1, clFile);
  // Write the source to the cl file
  fwrite(source, strlen(source), 1, clFile);
  fclose(clFile);
  buildModuleFromSource(clName.c_str(), llName.c_str(), clOpt.c_str());
  remove(clName.c_str());
  // Now build the program from llvm
  gbe_program p = gbe_program_new_from_llvm(llName.c_str(), stringSize, err, errSize);
  remove(llName.c_str());
  return p;
}
/*! C entry point: size reported by Program::getGlobalConstantSize().
 *  A NULL handle yields 0.
 */
static size_t programGetGlobalConstantSize(gbe_program gbeProgram) {
  if (!gbeProgram)
    return 0;
  return reinterpret_cast<const gbe::Program*>(gbeProgram)->getGlobalConstantSize();
}
/*! C entry point: forward `mem` to Program::getGlobalConstantData().
 *  Does nothing for a NULL handle.
 */
static void programGetGlobalConstantData(gbe_program gbeProgram, char *mem) {
  if (gbeProgram != NULL) {
    const gbe::Program *self = reinterpret_cast<const gbe::Program*>(gbeProgram);
    self->getGlobalConstantData(mem);
  }
}
/*! C entry point: number of kernels held by the program (0 for NULL). */
static uint32_t programGetKernelNum(gbe_program gbeProgram) {
  if (!gbeProgram)
    return 0;
  return reinterpret_cast<const gbe::Program*>(gbeProgram)->getKernelNum();
}
/*! C entry point: look a kernel up by name.
 *  \param gbeProgram program handle
 *  \param name       zero terminated kernel name
 *  \return the kernel handle, or NULL when the program handle or the name is
 *          NULL or when Program::getKernel() finds no kernel of that name
 */
static gbe_kernel programGetKernelByName(gbe_program gbeProgram, const char *name) {
  // std::string(NULL) is undefined behaviour, so reject a NULL name up front
  if (gbeProgram == NULL || name == NULL) return NULL;
  const gbe::Program *program = (gbe::Program*) gbeProgram;
  return (gbe_kernel) program->getKernel(std::string(name));
}
/*! C entry point: fetch a kernel by index via Program::getKernel(ID).
 *  A NULL program handle yields NULL.
 */
static gbe_kernel programGetKernel(const gbe_program gbeProgram, uint32_t ID) {
  if (!gbeProgram)
    return NULL;
  const gbe::Program *self = reinterpret_cast<const gbe::Program*>(gbeProgram);
  return reinterpret_cast<gbe_kernel>(self->getKernel(ID));
}
/*! C entry point: kernel name as returned by Kernel::getName() (NULL for a
 *  NULL handle).
 */
static const char *kernelGetName(gbe_kernel genKernel) {
  if (!genKernel)
    return NULL;
  return reinterpret_cast<const gbe::Kernel*>(genKernel)->getName();
}
/*! C entry point: pointer returned by Kernel::getCode() (NULL for a NULL
 *  handle).
 */
static const char *kernelGetCode(gbe_kernel genKernel) {
  if (!genKernel)
    return NULL;
  return reinterpret_cast<const gbe::Kernel*>(genKernel)->getCode();
}
/*! C entry point: value of Kernel::getCodeSize() (0 for a NULL handle). */
static size_t kernelGetCodeSize(gbe_kernel genKernel) {
  if (!genKernel)
    return 0u;
  return reinterpret_cast<const gbe::Kernel*>(genKernel)->getCodeSize();
}
/*! C entry point: number of kernel arguments (0 for a NULL handle). */
static uint32_t kernelGetArgNum(gbe_kernel genKernel) {
  if (!genKernel)
    return 0u;
  return reinterpret_cast<const gbe::Kernel*>(genKernel)->getArgNum();
}
/*! C entry point: size of argument `argID` as reported by
 *  Kernel::getArgSize() (0 for a NULL handle).
 */
static uint32_t kernelGetArgSize(gbe_kernel genKernel, uint32_t argID) {
  if (!genKernel)
    return 0u;
  return reinterpret_cast<const gbe::Kernel*>(genKernel)->getArgSize(argID);
}
/*! C entry point: type of argument `argID` as reported by
 *  Kernel::getArgType() (GBE_ARG_INVALID for a NULL handle).
 */
static gbe_arg_type kernelGetArgType(gbe_kernel genKernel, uint32_t argID) {
  if (!genKernel)
    return GBE_ARG_INVALID;
  return reinterpret_cast<const gbe::Kernel*>(genKernel)->getArgType(argID);
}
/*! C entry point: SIMD width of the kernel.
 *  NOTE(review): a NULL handle yields GBE_ARG_INVALID, which is an
 *  argument-type enumerator rather than a width; this looks like copy/paste
 *  from kernelGetArgType. Confirm what callers expect before changing it.
 */
static uint32_t kernelGetSIMDWidth(gbe_kernel genKernel) {
  if (!genKernel)
    return GBE_ARG_INVALID;
  return reinterpret_cast<const gbe::Kernel*>(genKernel)->getSIMDWidth();
}
/*! C entry point: result of Kernel::getCurbeOffset(type, subType).
 *  A NULL handle yields 0.
 */
static int32_t kernelGetCurbeOffset(gbe_kernel genKernel, gbe_curbe_type type, uint32_t subType) {
  if (!genKernel)
    return 0;
  return reinterpret_cast<const gbe::Kernel*>(genKernel)->getCurbeOffset(type, subType);
}
/*! C entry point: value of Kernel::getCurbeSize() (0 for a NULL handle). */
static int32_t kernelGetCurbeSize(gbe_kernel genKernel) {
  if (!genKernel)
    return 0;
  return reinterpret_cast<const gbe::Kernel*>(genKernel)->getCurbeSize();
}
/*! C entry point: value of Kernel::getStackSize() (0 for a NULL handle). */
static int32_t kernelGetStackSize(gbe_kernel genKernel) {
  if (!genKernel)
    return 0;
  return reinterpret_cast<const gbe::Kernel*>(genKernel)->getStackSize();
}
/*! C entry point: value of Kernel::getScratchSize() (0 for a NULL handle). */
static int32_t kernelGetScratchSize(gbe_kernel genKernel) {
  if (!genKernel)
    return 0;
  return reinterpret_cast<const gbe::Kernel*>(genKernel)->getScratchSize();
}
/*! C entry point: 1 when Kernel::getUseSLM() is true, 0 otherwise (also 0
 *  for a NULL handle).
 */
static int32_t kernelUseSLM(gbe_kernel genKernel) {
  if (!genKernel)
    return 0;
  if (reinterpret_cast<const gbe::Kernel*>(genKernel)->getUseSLM())
    return 1;
  return 0;
}
/*! C entry point: value of Kernel::getSLMSize() (0 for a NULL handle). */
static int32_t kernelGetSLMSize(gbe_kernel genKernel) {
  if (!genKernel)
    return 0;
  return reinterpret_cast<const gbe::Kernel*>(genKernel)->getSLMSize();
}
/*! C entry point: forward to Kernel::setConstBufSize(argID, sz).
 *  A NULL handle yields -1.
 */
static int32_t kernelSetConstBufSize(gbe_kernel genKernel, uint32_t argID, size_t sz) {
  if (!genKernel)
    return -1;
  return reinterpret_cast<gbe::Kernel*>(genKernel)->setConstBufSize(argID, sz);
}
/*! C entry point: value of Kernel::getSamplerSize() (0 for a NULL handle). */
static size_t kernelGetSamplerSize(gbe_kernel gbeKernel) {
  if (!gbeKernel)
    return 0;
  return reinterpret_cast<const gbe::Kernel*>(gbeKernel)->getSamplerSize();
}
/*! C entry point: forward `samplers` to Kernel::getSamplerData().
 *  Does nothing for a NULL handle.
 */
static void kernelGetSamplerData(gbe_kernel gbeKernel, uint32_t *samplers) {
  if (gbeKernel != NULL) {
    const gbe::Kernel *self = reinterpret_cast<const gbe::Kernel*>(gbeKernel);
    self->getSamplerData(samplers);
  }
}
/*! C entry point: value of Kernel::getImageSize() (0 for a NULL handle). */
static size_t kernelGetImageSize(gbe_kernel gbeKernel) {
  if (!gbeKernel)
    return 0;
  return reinterpret_cast<const gbe::Kernel*>(gbeKernel)->getImageSize();
}
/*! C entry point: forward `images` to Kernel::getImageData().
 *  Does nothing for a NULL handle.
 */
static void kernelGetImageData(gbe_kernel gbeKernel, ImageInfo *images) {
  if (gbeKernel != NULL) {
    const gbe::Kernel *self = reinterpret_cast<const gbe::Kernel*>(gbeKernel);
    self->getImageData(images);
  }
}
/*! File-local image base index; starts at 0 and is only touched through the
 *  two accessors below.
 */
static uint32_t gbeImageBaseIndex = 0;

/*! Store a new image base index. */
static void setImageBaseIndex(uint32_t baseIdx)
{
  gbeImageBaseIndex = baseIdx;
}

/*! Read back the image base index last stored (0 if none was stored). */
static uint32_t getImageBaseIndex(void)
{
  return gbeImageBaseIndex;
}
/*! C entry point: required work group size for dimension `dim`.
 *  Both arguments are ignored and 0 is always returned.
 */
static uint32_t kernelGetRequiredWorkGroupSize(gbe_kernel kernel, uint32_t dim)
{
  const uint32_t noRequirement = 0u;
  return noRequirement;
}
} /* namespace gbe */
/* Definitions of the exported callback pointers. Every pointer starts out as
 * NULL. The CallBackInitializer constructor in this file fills in all of them
 * except gbe_program_new_from_binary and gbe_program_new_from_llvm, which are
 * not assigned anywhere in the code visible here.
 * NOTE(review): presumably genSetupCallBacks() installs those two -- confirm.
 */
GBE_EXPORT_SYMBOL gbe_program_new_from_source_cb *gbe_program_new_from_source = NULL;
GBE_EXPORT_SYMBOL gbe_program_new_from_binary_cb *gbe_program_new_from_binary = NULL;
GBE_EXPORT_SYMBOL gbe_program_new_from_llvm_cb *gbe_program_new_from_llvm = NULL;
GBE_EXPORT_SYMBOL gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_size = NULL;
GBE_EXPORT_SYMBOL gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data = NULL;
GBE_EXPORT_SYMBOL gbe_program_delete_cb *gbe_program_delete = NULL;
GBE_EXPORT_SYMBOL gbe_program_get_kernel_num_cb *gbe_program_get_kernel_num = NULL;
GBE_EXPORT_SYMBOL gbe_program_get_kernel_by_name_cb *gbe_program_get_kernel_by_name = NULL;
GBE_EXPORT_SYMBOL gbe_program_get_kernel_cb *gbe_program_get_kernel = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_name_cb *gbe_kernel_get_name = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_code_cb *gbe_kernel_get_code = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_code_size_cb *gbe_kernel_get_code_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_arg_num_cb *gbe_kernel_get_arg_num = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_arg_size_cb *gbe_kernel_get_arg_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_arg_type_cb *gbe_kernel_get_arg_type = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_scratch_size_cb *gbe_kernel_get_scratch_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_set_const_buffer_size_cb *gbe_kernel_set_const_buffer_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_group_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_use_slm_cb *gbe_kernel_use_slm = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_slm_size_cb *gbe_kernel_get_slm_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data = NULL;
GBE_EXPORT_SYMBOL gbe_set_image_base_index_cb *gbe_set_image_base_index = NULL;
GBE_EXPORT_SYMBOL gbe_get_image_base_index_cb *gbe_get_image_base_index = NULL;
namespace gbe
{
/* Use pre-main to setup the call backs */
struct CallBackInitializer
{
CallBackInitializer(void) {
gbe_program_new_from_source = gbe::programNewFromSource;
gbe_program_get_global_constant_size = gbe::programGetGlobalConstantSize;
gbe_program_get_global_constant_data = gbe::programGetGlobalConstantData;
gbe_program_delete = gbe::programDelete;
gbe_program_get_kernel_num = gbe::programGetKernelNum;
gbe_program_get_kernel_by_name = gbe::programGetKernelByName;
gbe_program_get_kernel = gbe::programGetKernel;
gbe_kernel_get_name = gbe::kernelGetName;
gbe_kernel_get_code = gbe::kernelGetCode;
gbe_kernel_get_code_size = gbe::kernelGetCodeSize;
gbe_kernel_get_arg_num = gbe::kernelGetArgNum;
gbe_kernel_get_arg_size = gbe::kernelGetArgSize;
gbe_kernel_get_arg_type = gbe::kernelGetArgType;
gbe_kernel_get_simd_width = gbe::kernelGetSIMDWidth;
gbe_kernel_get_curbe_offset = gbe::kernelGetCurbeOffset;
gbe_kernel_get_curbe_size = gbe::kernelGetCurbeSize;
gbe_kernel_get_stack_size = gbe::kernelGetStackSize;
gbe_kernel_get_scratch_size = gbe::kernelGetScratchSize;
gbe_kernel_set_const_buffer_size = gbe::kernelSetConstBufSize;
gbe_kernel_get_required_work_group_size = gbe::kernelGetRequiredWorkGroupSize;
gbe_kernel_use_slm = gbe::kernelUseSLM;
gbe_kernel_get_slm_size = gbe::kernelGetSLMSize;
gbe_kernel_get_sampler_size = gbe::kernelGetSamplerSize;
gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData;
gbe_kernel_get_image_size = gbe::kernelGetImageSize;
gbe_kernel_get_image_data = gbe::kernelGetImageData;
gbe_get_image_base_index = gbe::getImageBaseIndex;
gbe_set_image_base_index = gbe::setImageBaseIndex;
genSetupCallBacks();
genSetupLLVMSemaphore();
}
};
static CallBackInitializer cbInitializer;
} /* namespace gbe */
Release_v0.3/backend/src/backend/program.h 0000664 0000000 0000000 00000021463 12231421770 0020576 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file program.h
* \author Benjamin Segovia
*
* C interface for the Gen kernels and programs (either real Gen ISA or Gen
* simulator). This is the only thing the run-time can see from the compiler
*/
#ifndef __GBE_PROGRAM_H__
#define __GBE_PROGRAM_H__
#include
#include
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/*! Opaque handle to a GBE program. The pointed-to structure is never defined
 *  in this header; the handle is only meaningful to the gbe_program_* call
 *  backs declared below.
 */
typedef struct _gbe_program *gbe_program;
/*! Opaque handle to a GBE kernel (i.e. one OCL function). The pointed-to
 *  structure is never defined in this header; the handle is only meaningful
 *  to the gbe_kernel_* call backs declared below.
 */
typedef struct _gbe_kernel *gbe_kernel;
/*! Kind of each kernel argument, as reported by gbe_kernel_get_arg_type */
enum gbe_arg_type {
GBE_ARG_VALUE = 0, // int, float and so on
GBE_ARG_GLOBAL_PTR = 1, // __global
GBE_ARG_CONSTANT_PTR = 2, // __constant
GBE_ARG_LOCAL_PTR = 3, // __local
GBE_ARG_IMAGE = 4, // image2d_t, image3d_t
GBE_ARG_SAMPLER = 5, // sampler_t
GBE_ARG_INVALID = 0xffffffff // sentinel: not a valid argument type
};
/*! Constant buffer values (i.e. values to set up in the constant buffer).
 *  This is the `type` argument of gbe_kernel_get_curbe_offset. The
 *  enumerators are numbered consecutively starting from 0.
 */
enum gbe_curbe_type {
GBE_CURBE_LOCAL_ID_X = 0,
GBE_CURBE_LOCAL_ID_Y,
GBE_CURBE_LOCAL_ID_Z,
GBE_CURBE_LOCAL_SIZE_X,
GBE_CURBE_LOCAL_SIZE_Y,
GBE_CURBE_LOCAL_SIZE_Z,
GBE_CURBE_GLOBAL_SIZE_X,
GBE_CURBE_GLOBAL_SIZE_Y,
GBE_CURBE_GLOBAL_SIZE_Z,
GBE_CURBE_GLOBAL_OFFSET_X,
GBE_CURBE_GLOBAL_OFFSET_Y,
GBE_CURBE_GLOBAL_OFFSET_Z,
GBE_CURBE_GROUP_NUM_X,
GBE_CURBE_GROUP_NUM_Y,
GBE_CURBE_GROUP_NUM_Z,
GBE_CURBE_WORK_DIM,
GBE_CURBE_SAMPLER_INFO,
GBE_CURBE_IMAGE_INFO,
GBE_CURBE_STACK_POINTER,
GBE_CURBE_KERNEL_ARGUMENT,
GBE_CURBE_EXTRA_ARGUMENT,
GBE_CURBE_BLOCK_IP,
GBE_CURBE_THREAD_NUM
};
/*! Sub-values for extra arguments.
 *  NOTE(review): this comment used to say that extra arguments use the
 *  negative range of sub-values, but both enumerators below are
 *  non-negative -- confirm which is intended.
 */
enum gbe_extra_argument {
GBE_STACK_BUFFER = 0, /* Give stack location in curbe */
GBE_CONSTANT_BUFFER = 1 /* constant buffer argument location in curbe */
};
/*! Per-image record filled in by gbe_kernel_get_image_data.
 *  NOTE(review): the field meanings are not visible from this header;
 *  presumably arg_idx is the kernel argument index, idx the image index and
 *  the *Slot fields are locations of the corresponding image properties --
 *  confirm against the code that fills this structure.
 */
typedef struct ImageInfo {
int32_t arg_idx;
int32_t idx;
int32_t wSlot;
int32_t hSlot;
int32_t depthSlot;
int32_t dataTypeSlot;
int32_t channelOrderSlot;
int32_t dimOrderSlot;
} ImageInfo;
/*! Set the image base index */
typedef void (gbe_set_image_base_index_cb)(uint32_t base_idx);
extern gbe_set_image_base_index_cb *gbe_set_image_base_index;
/*! Get the image base index last set */
typedef uint32_t (gbe_get_image_base_index_cb)();
extern gbe_get_image_base_index_cb *gbe_get_image_base_index;
/*! Get the size of defined images */
typedef size_t (gbe_kernel_get_image_size_cb)(gbe_kernel gbeKernel);
extern gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size;
/*! Get the content of defined images (written to the caller's array) */
typedef void (gbe_kernel_get_image_data_cb)(gbe_kernel gbeKernel, ImageInfo *images);
extern gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data;
/*! Create a new program from the given source code (zero terminated string).
 *  options may be NULL. err and err_size are output parameters for error
 *  reporting. NOTE(review): their exact contract is not visible from this
 *  header -- confirm against the implementation.
 */
typedef gbe_program (gbe_program_new_from_source_cb)(const char *source,
size_t stringSize,
const char *options,
char *err,
size_t *err_size);
extern gbe_program_new_from_source_cb *gbe_program_new_from_source;
/*! Create a new program from the given blob of `size` bytes */
typedef gbe_program (gbe_program_new_from_binary_cb)(const char *binary, size_t size);
extern gbe_program_new_from_binary_cb *gbe_program_new_from_binary;
/*! Create a new program from the given LLVM file */
typedef gbe_program (gbe_program_new_from_llvm_cb)(const char *fileName,
size_t string_size,
char *err,
size_t *err_size);
extern gbe_program_new_from_llvm_cb *gbe_program_new_from_llvm;
/*! Get the size of global constants */
typedef size_t (gbe_program_get_global_constant_size_cb)(gbe_program gbeProgram);
extern gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_size;
/*! Get the content of global constants (written to mem) */
typedef void (gbe_program_get_global_constant_data_cb)(gbe_program gbeProgram, char *mem);
extern gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data;
/*! Get the size of defined samplers */
typedef size_t (gbe_kernel_get_sampler_size_cb)(gbe_kernel gbeKernel);
extern gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size;
/*! Get the content of defined samplers (written to the caller's array) */
typedef void (gbe_kernel_get_sampler_data_cb)(gbe_kernel gbeKernel, uint32_t *samplers);
extern gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data;
/*! Destroy and deallocate the given program */
typedef void (gbe_program_delete_cb)(gbe_program);
extern gbe_program_delete_cb *gbe_program_delete;
/*! Get the number of functions (kernels) in the program */
typedef uint32_t (gbe_program_get_kernel_num_cb)(gbe_program);
extern gbe_program_get_kernel_num_cb *gbe_program_get_kernel_num;
/*! Get the kernel from its name */
typedef gbe_kernel (gbe_program_get_kernel_by_name_cb)(gbe_program, const char *name);
extern gbe_program_get_kernel_by_name_cb *gbe_program_get_kernel_by_name;
/*! Get the kernel from its ID */
typedef gbe_kernel (gbe_program_get_kernel_cb)(gbe_program, uint32_t ID);
extern gbe_program_get_kernel_cb *gbe_program_get_kernel;
/*! Get the kernel name */
typedef const char *(gbe_kernel_get_name_cb)(gbe_kernel);
extern gbe_kernel_get_name_cb *gbe_kernel_get_name;
/*! Get the kernel code. NOTE(review): this used to read "source code", but
 *  the C++ side documents Kernel::getCode as the instruction stream --
 *  confirm.
 */
typedef const char *(gbe_kernel_get_code_cb)(gbe_kernel);
extern gbe_kernel_get_code_cb *gbe_kernel_get_code;
/*! Get the size of the kernel code returned by gbe_kernel_get_code */
typedef size_t (gbe_kernel_get_code_size_cb)(gbe_kernel);
extern gbe_kernel_get_code_size_cb *gbe_kernel_get_code_size;
/*! Get the total number of arguments */
typedef uint32_t (gbe_kernel_get_arg_num_cb)(gbe_kernel);
extern gbe_kernel_get_arg_num_cb *gbe_kernel_get_arg_num;
/*! Get the size of the given argument */
typedef uint32_t (gbe_kernel_get_arg_size_cb)(gbe_kernel, uint32_t argID);
extern gbe_kernel_get_arg_size_cb *gbe_kernel_get_arg_size;
/*! Get the type of the given argument */
typedef enum gbe_arg_type (gbe_kernel_get_arg_type_cb)(gbe_kernel, uint32_t argID);
extern gbe_kernel_get_arg_type_cb *gbe_kernel_get_arg_type;
/*! Get the simd width for the kernel */
typedef uint32_t (gbe_kernel_get_simd_width_cb)(gbe_kernel);
extern gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width;
/*! Get the curbe size required by the kernel */
typedef int32_t (gbe_kernel_get_curbe_size_cb)(gbe_kernel);
extern gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size;
/*! Get the stack size (zero if no stack is required) */
typedef int32_t (gbe_kernel_get_stack_size_cb)(gbe_kernel);
extern gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size;
/*! Get the scratch size (zero if no scratch is required) */
typedef int32_t (gbe_kernel_get_scratch_size_cb)(gbe_kernel);
extern gbe_kernel_get_scratch_size_cb *gbe_kernel_get_scratch_size;
/*! Get the curbe offset where to put the data. Returns -1 if not required */
typedef int32_t (gbe_kernel_get_curbe_offset_cb)(gbe_kernel, enum gbe_curbe_type type, uint32_t sub_type);
extern gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset;
/*! Set the constant pointer arg size and return the cb offset in curbe */
typedef int32_t (gbe_kernel_set_const_buffer_size_cb)(gbe_kernel, uint32_t argID, size_t sz);
extern gbe_kernel_set_const_buffer_size_cb *gbe_kernel_set_const_buffer_size;
/*! Indicates if a work group size is required. Return the required width or 0
* if none
*/
typedef uint32_t (gbe_kernel_get_required_work_group_size_cb)(gbe_kernel, uint32_t dim);
extern gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_group_size;
/*! Says if SLM is used. Required to reconfigure the L3 complex */
typedef int32_t (gbe_kernel_use_slm_cb)(gbe_kernel);
extern gbe_kernel_use_slm_cb *gbe_kernel_use_slm;
/*! Get the SLM size needed for kernel local variables */
typedef int32_t (gbe_kernel_get_slm_size_cb)(gbe_kernel);
extern gbe_kernel_get_slm_size_cb *gbe_kernel_get_slm_size;
#ifdef __cplusplus
}
#endif /* __cplusplus */
#endif /* __GBE_PROGRAM_H__ */
Release_v0.3/backend/src/backend/program.hpp 0000664 0000000 0000000 00000023101 12231421770 0021125 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file program.hpp
* \author Benjamin Segovia
*/
#ifndef __GBE_PROGRAM_HPP__
#define __GBE_PROGRAM_HPP__
#include "backend/program.h"
#include "backend/context.hpp"
#include "ir/constant.hpp"
#include "ir/unit.hpp"
#include "ir/function.hpp"
#include "ir/sampler.hpp"
#include "sys/hash_map.hpp"
#include "sys/vector.hpp"
#include
// Forward declaration of the compilation unit type used by Program below
namespace gbe {
namespace ir {
class Unit; // Compilation unit. Contains the program to compile
} /* namespace ir */
} /* namespace gbe */
namespace gbe {
/*! Info for one kernel argument */
struct KernelArgument {
gbe_arg_type type; //!< Pointer, structure, image, regular value?
uint32_t size; //!< Size of the argument
uint32_t bufSize; //!< Constant buffer size (see Kernel::setConstBufSize)
};
/*! Stores the offset where to patch, packed together with the patch type and
 *  sub-type in one 64-bit set of bit-fields
 */
struct PatchInfo {
INLINE PatchInfo(gbe_curbe_type type, uint32_t subType = 0u, uint32_t offset = 0u) :
type(uint32_t(type)), subType(subType), offset(offset) {}
// The default constructor leaves all three fields uninitialized
INLINE PatchInfo(void) {}
uint64_t type : 16; //!< Type of the patch (see program.h for the list)
uint64_t subType : 32; //!< Optional sub-type of the patch (see program.h)
uint64_t offset : 16; //!< Optional offset to encode (only 16 bits are kept)
};
/*! Ordering used to sort PatchInfo so that it can be binary searched:
 *  compares type first and subType second; offset does not take part.
 */
INLINE bool operator< (PatchInfo i0, PatchInfo i1) {
  return i0.type == i1.type ? i0.subType < i1.subType
                            : i0.type < i1.type;
}
/*! Describe a compiled kernel. This is an abstract class: the accessors for
 *  the instruction stream are pure virtual and must be provided by a
 *  derived class.
 */
class Kernel : public NonCopyable, public Serializable
{
public:
/*! Create an empty kernel with the given name */
Kernel(const std::string &name);
/*! Destroy it */
virtual ~Kernel(void);
/*! Return the instruction stream (to be implemented) */
virtual const char *getCode(void) const = 0;
/*! Set the instruction stream (to be implemented) */
virtual const void setCode(const char *, size_t size) = 0;
/*! Return the instruction stream size (to be implemented) */
virtual size_t getCodeSize(void) const = 0;
/*! Get the kernel name */
INLINE const char *getName(void) const { return name.c_str(); }
/*! Return the number of arguments for the kernel call */
INLINE uint32_t getArgNum(void) const { return argNum; }
/*! Return the size of the given argument (0 if argID is out of range) */
INLINE uint32_t getArgSize(uint32_t argID) const {
return argID >= argNum ? 0u : args[argID].size;
}
/*! Return the type of the given argument (GBE_ARG_INVALID if argID is out
 *  of range)
 */
INLINE gbe_arg_type getArgType(uint32_t argID) const {
return argID >= argNum ? GBE_ARG_INVALID : args[argID].type;
}
/*! Get the offset where to patch. Returns -1 if no patch needed */
int32_t getCurbeOffset(gbe_curbe_type type, uint32_t subType) const;
/*! Get the curbe size required by the kernel */
INLINE uint32_t getCurbeSize(void) const { return this->curbeSize; }
/*! Return the size of the stack (zero if none) */
INLINE uint32_t getStackSize(void) const { return this->stackSize; }
/*! Return the size of the scratch memory needed (zero if none) */
INLINE uint32_t getScratchSize(void) const { return this->scratchSize; }
/*! Get the SIMD width for the kernel */
INLINE uint32_t getSIMDWidth(void) const { return this->simdWidth; }
/*! Says if SLM is needed for it */
INLINE bool getUseSLM(void) const { return this->useSLM; }
/*! Get the SLM size for kernel local variables */
INLINE uint32_t getSLMSize(void) const { return this->slmSize; }
/*! Set the constant buffer size of argument argID and return the result of
 *  Context::allocConstBuf for it. Returns -1 when argID is out of range,
 *  when the argument is not a __constant pointer, and also when sz equals
 *  the size already recorded (nothing is re-allocated in that case).
 */
int32_t setConstBufSize(uint32_t argID, size_t sz) {
if(argID >= argNum) return -1;
if(args[argID].type != GBE_ARG_CONSTANT_PTR) return -1;
if(args[argID].bufSize != sz) {
args[argID].bufSize = sz;
return ctx->allocConstBuf(argID);
}
return -1;
}
/*! Set sampler set (only the pointer is stored) */
void setSamplerSet(ir::SamplerSet *from) {
samplerSet = from;
}
/*! Get defined sampler size. samplerSet is dereferenced without a NULL
 *  check.
 */
size_t getSamplerSize(void) const { return samplerSet->getDataSize(); }
/*! Get defined sampler value array */
void getSamplerData(uint32_t *samplers) const { samplerSet->getData(samplers); }
/*! Set image set (only the pointer is stored) */
void setImageSet(ir::ImageSet * from) {
imageSet = from;
}
/*! Get defined image size. imageSet is dereferenced without a NULL check. */
size_t getImageSize(void) const { return imageSet->getDataSize(); }
/*! Get defined image value array */
void getImageData(ImageInfo *images) const { imageSet->getData(images); }
/*! Magic words that open and close the binary layout described below */
static const uint32_t magic_begin = TO_MAGIC('K', 'E', 'R', 'N');
static const uint32_t magic_end = TO_MAGIC('N', 'R', 'E', 'K');
/* format:
magic_begin |
name_size |
name |
arg_num |
args |
PatchInfo_num |
PatchInfo |
curbeSize |
simdWidth |
stackSize |
scratchSize |
useSLM |
slmSize |
samplers |
images |
code_size |
code |
magic_end
*/
/*! Implements the serialization. */
virtual size_t serializeToBin(std::ostream& outs);
virtual size_t deserializeFromBin(std::istream& ins);
virtual void printStatus(int indent, std::ostream& outs);
protected:
friend class Context; //!< Owns the kernels
std::string name; //!< Kernel name
KernelArgument *args; //!< Each argument
vector patches; //!< Indicates how to build the curbe
uint32_t argNum; //!< Number of function arguments
uint32_t curbeSize; //!< Size of the data to push
uint32_t simdWidth; //!< SIMD size for the kernel (lane number)
uint32_t stackSize; //!< Stack size (may be 0 if unused)
uint32_t scratchSize; //!< Scratch memory size (may be 0 if unused)
bool useSLM; //!< SLM requires a special HW config
uint32_t slmSize; //!< SLM size for kernel variables
Context *ctx; //!< Context saved after compilation, used to allocate the constant buffer curbe
ir::SamplerSet *samplerSet;//!< Copy from the corresponding function.
ir::ImageSet *imageSet; //!< Copy from the corresponding function.
GBE_CLASS(Kernel); //!< Use custom allocators
};
/*! Describe a compiled program: a set of kernels looked up by name plus the
 *  global constant data. This is an abstract class: compileKernel and
 *  allocateKernel must be provided by a derived class.
 */
class Program : public NonCopyable, public Serializable
{
public:
/*! Create an empty program */
Program(void);
/*! Destroy the program */
virtual ~Program(void);
/*! Get the number of kernels in the program */
uint32_t getKernelNum(void) const { return kernels.size(); }
/*! Get the kernel from its name (NULL if there is no such kernel) */
Kernel *getKernel(const std::string &name) const {
auto it = kernels.find(name);
if (it == kernels.end())
return NULL;
else
return it->second;
}
/*! Get the kernel from its ID, i.e. its position in the iteration order of
 *  the kernel container (NULL if ID is out of range)
 */
Kernel *getKernel(uint32_t ID) const {
uint32_t currID = 0;
Kernel *kernel = NULL;
for (const auto &pair : kernels) {
if (currID == ID) {
kernel = pair.second;
break;
}
currID++;
}
return kernel;
}
/*! Build a program from a ir::Unit */
bool buildFromUnit(const ir::Unit &unit, std::string &error);
/*! Builds a program from LLVM source code */
bool buildFromLLVMFile(const char *fileName, std::string &error);
/*! Builds a program from an OCL string */
bool buildFromSource(const char *source, std::string &error);
/*! Get size of the global constant arrays. constantSet is dereferenced
 *  without a NULL check.
 */
size_t getGlobalConstantSize(void) const { return constantSet->getDataSize(); }
/*! Get the content of global constant arrays */
void getGlobalConstantData(char *mem) const { constantSet->getData(mem); }
/*! Magic words that open and close the binary layout described below */
static const uint32_t magic_begin = TO_MAGIC('P', 'R', 'O', 'G');
static const uint32_t magic_end = TO_MAGIC('G', 'O', 'R', 'P');
/* format:
magic_begin |
constantSet_flag |
constSet_data |
kernel_num |
kernel_1 |
........ |
kernel_n |
magic_end |
total_size
*/
/*! Implements the serialization. */
virtual size_t serializeToBin(std::ostream& outs);
virtual size_t deserializeFromBin(std::istream& ins);
virtual void printStatus(int indent, std::ostream& outs);
protected:
/*! Compile a kernel */
virtual Kernel *compileKernel(const ir::Unit &unit, const std::string &name) = 0;
/*! Allocate an empty kernel. */
virtual Kernel *allocateKernel(const std::string &name) = 0;
/*! Kernels keyed by their name. NOTE(review): this used to say "sorted by
 *  their name"; whether iteration is actually name-ordered depends on the
 *  project's hash_map -- confirm.
 */
hash_map kernels;
/*! Global (constants) outside any kernel */
ir::ConstantSet *constantSet;
/*! Use custom allocators */
GBE_CLASS(Program);
};
} /* namespace gbe */
#endif /* __GBE_PROGRAM_HPP__ */
Release_v0.3/backend/src/builtin_vector_proto.def 0000664 0000000 0000000 00000021472 12231421770 0022322 0 ustar 00root root 0000000 0000000 ##math
gentype acos (gentype)
gentype acosh (gentype)
gentype acospi (gentype x)
gentype asin (gentype)
gentype asinh (gentype)
gentype asinpi (gentype x)
gentype atan (gentype y_over_x)
gentype atan2 (gentype y, gentype x)
gentype atanh (gentype)
gentype atanpi (gentype x)
gentype atan2pi (gentype y, gentype x)
gentype cbrt (gentype)
gentype ceil (gentype)
gentype copysign (gentype x, gentype y)
gentype cos (gentype)
gentype cosh (gentype)
gentype cospi (gentype x)
gentype erfc (gentype)
gentype erf (gentype)
gentype exp (gentype x)
gentype exp2 (gentype)
gentype exp10 (gentype)
gentype expm1 (gentype x)
gentype fabs (gentype)
gentype fdim (gentype x, gentype y)
gentype floor (gentype)
# XXX we use madd for fma
#gentype fma (gentype a, gentype b, gentype c)
gentype fmax (gentype x, gentype y)
gentypef fmax (gentypef x, float y)
gentyped fmax (gentyped x, double y)
gentype fmin (gentype x, gentype y)
gentypef fmin (gentypef x, float y)
gentyped fmin (gentyped x, double y)
gentype fmod (gentype x, gentype y)
gentype fract (gentype x, __global gentype *iptr)
gentype fract (gentype x, __local gentype *iptr)
gentype fract (gentype x, __private gentype *iptr)
floatn frexp (floatn x, __global intn *exp)
floatn frexp (floatn x, __local intn *exp)
floatn frexp (floatn x, __private intn *exp)
float frexp (float x, __global int *exp)
float frexp (float x, __local int *exp)
float frexp (float x, __private int *exp)
doublen frexp (doublen x, __global intn *exp)
doublen frexp (doublen x, __local intn *exp)
doublen frexp (doublen x, __private intn *exp)
double frexp (double x, __global int *exp)
double frexp (double x, __local int *exp)
double frexp (double x, __private int *exp)
gentype hypot (gentype x, gentype y)
intn ilogb (floatn x)
int ilogb (float x)
intn ilogb (doublen x)
int ilogb (double x)
floatn ldexp (floatn x, intn k)
floatn ldexp (floatn x, int k)
float ldexp (float x, int k)
doublen ldexp (doublen x, intn k)
doublen ldexp (doublen x, int k)
double ldexp (double x, int k)
gentype lgamma (gentype x)
floatn lgamma_r (floatn x, __global intn *signp)
floatn lgamma_r (floatn x, __local intn *signp)
floatn lgamma_r (floatn x, __private intn *signp)
float lgamma_r (float x, __global int *signp)
float lgamma_r (float x, __local int *signp)
float lgamma_r (float x, __private int *signp)
#doublen lgamma_r (doublen x, __global intn *signp)
#doublen lgamma_r (doublen x, __local intn *signp)
#doublen lgamma_r (doublen x, __private intn *signp)
#double lgamma_r (double x, __global int *signp)
#double lgamma_r (double x, __local int *signp)
#double lgamma_r (double x, __private int *signp)
gentype log (gentype)
gentype log2 (gentype)
gentype log10 (gentype)
gentype log1p (gentype x)
gentype logb (gentype x)
gentype mad (gentype a, gentype b, gentype c)
gentype maxmag (gentype x, gentype y)
gentype minmag (gentype x, gentype y)
gentype modf (gentype x, __global gentype *iptr)
gentype modf (gentype x, __local gentype *iptr)
gentype modf (gentype x, __private gentype *iptr)
floatn nan (uintn nancode)
float nan (uint nancode)
doublen nan (ulongn nancode)
double nan (ulong nancode)
gentype nextafter (gentype x, gentype y)
gentype pow (gentype x, gentype y)
floatn pown (floatn x, intn y)
float pown (float x, int y)
doublen pown (doublen x, intn y)
double pown (double x, int y)
#XXX we define powr as pow
#gentype powr (gentype x, gentype y)
gentype remainder (gentype x, gentype y)
floatn remquo (floatn x, floatn y, __global intn *quo)
floatn remquo (floatn x, floatn y, __local intn *quo)
floatn remquo (floatn x, floatn y, __private intn *quo)
float remquo (float x, float y, __global int *quo)
float remquo (float x, float y, __local int *quo)
float remquo (float x, float y, __private int *quo)
doublen remquo (doublen x, doublen y, __global intn *quo)
doublen remquo (doublen x, doublen y, __local intn *quo)
doublen remquo (doublen x, doublen y, __private intn *quo)
double remquo (double x, double y, __global int *quo)
double remquo (double x, double y, __local int *quo)
double remquo (double x, double y, __private int *quo)
gentype rint (gentype)
floatn rootn (floatn x, intn y)
doublen rootn (doublen x, intn y)
double rootn (double x, int y)
gentype round (gentype x)
gentype rsqrt (gentype)
gentype sin (gentype)
gentype sincos (gentype x, __global gentype *cosval)
gentype sincos (gentype x, __local gentype *cosval)
gentype sincos (gentype x, __private gentype *cosval)
gentype sinh (gentype)
gentype sinpi (gentype x)
gentype sqrt (gentype)
gentype tan (gentype)
gentype tanh (gentype)
gentype tanpi (gentype x)
gentype tgamma (gentype)
gentype trunc (gentype)
##half_native_math
#gentype half_cos (gentype x)
#gentype half_divide (gentype x, gentype y)
#gentype half_exp (gentype x)
#gentype half_exp2 (gentype x)
#gentype half_exp10 (gentype x)
#gentype half_log (gentype x)
#gentype half_log2 (gentype x)
#gentype half_log10 (gentype x)
#gentype half_powr (gentype x, gentype y)
#gentype half_recip (gentype x)
#gentype half_rsqrt (gentype x)
#gentype half_sin (gentype x)
#gentype half_sqrt (gentype x)
#gentype half_tan (gentype x)
# XXX we already defined all native and non-native
# functions to the same one.
#gentype native_cos (gentype x)
#gentype native_divide (gentype x, gentype y)
#gentype native_exp (gentype x)
#gentype native_exp2 (gentype x)
#gentype native_exp10 (gentype x)
#gentype native_log (gentype x)
#gentype native_log2 (gentype x)
#gentype native_log10 (gentype x)
#gentype native_powr (gentype x, gentype y)
gentype native_recip (gentype x)
#gentype native_rsqrt (gentype x)
#gentype native_sin (gentype x)
#gentype native_sqrt (gentype x)
#gentype native_tan (gentype x)
##integer
ugentype abs (gentype x)
ugentype abs_diff (gentype x, gentype y)
gentype add_sat (gentype x, gentype y)
gentype hadd (gentype x, gentype y)
gentype rhadd (gentype x, gentype y)
gentype clamp (gentype x, gentype minval, gentype maxval)
gentype clamp (gentype x, sgentype minval, sgentype maxval)
gentype clz (gentype x)
gentype mad_hi (gentype a, gentype b, gentype c)
gentype mad_sat (gentype a, gentype b, gentype c)
gentype max (gentype x, gentype y)
gentype max (gentype x, sgentype y)
gentype min (gentype x, gentype y)
gentype min (gentype x, sgentype y)
gentype mul_hi (gentype x, gentype y)
gentype rotate (gentype v, gentype i)
gentype sub_sat (gentype x, gentype y)
shortn upsample (charn hi, ucharn lo)
ushortn upsample (ucharn hi, ucharn lo)
intn upsample (shortn hi, ushortn lo)
uintn upsample (ushortn hi, ushortn lo)
longn upsample (intn hi, uintn lo)
ulongn upsample (uintn hi, uintn lo)
# XXX not implemented
#gentype popcount (gentype x)
##fast_integer
gentype mad24 (gentype x, gentype y, gentype z)
gentype mul24 (gentype x, gentype y)
##common
gentype clamp (gentype x, gentype minval, gentype maxval)
gentypef clamp (gentypef x, float minval, float maxval)
gentyped clamp (gentyped x, double minval, double maxval)
gentype degrees (gentype radians)
gentype max (gentype x, gentype y)
gentypef max (gentypef x, float y)
gentyped max (gentyped x, double y)
gentype min (gentype x, gentype y)
gentypef min (gentypef x, float y)
gentyped min (gentyped x, double y)
gentype mix (gentype x, gentype y, gentype a)
gentypef mix (gentypef x, gentypef y, float a)
gentyped mix (gentyped x, gentyped y, double a)
gentype radians (gentype degrees)
gentype step (gentype edge, gentype x)
gentypef step (float edge, gentypef x)
gentyped step (double edge, gentyped x)
gentype smoothstep (gentype edge0, gentype edge1, gentype x)
gentypef smoothstep (float edge0, float edge1, gentypef x)
gentyped smoothstep (double edge0, double edge1, gentyped x)
gentype sign (gentype x)
##relational
intn isequal (floatn x, floatn y)
longn isequal (doublen x, doublen y)
intn isnotequal (floatn x, floatn y)
longn isnotequal (doublen x, doublen y)
intn isgreater (floatn x, floatn y)
longn isgreater (doublen x, doublen y)
intn isgreaterequal (floatn x, floatn y)
longn isgreaterequal (doublen x, doublen y)
intn isless (floatn x, floatn y)
longn isless (doublen x, doublen y)
intn islessequal (floatn x, floatn y)
longn islessequal (doublen x, doublen y)
# XXX not implemented
intn islessgreater (floatn x, floatn y)
longn islessgreater (doublen x, doublen y)
intn isfinite (floatn)
longn isfinite (doublen)
intn isinf (floatn)
longn isinf (doublen)
intn isnan (floatn)
longn isnan (doublen)
intn isnormal (floatn)
longn isnormal (doublen)
# XXX not implemented
intn isordered (floatn x, floatn y)
longn isordered (doublen x, doublen y)
# XXX not implemented
intn isunordered (floatn x, floatn y)
longn isunordered (doublen x, doublen y)
intn signbit (floatn)
longn signbit (doublen)
int any (igentype x)
int all (igentype x)
# XXX need to revisit select later
#gentype bitselect (gentype a, gentype b, gentype c)
gentype select (gentype a, gentype b, igentype c)
gentype select (gentype a, gentype b, ugentype c)
##misc
#gentypen shuffle (gentypem x, ugentypen mask)
#gentypen shuffle2 (gentypem x, gentypem y, ugentypen mask)
Release_v0.3/backend/src/gbe_bin_generater.cpp 0000664 0000000 0000000 00000022620 12231421770 0021510 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2013 Intel Corporation
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the
* Free Software Foundation; either version 2 of the License, or (at your
* option) any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library. If not, see .
*
*/
/*******************************************************************************
This file is used to generate the gbe kernel binaries. These binaries may be
used by the CL API, for example by the enqueue memory functions. We generate
the binaries at build time to improve performance.
*******************************************************************************/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "backend/program.h"
#include "backend/program.hpp"
using namespace std;
#define FILE_NOT_FIND_ERR 1
#define FILE_MAP_ERR 2
#define FILE_BUILD_FAILED 3
#define FILE_SERIALIZATION_FAILED 4
class program_build_instance {
protected:
string prog_path;
string build_opt;
static string bin_path;
static bool str_fmt_out;
int fd;
int file_len;
const char* code;
gbe::Program* gbe_prog;
public:
program_build_instance (void) : fd(-1), file_len(0), code(NULL), gbe_prog(NULL) { }
explicit program_build_instance (const char* file_path, const char* option = NULL)
: prog_path(file_path), build_opt(option), fd(-1), file_len(0),
code(NULL), gbe_prog(NULL) { }
~program_build_instance () {
if (code) {
munmap((void *)(code), file_len);
code = NULL;
}
if (fd >= 0)
close(fd);
if (gbe_prog)
gbe_program_delete(reinterpret_cast(gbe_prog));
}
program_build_instance(program_build_instance&& other) = default;
#if 0
{
#define SWAP(ELT) \
do { \
auto elt = this->ELT; \
this->ELT = other.ELT; \
other.ELT = elt; \
} while(0)
SWAP(fd);
SWAP(code);
SWAP(file_len);
SWAP(prog_path);
SWAP(build_opt);
#undef SWAP
}
#endif
explicit program_build_instance(const program_build_instance& other) = delete;
program_build_instance& operator= (const program_build_instance& other) {
/* we do not want to be Lvalue copied, but operator is needed to instance the
template of vector. */
assert(1);
return *this;
}
const char* file_map_open (void) throw (int);
const char* get_code (void) {
return code;
}
const string& get_program_path (void) {
return prog_path;
}
int get_size (void) {
return file_len;
}
void print_file (void) {
cout << code << endl;
}
void dump (void) {
cout << "program path: " << prog_path << endl;
cout << "Build option: " << build_opt << endl;
print_file();
}
static void set_str_fmt_out (bool flag) {
str_fmt_out = flag;
}
static int set_bin_path (const char* path) {
if (bin_path.size())
return 0;
bin_path = path;
return 1;
}
void build_program(void) throw(int);
void serialize_program(void) throw(int);
};
string program_build_instance::bin_path;
bool program_build_instance::str_fmt_out = false;
void program_build_instance::serialize_program(void) throw(int)
{
ofstream ofs;
ostringstream oss;
size_t sz;
ofs.open(bin_path, ofstream::out | ofstream::app | ofstream::binary);
if (str_fmt_out) {
string array_name = "Unkown_name_array";
unsigned long last_slash = bin_path.rfind("/");
unsigned long last_dot = bin_path.rfind(".");
if (last_slash != string::npos && last_dot != string::npos)
array_name = bin_path.substr(last_slash + 1, last_dot - 1 - last_slash);
ofs << "char " << array_name << "[] = {" << "\n";
sz = gbe_prog->serializeToBin(oss);
for (size_t i = 0; i < sz; i++) {
unsigned char c = oss.str().c_str()[i];
char asic_str[9];
sprintf(asic_str, "%2.2x", c);
ofs << "0x";
ofs << asic_str << ((i == sz - 1) ? "" : ", ");
}
ofs << "};\n";
string array_size = array_name + "_size";
ofs << "int " << array_size << " = " << sz << ";" << "\n";
} else {
sz = gbe_prog->serializeToBin(ofs);
}
ofs.close();
if (!sz) {
throw FILE_SERIALIZATION_FAILED;
}
}
void program_build_instance::build_program(void) throw(int)
{
gbe_program opaque = gbe_program_new_from_source(code, 0, build_opt.c_str(), NULL, NULL);
if (!opaque)
throw FILE_BUILD_FAILED;
gbe_prog = reinterpret_cast(opaque);
assert(gbe_program_get_kernel_num(opaque));
}
/* Open prog_path read-only and map its whole content into memory.
 *
 * Returns the start of the mapping (also stored in `code`).
 * Throws FILE_NOT_FIND_ERR if the file cannot be opened and FILE_MAP_ERR if
 * its size cannot be determined, it is empty, or the mapping fails.
 *
 * NOTE(review): the mapping is not NUL-terminated by construction, while
 * build_program() passes `code` with a length of 0 -- confirm how
 * gbe_program_new_from_source() finds the end of the source. */
const char* program_build_instance::file_map_open(void) throw(int)
{
  void * address;

  /* Open the file */
  fd = ::open(prog_path.c_str(), O_RDONLY);
  if (fd < 0) {
    throw FILE_NOT_FIND_ERR;
  }

  /* Determine the size; a failed seek (-1) or an empty file cannot be mapped. */
  file_len = lseek(fd, 0, SEEK_END);
  if (file_len <= 0 || lseek(fd, 0, SEEK_SET) < 0) {
    file_len = 0;
    throw FILE_MAP_ERR;
  }

  /* Map it.  mmap() reports failure with MAP_FAILED, not NULL. */
  address = mmap(0, file_len, PROT_READ, MAP_SHARED, fd, 0);
  if (address == MAP_FAILED) {
    throw FILE_MAP_ERR;
  }

  code = reinterpret_cast<const char*>(address);
  return code;
}
/* NOTE(review): the element types of the containers below (prog_vector,
   argv_saved, used_index) are not visible in this text -- confirm them
   against the original source. */
typedef vector prog_vector;
/* Command line driver.
 *
 * Every kernel file named on the command line is mapped, built and serialized
 * into the output file given with -o.  Options handled by getopt:
 *   -p <opts>  build options for the file name that directly precedes it
 *   -o <path>  output file, may be given only once
 *   -s         write the output as a C array instead of raw bytes
 *
 * Returns 0 on success or when only the usage text is printed, 1 on a command
 * line error and -1 when a file fails to map, build or serialize. */
int main (int argc, const char **argv)
{
prog_vector prog_insts;
vector argv_saved;
const char* build_opt;
const char* file_path;
int i;
int oc;
/* used_index[i] != 0 marks argv_saved[i] as an option argument, so the second
   pass below does not mistake it for a kernel file name. */
deque used_index;
if (argc < 2) {
cout << "Usage: kernel_path [-pbuild_parameter]\n[-obin_path]" << endl;
return 0;
}
used_index.assign(argc, 0);
/* because getopt will re-sort the argv, so we save here. */
for (i=0; i< argc; i++) {
argv_saved.push_back(string(argv[i]));
}
/* First pass: options.  Files that carry a -p option are registered here. */
while ( (oc = getopt(argc, (char * const *)argv, "o:p:s")) != -1 ) {
switch (oc) {
case 'p':
{
/* opt_index is the position of the "-p" element itself; where it is depends
   on whether the argument was attached ("-pXXX") or separate ("-p XXX"). */
int opt_index;
if (argv[optind-1][0] == '-') {// -pXXX like
opt_index = optind - 1;
} else { // Must be -p XXXX mode
opt_index = optind - 2;
used_index[opt_index + 1] = 1;
}
/* opt must follow the file name.*/
if ((opt_index < 2 ) || argv[opt_index-1][0] == '-') {
cout << "Usage note: Building option must follow file name" << endl;
return 1;
}
file_path = argv[opt_index - 1];
build_opt = optarg;
prog_insts.push_back(program_build_instance(file_path, build_opt));
break;
}
case 'o':
/* set_bin_path() returns 0 when a path has already been stored. */
if (!program_build_instance::set_bin_path(optarg)) {
cout << "Can not specify the bin path more than once." << endl;
return 1;
}
used_index[optind-1] = 1;
break;
case 's':
program_build_instance::set_str_fmt_out(true);
used_index[optind-1] = 1;
break;
/* NOTE(review): the option string does not start with ':', so getopt is
   expected to report a missing argument as '?' rather than ':' -- confirm
   whether this case is reachable. */
case ':':
cout << "Miss the file option argument" << endl;
return 1;
default:
/* An unknown option is reported but does not stop the run. */
cout << "Unknown opt" << endl;
}
}
/* Second pass over the saved argv: every element that neither starts with '-'
   nor was marked as an option argument is a kernel file.  Files not already
   registered by a -p option are added with empty build options. */
for (i=1; i < argc; i++) {
//cout << argv_saved[i] << endl;
if (argv_saved[i].size() && argv_saved[i][0] != '-') {
if (used_index[i])
continue;
string file_name = argv_saved[i];
prog_vector::iterator result = find_if(prog_insts.begin(), prog_insts.end(),
[&](program_build_instance & prog_inst)-> bool {
bool result = false;
if (prog_inst.get_program_path() == file_name)
result = true;
return result;
});
if (result == prog_insts.end()) {
prog_insts.push_back(program_build_instance(file_name.c_str(), ""));
}
}
}
/* Process every registered file; the first failure aborts the whole run. */
for (auto& inst : prog_insts) {
try {
inst.file_map_open();
inst.build_program();
inst.serialize_program();
}
catch (int & err_no) {
if (err_no == FILE_NOT_FIND_ERR) {
cout << "can not open the file " <<
inst.get_program_path() << endl;
} else if (err_no == FILE_MAP_ERR) {
cout << "map the file " <<
inst.get_program_path() << " failed" << endl;
} else if (err_no == FILE_BUILD_FAILED) {
cout << "build the file " <<
inst.get_program_path() << " failed" << endl;
} else if (err_no == FILE_SERIALIZATION_FAILED) {
cout << "Serialize the file " <<
inst.get_program_path() << " failed" << endl;
}
return -1;
}
}
//for (auto& inst : prog_insts) {
// inst.dump();
//}
return 0;
}
Release_v0.3/backend/src/gen_as.sh 0000775 0000000 0000000 00000007356 12231421770 0017167 0 ustar 00root root 0000000 0000000 #! /bin/sh -e
. ./genconfig.sh

# vector_type_name BASE LENGTH
# Print the type name for LENGTH components of BASE: the bare base type for
# length 1, otherwise the base type followed by the length (e.g. int4).
vector_type_name() {
    if [ "$2" -eq 1 ]; then
        printf '%s\n' "$1"
    else
        printf '%s\n' "$1$2"
    fi
}

# Collect the size of every (type, vector length) combination, then reduce
# the list to its distinct values in ascending numeric order.
for entry in $TYPES; do
    elem_size=${entry##*:}
    for len in $VECTOR_LENGTHS; do
        union_sizes="$union_sizes $((len * elem_size))"
    done
done
union_sizes=$(printf '%s\n' $union_sizes | sort -n | uniq)

for union_size in $union_sizes; do
    unionname="union _type_cast_${union_size}_b"

    # All vector types of exactly this size, as "base:vectortype" pairs in
    # TYPES x VECTOR_LENGTHS order.
    members=
    for entry in $TYPES; do
        base=${entry%%:*}
        elem_size=${entry##*:}
        for len in $VECTOR_LENGTHS; do
            if [ "$((len * elem_size))" -eq "$union_size" ]; then
                members="$members $base:$(vector_type_name "$base" "$len")"
            fi
        done
    done

    # The union holding one member per vector type of this size.
    echo "$unionname {"
    for member in $members; do
        vectortype=${member#*:}
        echo " $vectortype _$vectortype;"
    done
    echo "};"
    echo

    # One as_<to>(<from>) function per ordered pair of members with different
    # base types; it reinterprets the bits through the union.
    for from in $members; do
        fvectortype=${from#*:}
        for to in $members; do
            if [ "${from%%:*}" = "${to%%:*}" ]; then
                continue
            fi
            tvectortype=${to#*:}
            printf '%s\n' \
                "INLINE OVERLOADABLE $tvectortype as_$tvectortype($fvectortype v) {" \
                " $unionname u;" \
                " u._$fvectortype = v;" \
                " return u._$tvectortype;" \
                "}" \
                ""
        done
    done
done
Release_v0.3/backend/src/gen_builtin_vector.py 0000775 0000000 0000000 00000031354 12231421770 0021625 0 ustar 00root root 0000000 0000000 #!/usr/bin/env python
#
# Copyright (C) 2012 Intel Corporation
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library. If not, see .
#
# Author: Zhigang Gong
#/
# This file is to generate inline code to lower down those builtin
# vector functions to scalar functions.
import re
import sys
import os
# The script needs exactly two arguments: the spec file and the output file.
# Report the problem on stderr and exit with a failure status; a bare `raise`
# has no active exception to re-raise here.
if len(sys.argv) != 3:
  sys.stderr.write("Invalid argument {}\n".format(sys.argv))
  sys.stderr.write("use {} spec_file_name output_file_name\n".format(sys.argv[0]))
  sys.exit(1)
all_vector = 1,2,3,4,8,16

# generate generic type sets
def gen_vector_type(type_set, vector_set = all_vector):
  """Return every (base_type, width) pair, grouped by base type.

  type_set   -- iterable of base type names
  vector_set -- iterable of vector widths, all_vector by default
  """
  return [(base, width) for base in type_set for width in vector_set]
def set_vector_memspace(vector_type_set, memspace):
  """Tag every (type, width) entry with a memory space.

  With an empty memspace the input sequence itself is returned; otherwise a
  new list of (type, width, memspace) tuples is built.
  """
  if memspace == '':
    return vector_type_set
  return [(entry[0], entry[1], memspace) for entry in vector_type_set]
# A type tuple with three elements is a pointer: the third element holds the
# memory space it points into.
def isPointer(t):
  """Return True when the type tuple carries a memory space entry."""
  element_count = len(t)
  return element_count == 3
# Base type names, grouped by signedness and kind.
all_itype = "char","short","int","long"
all_utype = "uchar","ushort","uint","ulong"
all_int_type = all_itype + all_utype
all_float_type = "float","double"
all_type = all_int_type + all_float_type
# all vector/scalar types
# For every base type T two module-level names are created with exec:
#   Tn = ["Tn", <(T, width) for every width in all_vector>]
#   sT = ["T",  <(T, 1)>]
# e.g. intn / sint, ucharn / suchar.
for t in all_type:
exec "{0}n = [\"{0}n\", gen_vector_type([\"{0}\"])]".format(t)
exec "s{0} = [\"{0}\", gen_vector_type([\"{0}\"], [1])]".format(t)
# Predefined type sets according to the Open CL spec.
# Each set is a two element list: the name used as key in type_dict and the
# list of (type, width) tuples it expands to.  The "<section>_" part of the
# name matches the prefix that _prefix() puts in front of gentype names.
math_gentype = ["math_gentype", gen_vector_type(all_float_type)]
math_gentypef = ["math_gentypef", gen_vector_type(["float"])]
math_gentyped = ["math_gentyped", gen_vector_type(["double"])]
half_native_math_gentype = ["half_native_math_gentype", gen_vector_type(["float"])]
integer_gentype = ["integer_gentype", gen_vector_type(all_int_type)]
integer_ugentype = ["integer_ugentype", gen_vector_type(all_utype)]
# sgentype: width 1 only, for every integer base type.
integer_sgentype = ["integer_sgentype", gen_vector_type(all_int_type, [1])]
fast_integer_gentype = ["fast_integer_gentype", gen_vector_type(["uint", "int"])]
common_gentype = ["common_gentype", gen_vector_type(all_float_type)]
common_gentypef = ["common_gentypef", gen_vector_type(["float"])]
common_gentyped = ["common_gentyped", gen_vector_type(["double"])]
relational_gentype = ["relational_gentype", gen_vector_type(all_type)]
relational_igentype = ["relational_igentype", gen_vector_type(all_itype)]
relational_ugentype = ["relational_ugentype", gen_vector_type(all_utype)]
# The misc sets leave out widths 1 and 3.
misc_gentypem = ["misc_gentypem", gen_vector_type(all_type, [2, 4, 8, 16])]
misc_gentypen = ["misc_gentypen", gen_vector_type(all_type, [2, 4, 8, 16])]
misc_ugentypem = ["misc_ugentypem", gen_vector_type(all_utype, [2, 4, 8, 16])]
misc_ugentypen = ["misc_ugentypen", gen_vector_type(all_utype, [2, 4, 8, 16])]
# Every set that may appear in the spec file, including the names created by
# the exec loop above.
all_predefined_type = math_gentype, math_gentypef, math_gentyped, \
half_native_math_gentype, integer_gentype,integer_sgentype,\
integer_ugentype, charn, ucharn, shortn, ushortn, intn, \
uintn, longn, ulongn, floatn, doublen, \
fast_integer_gentype, common_gentype, common_gentypef, \
common_gentyped, relational_gentype, relational_igentype, \
relational_ugentype, schar, suchar, sshort, sint, suint, \
slong, sulong, sfloat, sdouble, misc_gentypem, \
misc_ugentypem, misc_gentypen, misc_ugentypen
# type dictionary contains all the predefined type sets.
# Maps the set name to its list of (type, width) tuples.
type_dict = {}
for t in all_predefined_type:
type_dict.update({t[0]:t[1]})
def _prefix(prefix, dtype):
if dtype.count("gentype") != 0:
return prefix + '_' + dtype
return dtype
memspaces = ["__local ", "__private ", "__global "]
def stripMemSpace(t):
  """Split a leading memory space qualifier off a type string.

  Returns (memspace, rest); memspace is '' and rest is the unchanged input
  when the string does not start with one of the known qualifiers.
  """
  for qualifier in memspaces:
    if t.startswith(qualifier):
      return qualifier, t[len(qualifier):]
  return '', t
def check_type(types):
  """Verify that every type string names a predefined type set.

  A leading memory space qualifier is ignored.  Raises ValueError naming the
  first unknown type; raising a plain string is not a valid exception.
  """
  for type_str in types:
    _, bare_type = stripMemSpace(type_str)
    if bare_type not in type_dict:
      raise ValueError("found invalid type: {}".format(bare_type))
def match_unsigned(dtype):
  """Return the unsigned integer type of the same width as dtype.

  float maps to uint and double to ulong; a type that is already unsigned is
  returned as the same object; otherwise 'u' is put in front of the name.
  """
  base, width = dtype[0], dtype[1]
  from_float = {'float': "uint", 'double': "ulong"}
  if base in from_float:
    return [from_float[base], width]
  if base[0] == 'u':
    return dtype
  return ['u' + base, width]
def match_signed(dtype):
  """Return the signed integer type of the same width as dtype.

  float maps to int and double to long; a type that is already signed is
  returned as the same object; otherwise the leading 'u' is dropped.
  """
  base, width = dtype[0], dtype[1]
  from_float = {'float': "int", 'double': "long"}
  if base in from_float:
    return [from_float[base], width]
  if base[0] != 'u':
    return dtype
  return [base[1:], width]
def match_scalar(dtype):
  """Return the width-1 (scalar) variant of dtype as a [type, 1] list."""
  base = dtype[0]
  return [base, 1]
# The dstType is the expected type, srcType is
# the reference type. Sometimes, the dstType and
# srcType are different. We need to fix this issue
# and return correct dst type.
def fixup_type(dstType, srcType, n):
  """Pick the entry of dstType that goes with entry n of srcType.

  dstType, srcType -- sequences of type tuples (expanded type sets)
  n                -- index of the overload being generated

  Raises ValueError when the two sets cannot be matched; raising a plain
  string is not a valid exception.
  """
  if dstType == srcType:
    return dstType[n]

  # scalar dst type
  if len(dstType) == 1:
    return dstType[0]
  # dst is not scalar but src is scalar
  if len(srcType) == 1:
    return dstType[n]

  # Derive the dst entry from the src entry when one set is the scalar,
  # signed or unsigned companion of the other.
  if dstType == integer_sgentype[1] and srcType == integer_gentype[1]:
    return match_scalar(srcType[n])
  if dstType == integer_gentype[1] and \
     (srcType == integer_sgentype[1] or \
      srcType == integer_ugentype[1]):
    return dstType[n]
  if dstType == integer_ugentype[1] and srcType == integer_gentype[1]:
    return match_unsigned(srcType[n])
  if dstType == relational_igentype[1] and srcType == relational_gentype[1]:
    return match_signed(srcType[n])
  if dstType == relational_ugentype[1] and srcType == relational_gentype[1]:
    return match_unsigned(srcType[n])
  if dstType == relational_gentype[1] and \
     (srcType == relational_igentype[1] or \
      srcType == relational_ugentype[1]):
    return dstType[n]

  # Unrelated sets of equal length are matched entry by entry.
  if len(dstType) == len(srcType):
    return dstType[n]

  raise ValueError("type mismatch: {} vs {}".format(dstType, srcType))
# Parses one prototype line of the spec file and generates the vector
# overloads that forward to the scalar builtin, one call per component.
#
# NOTE(review): the indentation of this class is not visible in this text, so
# the comments below describe each method as a whole and avoid claims about
# how statements nest -- confirm against the original file.
# NOTE(review): two statements below raise a plain string.  Python >= 2.6
# rejects that with a TypeError, so the message is never shown as written.
class builtinProto():
# Class-level defaults; init() replaces them on the instance for every line.
valueTypeStr = ""
functionName = ""
paramTypeStrs = []
paramCount = 0
outputStr = []
prefix = ""
# Reset the per-prototype state.  The output buffer starts with sectionHeader
# when one is given; the stored prefix is replaced only by a non-empty
# sectionPrefix.
def init(self, sectionHeader, sectionPrefix):
self.valueTypeStr = ""
self.functionName = ""
self.paramTypeStrs = []
self.paramCount = 0
if sectionHeader != "":
self.outputStr = [sectionHeader]
else:
self.outputStr = []
if sectionPrefix != "":
self.prefix = sectionPrefix
self.indent = 0
# Store a finished output line and return nextInit, which callers use as the
# beginning of the next line they build.
def append(self, line, nextInit = ""):
self.outputStr.append(line);
return nextInit;
# Return a string of self.indent spaces.
def indentSpace(self):
ret = ""
for i in range(self.indent):
ret += ' '
return ret
# Parse one prototype line "<ret type> <name> (<type> [<param>], ...)".
# The raw line is first stored as a '//' comment.  The line is split on commas,
# spaces and '(' after trailing ')' and newline characters are removed; the
# first token is the return type, the second the function name.  Memory space
# qualifiers (__local, __private, __global) are joined with the type token
# that follows them.  Generic type names get the section prefix via _prefix().
def init_from_line(self, t):
self.append('//{}'.format(t))
line = filter(None, re.split(',| |\(', t.rstrip(')\n')))
self.paramCount = 0
# number of memory space tokens seen so far; subtracted from the token index
# so that the positions of the remaining tokens stay aligned
stripped = 0
memSpace = ''
for i, text in enumerate(line):
idx = i - stripped
if idx == 0:
self.valueTypeStr = _prefix(self.prefix, line[i])
continue
if idx == 1:
self.functionName = line[i];
continue
if idx % 2 == 0:
if line[i][0] == '(':
tmpType = line[i][1:]
else:
tmpType = line[i]
if tmpType == '__local' or \
tmpType == '__private' or \
tmpType == '__global':
memSpace = tmpType + ' '
stripped += 1
continue
self.paramTypeStrs.append(memSpace + _prefix(self.prefix, tmpType))
memSpace = ''
self.paramCount += 1
# Generate the overload for entry i of the expanded type sets.
#   vtypeSeq  -- expanded type set of the return value
#   ptypeSeqs -- one expanded type set per parameter
# Builds the signature "INLINE_OVERLOADABLE <type><width> <name> (...)" and a
# body that returns a vector made of one scalar call per component.  Returns
# early, without a value, when a type name contains 'double' or when the
# return width is 1.
def gen_proto_str_1(self, vtypeSeq, ptypeSeqs, i):
for n in range(0, self.paramCount):
ptype = fixup_type(ptypeSeqs[n], vtypeSeq, i);
vtype = fixup_type(vtypeSeq, ptypeSeqs[n], i);
# XXX FIXME now skip all double vector, as we don't
# defined those scalar version's prototype.
if ptype[0].find('double') != -1 or \
vtype[0].find('double') != -1:
return
if (n == 0):
formatStr = 'INLINE_OVERLOADABLE {}{} {} ('.format(vtype[0], vtype[1], self.functionName)
else:
formatStr += ', '
if vtype[1] == 1:
return
if isPointer(ptype):
formatStr += ptype[2]
pointerStr = '*'
else:
pointerStr = ''
if ptype[1] != 1:
formatStr += '{}{} {}param{}'.format(ptype[0], ptype[1], pointerStr, n)
else:
formatStr += '{} {}param{}'.format(ptype[0], pointerStr, n)
formatStr += ')'
# The signature line is stored; formatStr continues as the start of the body.
formatStr = self.append(formatStr, '{{return ({}{})('.format(vtype[0], vtype[1]))
# Continuation lines of the body are indented by the length of that start.
self.indent = len(formatStr)
# One scalar call per component j of the result vector.
for j in range(0, vtype[1]):
if (j != 0):
formatStr += ','
if (j + 1) % 2 == 0:
formatStr += ' '
if j % 2 == 0:
formatStr = self.append(formatStr, self.indentSpace())
formatStr += '{}('.format(self.functionName)
for n in range(0, self.paramCount):
if n != 0:
formatStr += ', '
ptype = fixup_type(ptypeSeqs[n], vtypeSeq, i)
vtype = fixup_type(vtypeSeq, ptypeSeqs[n], i)
# A parameter whose width differs from the result must be a scalar; it is
# passed unchanged (prefixed with '&' for a pointer) to every call.
if vtype[1] != ptype[1]:
if ptype[1] != 1:
raise "parameter is not a scalar but has different width with result value."
if isPointer(ptype):
formatStr += '&'
formatStr += 'param{}'.format(n)
continue
# Pointer parameters are cast to a scalar pointer and offset by j; other
# parameters pass component j, written as a hexadecimal '.s' suffix.
if (isPointer(ptype)):
formatStr += '({} {} *)param{} + {:2d}'.format(ptype[2], ptype[0], n, j)
else:
# For the third argument of select only the most significant bit of the
# component is passed on.
if (self.functionName == 'select' and n == 2):
formatStr += '({0})(param{1}.s{2:x} & (({0})1 << (sizeof({0})*8 - 1)))'.format(ptype[0], n, j)
else:
formatStr += 'param{}.s{:x}'.format(n, j)
formatStr += ')'
formatStr += '); }\n'
self.append(formatStr)
return formatStr
# NOTE(review): output is defined twice; the second definition replaces the
# first, so only output(outFile) exists on the class.
def output(self):
for line in self.outputStr:
print line
# Write every buffered line, followed by a newline, to outFile.
def output(self, outFile):
for line in self.outputStr:
outFile.write('{}\n'.format(line))
# Validate the parsed types, expand them through type_dict and call
# gen_proto_str_1() for every index up to the length of the longest set.
def gen_proto_str(self):
check_type([self.valueTypeStr] + self.paramTypeStrs)
vtypeSeq = type_dict[self.valueTypeStr]
ptypeSeqs = []
count = len(vtypeSeq);
for t in self.paramTypeStrs:
memspace,t = stripMemSpace(t)
ptypeSeqs.append(set_vector_memspace(type_dict[t], memspace))
count = max(count, len(type_dict[t]))
for i in range(count):
formatStr = self.gen_proto_str_1(vtypeSeq, ptypeSeqs, i)
self.append("")
def safeUnlink(filename):
  """Delete filename; an OSError (e.g. the file does not exist) is ignored."""
  try:
    os.unlink(filename)
  except OSError:
    return
# save the prototypes into ocl_vector.h
# The header is written to "<output>.tmp" first and renamed at the very end,
# so a failed run leaves no partial header under the final name.
specFile = open(sys.argv[1], 'r')
headerFileName = sys.argv[2]
tempHeaderFileName = sys.argv[2] + '.tmp'
safeUnlink(headerFileName)
tempHeader = open(tempHeaderFileName, 'w')
tempHeader.write("//This file is autogenerated by {}.\n".format(sys.argv[0]))
tempHeader.write("//Don't modify it manually.\n")

functionProto = builtinProto()
# Defined up front so a prototype that precedes the first "##" section line
# does not fail with a NameError.
sectionHeader = ""
sectionPrefix = ""
try:
  for line in specFile:
    if line.isspace():
      continue
    if line[0] == '#':
      # "##<section> ..." starts a new section; any other '#' line is a comment.
      if line.startswith('##'):
        sectionHeader = "//{} builtin functions".format(line[2:].rstrip())
        sectionPrefix = (line[2:].split())[0]
      continue
    functionProto.init(sectionHeader, sectionPrefix)
    # The header is written once, in front of the first function of a section.
    # The prefix stays in effect for every function of the section.
    sectionHeader = ""
    functionProto.init_from_line(line)
    functionProto.gen_proto_str()
    functionProto.output(tempHeader)
finally:
  specFile.close()
  tempHeader.close()
os.rename(tempHeaderFileName, headerFileName)
Release_v0.3/backend/src/gen_convert.sh 0000775 0000000 0000000 00000016511 12231421770 0020235 0 ustar 00root root 0000000 0000000 #! /bin/sh -e
. ./genconfig.sh

# cast_components COUNT BASETYPE
# Print "(BASETYPE)(v.s0), (BASETYPE)(v.s1), ..." for the first COUNT vector
# components.  COUNT is one of the configured vector lengths 2, 3, 4, 8, 16.
cast_components() {
    cc_list=
    cc_done=0
    for cc_comp in s0 s1 s2 s3 s4 s5 s6 s7 s8 s9 sA sB sC sD sE sF; do
        if [ "$cc_done" -ge "$1" ]; then
            break
        fi
        cc_list="${cc_list:+$cc_list, }($2)(v.$cc_comp)"
        cc_done=$((cc_done + 1))
    done
    printf '%s\n' "$cc_list"
}

# For all vector lengths and types, generate conversion functions
for vector_length in $VECTOR_LENGTHS; do
    for ftype in $TYPES; do
        fbasetype=${ftype%%:*}
        for ttype in $TYPES; do
            tbasetype=${ttype%%:*}

            # Scalars: a plain cast; nothing is emitted for identical types.
            if [ "$vector_length" -eq 1 ]; then
                if [ "$fbasetype" != "$tbasetype" ]; then
                    printf '%s\n' \
                        "INLINE OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v) {" \
                        " return ($tbasetype)v;" \
                        "}" \
                        ""
                fi
                continue
            fi

            fvectortype=$fbasetype$vector_length
            tvectortype=$tbasetype$vector_length

            # Vectors of identical type: the conversion is the identity.
            if [ "$fbasetype" = "$tbasetype" ]; then
                echo "INLINE OVERLOADABLE $tvectortype convert_$tvectortype($fvectortype v) { return v; }"
                continue
            fi

            # Vectors of different type: cast component by component.
            construct=$(cast_components "$vector_length" "$tbasetype")
            printf '%s\n' \
                "INLINE OVERLOADABLE $tvectortype convert_$tvectortype($fvectortype v) {" \
                " return ($tvectortype)($construct);" \
                "}" \
                ""
        done
    done
done
# Emit the scalar saturating conversions convert_<dst>_sat(<src>).
# Everything between the single quotes is OpenCL C written to stdout unchanged.
# It consists of DEF macro groups, each followed by #undef DEF:
#   1. DEF(dst, src): prototype only, no body is generated here.
#   2. DEF(dst, src, MIN, MAX): clamp x to [MIN, MAX].
#   3. DEF(dst, src, MAX): clamp x to at most MAX (ulong sources).
#   4. hand-written long <-> ulong conversions.
#   5. DEF(dst, src): return x unchanged.
echo '
#define DEF(DSTTYPE, SRCTYPE) \
OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);
DEF(char, uchar);
DEF(char, short);
DEF(char, ushort);
DEF(char, int);
DEF(char, uint);
DEF(char, float);
DEF(uchar, char);
DEF(uchar, short);
DEF(uchar, ushort);
DEF(uchar, int);
DEF(uchar, uint);
DEF(uchar, float);
DEF(short, ushort);
DEF(short, int);
DEF(short, uint);
DEF(short, float);
DEF(ushort, short);
DEF(ushort, int);
DEF(ushort, uint);
DEF(ushort, float);
DEF(int, uint);
DEF(int, float);
DEF(uint, int);
DEF(uint, float);
#undef DEF
#define DEF(DSTTYPE, SRCTYPE, MIN, MAX) \
INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
return x > MAX ? (DSTTYPE)MAX : x < MIN ? (DSTTYPE)MIN : x; \
}
DEF(char, long, -128, 127);
DEF(uchar, long, 0, 255);
DEF(short, long, -32768, 32767);
DEF(ushort, long, 0, 65535);
DEF(int, long, -0x7fffffff-1, 0x7fffffff);
DEF(uint, long, 0, 0xffffffffu);
DEF(long, float, -9.223372036854776e+18f, 9.223372036854776e+18f);
DEF(ulong, float, 0, 1.8446744073709552e+19f);
#undef DEF
#define DEF(DSTTYPE, SRCTYPE, MAX) \
INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
return x > MAX ? (DSTTYPE)MAX : x; \
}
DEF(char, ulong, 127);
DEF(uchar, ulong, 255);
DEF(short, ulong, 32767);
DEF(ushort, ulong, 65535);
DEF(int, ulong, 0x7fffffff);
DEF(uint, ulong, 0xffffffffu);
#undef DEF
INLINE_OVERLOADABLE long convert_long_sat(ulong x) {
ulong MAX = 0x7ffffffffffffffful;
return x > MAX ? MAX : x;
}
INLINE_OVERLOADABLE ulong convert_ulong_sat(long x) {
return x < 0 ? 0 : x;
}
#define DEF(DSTTYPE, SRCTYPE) \
INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
return x; \
}
DEF(char, char);
DEF(uchar, uchar);
DEF(short, char);
DEF(short, uchar);
DEF(short, short);
DEF(ushort, char);
DEF(ushort, uchar);
DEF(ushort, ushort);
DEF(int, char);
DEF(int, uchar);
DEF(int, short);
DEF(int, ushort);
DEF(int, int);
DEF(uint, char);
DEF(uint, uchar);
DEF(uint, short);
DEF(uint, ushort);
DEF(uint, uint);
DEF(long, char);
DEF(long, uchar);
DEF(long, short);
DEF(long, ushort);
DEF(long, int);
DEF(long, uint);
DEF(long, long);
DEF(ulong, char);
DEF(ulong, uchar);
DEF(ulong, short);
DEF(ulong, ushort);
DEF(ulong, int);
DEF(ulong, uint);
DEF(ulong, ulong);
#undef DEF
'
# Emit the vector overloads of convert_<DSTTYPE>_sat.
# For every vector length other than 1 and every (source, destination) pair of
# base types, print an OpenCL function that runs the scalar
# convert_<DSTTYPE>_sat on each component and rebuilds the destination vector.
# double is never used as a source; double and float are never destinations.
# Reads TYPES ("name:size" entries) and VECTOR_LENGTHS.
for vector_length in $VECTOR_LENGTHS; do
  if test "$vector_length" -eq 1; then continue; fi

  # Component selectors converted for this vector length
  lanes="s0"
  if test "$vector_length" -gt 1; then lanes="$lanes s1"; fi
  if test "$vector_length" -gt 2; then lanes="$lanes s2"; fi
  if test "$vector_length" -gt 3; then lanes="$lanes s3"; fi
  if test "$vector_length" -gt 4; then lanes="$lanes s4 s5 s6 s7"; fi
  if test "$vector_length" -gt 8; then lanes="$lanes s8 s9 sA sB sC sD sE sF"; fi

  for ftype in $TYPES; do
    # Keep only the type name found in front of the ":size" suffix
    fbasetype=${ftype%%:*}
    case "$fbasetype" in
      double) continue ;;
    esac

    for ttype in $TYPES; do
      tbasetype=${ttype%%:*}
      case "$tbasetype" in
        double|float) continue ;;
      esac

      fvectortype=$fbasetype$vector_length
      tvectortype=$tbasetype$vector_length
      conv="convert_${tbasetype}_sat"

      # Comma-separated list of per-component scalar conversions
      construct=
      for lane in $lanes; do
        construct="${construct:+$construct, }$conv(v.$lane)"
      done

      printf '%s\n' \
        "INLINE OVERLOADABLE $tvectortype convert_${tvectortype}_sat($fvectortype v) {" \
        " return ($tvectortype)($construct);" \
        "}" \
        ""
    done
  done
done
Release_v0.3/backend/src/genconfig.sh 0000664 0000000 0000000 00000000442 12231421770 0017654 0 ustar 00root root 0000000 0000000 #! /bin/false
# This is to be sourced by the generation scripts
# Supported base types and their lengths
# Each entry has the form "name:length"; the generation scripts split an entry
# on ":" and use the part before it as the base type name.
# NOTE(review): the length looks like a size in bytes (long:8, int:4) - confirm.
TYPES="long:8 ulong:8 int:4 uint:4 short:2 ushort:2 char:1 uchar:1 double:8 float:4"
# Supported vector lengths
# Space-separated list; a length of 1 stands for the scalar case, which the
# vector code generators skip.
VECTOR_LENGTHS="1 2 3 4 8 16"
## No user serviceable parts below here
Release_v0.3/backend/src/ir/ 0000775 0000000 0000000 00000000000 12231421770 0015773 5 ustar 00root root 0000000 0000000 Release_v0.3/backend/src/ir/constant.cpp 0000664 0000000 0000000 00000010074 12231421770 0020332 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file constant.cpp
*
* \author Benjamin Segovia
*/
#include "constant.hpp"
namespace gbe {
namespace ir {
void ConstantSet::append(const char *data,
const std::string &name,
uint32_t size,
uint32_t alignment)
{
const uint32_t offset = ALIGN(this->data.size(), alignment);
const uint32_t padding = offset - this->data.size();
const Constant constant(name, size, alignment, offset);
constants.push_back(constant);
for (uint32_t i = 0; i < padding; ++i) this->data.push_back(0);
for (uint32_t i = 0; i < size; ++i) this->data.push_back(data[i]);
}
/* Shorthands used by the (de)serialization routines below. They forward to
 * SERIALIZE_OUT / DESERIALIZE_IN and therefore expect the enclosing function
 * to have a stream called `outs` (resp. `ins`) and a byte counter called
 * `ret_size` (resp. `total_size`) in scope.
 */
#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
/*! Write the constant set to `outs`.
 *  Layout: magic_begin, data byte count, data bytes, number of constants,
 *  then for each constant its record size, name length, name characters,
 *  size, alignment and offset, and finally magic_end and the running byte
 *  count. Returns the byte counter maintained while writing.
 */
size_t ConstantSet::serializeToBin(std::ostream& outs) {
  size_t ret_size = 0;

  OUT_UPDATE_SZ(magic_begin);

  /* Raw data segment: byte count first, then the bytes themselves. */
  const size_t data_bytes = data.size()*sizeof(char);
  OUT_UPDATE_SZ(data_bytes);
  if (data_bytes > 0) {
    outs.write(data.data(), data_bytes);
    ret_size += data_bytes;
  }

  /* Constant descriptions, preceded by how many there are. */
  OUT_UPDATE_SZ(constants.size());
  for (auto const &entry : constants) {
    const std::string &label = entry.getName();
    const size_t label_len = label.size();

    /* Size of the record that follows this field. */
    size_t bytes = sizeof(label_len)               // name length itself
                 + label_len*sizeof(char)          // name
                 + sizeof(entry.getSize())         // size
                 + sizeof(entry.getAlignment())    // alignment
                 + sizeof(entry.getOffset());      // offset
    OUT_UPDATE_SZ(bytes);

    OUT_UPDATE_SZ(label_len);
    outs.write(label.c_str(), label_len);
    ret_size += sizeof(char)*label_len;

    OUT_UPDATE_SZ(entry.getSize());
    OUT_UPDATE_SZ(entry.getAlignment());
    OUT_UPDATE_SZ(entry.getOffset());
  }

  OUT_UPDATE_SZ(magic_end);
  /* Trailer: the byte count accumulated so far. */
  OUT_UPDATE_SZ(ret_size);
  return ret_size;
}
/*! Rebuild the constant set from a stream produced by serializeToBin.
 *  Reads magic_begin, the data segment, the constant descriptions, magic_end
 *  and the trailing byte count, appending to `data` and `constants` as it
 *  goes.
 *
 *  \param ins stream positioned at the start of a serialized constant set
 *  \return number of bytes consumed, or 0 when the stream is truncated, a
 *          magic number does not match, a record size is inconsistent or the
 *          trailing byte count disagrees with what was read. Entries read
 *          before the failure was detected remain in the set.
 */
size_t ConstantSet::deserializeFromBin(std::istream& ins) {
  size_t total_size = 0;
  size_t global_data_sz = 0;
  size_t const_num = 0;
  uint32_t magic = 0;

  IN_UPDATE_SZ(magic);
  if (!ins || magic != magic_begin)
    return 0;

  /* Data segment. The stream state is checked before trusting any length so
   * that a truncated input cannot drive the loop with a garbage bound. */
  IN_UPDATE_SZ(global_data_sz);
  if (!ins)
    return 0;
  for (size_t i = 0; i < global_data_sz; i++) {
    char elt = 0;
    IN_UPDATE_SZ(elt);
    if (!ins)
      return 0;
    data.push_back(elt);
  }

  IN_UPDATE_SZ(const_num);
  if (!ins)
    return 0;
  for (size_t i = 0; i < const_num; i++) {
    size_t bytes = 0;
    IN_UPDATE_SZ(bytes);
    size_t name_len = 0;
    IN_UPDATE_SZ(name_len);
    if (!ins)
      return 0;

    uint32_t size = 0, align = 0, offset = 0;

    /* Sanity check: the record size must match its fields. Done before the
     * name buffer is allocated so a corrupt length is rejected early. */
    if (bytes != sizeof(name_len) + sizeof(char)*name_len + sizeof(size)
        + sizeof(align) + sizeof(offset))
      return 0;

    /* std::string owns the buffer, so nothing leaks if a later step throws. */
    std::string raw_name(name_len, '\0');
    if (name_len > 0)
      ins.read(&raw_name[0], name_len);
    total_size += sizeof(char)*name_len;

    IN_UPDATE_SZ(size);
    IN_UPDATE_SZ(align);
    IN_UPDATE_SZ(offset);
    if (!ins)
      return 0;

    /* Built from the C string, so the name stops at the first NUL byte. */
    ir::Constant constant(raw_name.c_str(), size, align, offset);
    constants.push_back(constant);
  }

  IN_UPDATE_SZ(magic);
  if (!ins || magic != magic_end)
    return 0;

  /* The trailer holds the byte count written before the trailer itself. */
  size_t total_bytes = 0;
  IN_UPDATE_SZ(total_bytes);
  if (!ins || total_bytes + sizeof(total_size) != total_size)
    return 0;

  return total_size;
}
} /* namespace ir */
} /* namespace gbe */
Release_v0.3/backend/src/ir/constant.hpp 0000664 0000000 0000000 00000010552 12231421770 0020340 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file constant.hpp
*
* \author Benjamin Segovia
*/
#ifndef __GBE_IR_CONSTANT_HPP__
#define __GBE_IR_CONSTANT_HPP__
#include "sys/vector.hpp"
namespace gbe {
namespace ir {
/*! Description of a single constant (a scalar or an array): its name, size,
 *  required alignment and position inside the data segment
 */
class Constant
{
public:
  /*! Create the description from its four attributes */
  INLINE Constant(const std::string &name, uint32_t size, uint32_t alignment, uint32_t offset)
    : name(name), size(size), alignment(alignment), offset(offset)
  {}
  /*! Duplicate another description */
  INLINE Constant(const Constant &other)
    : name(other.name), size(other.size), alignment(other.alignment), offset(other.offset)
  {}
  /*! Overwrite this description with another one */
  INLINE Constant& operator= (const Constant &other) {
    name = other.name;
    size = other.size;
    alignment = other.alignment;
    offset = other.offset;
    return *this;
  }
  /*! No resource to release explicitly */
  INLINE ~Constant() {}
  /*! Name given to the constant */
  const std::string& getName() const { return name; }
  /*! Size of the constant */
  uint32_t getSize() const { return size; }
  /*! Alignment required by the constant */
  uint32_t getAlignment() const { return alignment; }
  /*! Offset of the constant in the data segment */
  uint32_t getOffset() const { return offset; }
private:
  std::string name;    //!< Optional name of the constant
  uint32_t size;       //!< Size of the constant
  uint32_t alignment;  //!< Alignment required for each constant
  uint32_t offset;     //!< Offset of the constant in the data segment
  GBE_CLASS(Constant);
};
/*! A constant set is a set of immutable data associated to a compilation
* unit. It keeps the bytes of all constants in one array together with one
* Constant description (name, size, alignment, offset) per constant, and can
* be written to / read back from a binary stream.
*/
class ConstantSet : public Serializable
{
public:
/*! Append a new constant in the constant set: source bytes, name, size and
* required alignment
*/
void append(const char*, const std::string&, uint32_t size, uint32_t alignment);
/*! Number of constants */
size_t getConstantNum(void) const { return constants.size(); }
/*! Get the constant stored at index i (the index is not range-checked here) */
Constant& getConstant(size_t i) { return constants[i]; }
/*! Get the first constant whose name equals `name`.
* The name must exist: when no constant matches, GBE_ASSERT(false) is hit.
* NOTE(review): if that assertion does not stop execution, the fallback
* return dereferences a null pointer - confirm callers never rely on it.
*/
Constant& getConstant(const std::string & name) {
for (auto & c : constants) {
if (c.getName() == name)
return c;
}
GBE_ASSERT(false);
return *(Constant *)nullptr;
}
/*! Number of bytes of serialized constant data */
size_t getDataSize(void) const { return data.size(); }
/*! Store serialized constant data into an array; `mem` must have room for
* getDataSize() bytes
*/
void getData(char *mem) const {
for (size_t i = 0; i < data.size(); i ++)
mem[i] = data[i];
}
/*! Build an empty constant set */
ConstantSet() {}
/*! Copy the data array and the constant descriptions of another set */
ConstantSet(const ConstantSet& other) : Serializable(other),
data(other.data), constants(other.constants) {}
/*! Assign the data array and the constant descriptions of another set;
* self-assignment is a no-op
*/
ConstantSet & operator = (const ConstantSet& other) {
if (&other != this) {
data = other.data;
constants = other.constants;
}
return *this;
}
/*! Markers written at the start and at the end of the serialized form */
static const uint32_t magic_begin = TO_MAGIC('C', 'N', 'S', 'T');
static const uint32_t magic_end = TO_MAGIC('T', 'S', 'N', 'C');
/* format:
magic_begin |
const_data_size |
const_data |
constant_num |
constant_1_size |
constant_1 |
........ |
constant_n_size |
constant_n |
magic_end |
total_size

each constant_i is: name length | name | size | alignment | offset
*/
/*! Implements the serialization. */
virtual size_t serializeToBin(std::ostream& outs);
/*! Implements the deserialization; returns 0 when the input is rejected. */
virtual size_t deserializeFromBin(std::istream& ins);
private:
vector data; //!< The constant data serialized in one array
vector constants;//!< Each constant description
GBE_CLASS(ConstantSet);
};
} /* namespace ir */
} /* namespace gbe */
#endif /* __GBE_IR_CONSTANT_HPP__ */
Release_v0.3/backend/src/ir/context.cpp 0000664 0000000 0000000 00000013344 12231421770 0020170 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file context.cpp
* \author Benjamin Segovia
*/
#include "ir/context.hpp"
#include "ir/unit.hpp"
#include "ir/lowering.hpp"
namespace gbe {
namespace ir {
/*! Bind the context to `unit`. No function, basic block or label-usage table
* is current yet; they are all initialized to NULL.
*/
Context::Context(Unit &unit) :
unit(unit), fn(NULL), bb(NULL), usedLabels(NULL) {}
/*! Release the label-usage tables still held by the context: the ones saved
* on fnStack and the current one.
*/
Context::~Context(void) {
// Tables of functions that were started but never ended
for (const auto &elem : fnStack) GBE_SAFE_DELETE(elem.usedLabels);
GBE_SAFE_DELETE(usedLabels);
}
/*! Return the function currently being built; asserts that there is one */
Function &Context::getFunction(void) {
GBE_ASSERTM(fn != NULL, "No function currently defined");
return *fn;
}
/*! Associate register `reg` with the push location `pushed` in the current
* function. Asserts that a function is being built and that `reg` has not
* been pushed already.
*/
void Context::appendPushedConstant(Register reg, const PushLocation &pushed)
{
GBE_ASSERTM(fn != NULL, "No function currently defined");
GBE_ASSERTM(fn->pushMap.contains(reg) == false, "Register already pushed");
// Keep both directions of the mapping: register -> location and back
fn->pushMap.insert(std::make_pair(reg, pushed));
fn->locationMap.insert(std::make_pair(pushed, reg));
}
/*! Start building a new function called `name`.
* The current function, block and label-usage table are saved on fnStack
* first; endFunction restores them.
*/
void Context::startFunction(const std::string &name) {
// Remember the state of the enclosing function (if any)
fnStack.push_back(StackElem(fn,bb,usedLabels));
fn = unit.newFunction(name);
// Fresh label-usage table for the new function
usedLabels = GBE_NEW_NO_ARG(vector);
// No block is open yet
bb = NULL;
}
/*! Finish the function currently being built.
* Appends a return to an empty function, checks in debug builds the
* recorded state of every label, frees the label-usage table, runs the
* return and argument lowering passes, sorts the labels, computes the CFG and
* finally restores the state saved by the matching startFunction.
*/
void Context::endFunction(void) {
GBE_ASSERTM(fn != NULL, "No function to end");
GBE_ASSERT(fnStack.size() != 0);
GBE_ASSERT(usedLabels != NULL);
// Empty function -> append a return
if (fn->blockNum() == 0) this->RET();
// Check first that all branch instructions point to valid labels
GBE_ASSERT(usedLabels);
#if GBE_DEBUG
// A slot equal to LABEL_IS_POINTED alone has no LABEL_IS_DEFINED bit set
for (auto usage : *usedLabels)
GBE_ASSERTM(usage != LABEL_IS_POINTED, "A label is used and not defined");
#endif /* GBE_DEBUG */
GBE_DELETE(usedLabels);
// Remove all returns and insert one unique return block at the end of the
// function
lowerReturn(unit, fn->getName());
// Spill function argument to the stack if required and identify which
// function arguments can use constant push
lowerFunctionArguments(unit, fn->getName());
// Properly order labels and compute the CFG
fn->sortLabels();
fn->computeCFG();
// Restore the state of the enclosing function; copied before pop_back
// destroys the stack entry
const StackElem elem = fnStack.back();
fnStack.pop_back();
fn = elem.fn;
bb = elem.bb;
usedLabels = elem.usedLabels;
}
/*! Allocate a new register of the given family in the current function */
Register Context::reg(RegisterFamily family) {
GBE_ASSERTM(fn != NULL, "No function currently defined");
return fn->newRegister(family);
}
/*! Create a new label in the current function and make sure the label-usage
 *  table has a cleared slot for it. Returns the index of the new label.
 */
LabelIndex Context::label(void) {
  GBE_ASSERTM(fn != NULL, "No function currently defined");
  const LabelIndex fresh = fn->newLabel();
  // A slot already exists for this index: nothing to record
  if (fresh < usedLabels->size())
    return fresh;
  // Grow the table up to the new label and mark it as neither defined nor used
  usedLabels->resize(fresh + 1);
  (*usedLabels)[fresh] = 0;
  return fresh;
}
/*! Declare an argument of the current function.
* A FunctionArgument built from `type`, `reg`, `elementSize` and `name` is
* allocated and appended to the function arguments. Asserts that `reg` is a
* valid register of the function.
*/
void Context::input(const std::string &name, FunctionArgument::Type type, Register reg, uint32_t elementSize) {
GBE_ASSERTM(fn != NULL, "No function currently defined");
GBE_ASSERTM(reg < fn->file.regNum(), "Out-of-bound register");
FunctionArgument *arg = GBE_NEW(FunctionArgument, type, reg, elementSize, name);
fn->args.push_back(arg);
}
/*! Declare `reg` as an output of the current function; asserts that it is a
* valid register of the function
*/
void Context::output(Register reg) {
GBE_ASSERTM(fn != NULL, "No function currently defined");
GBE_ASSERTM(reg < fn->file.regNum(), "Out-of-bound register");
fn->outputs.push_back(reg);
}
/*! Allocate a new basic block in the current function, append it to the
* function blocks and make it the current block
*/
void Context::startBlock(void) {
GBE_ASSERTM(fn != NULL, "No function currently defined");
this->bb = GBE_NEW(BasicBlock, *fn);
fn->blocks.push_back(bb);
}
/*! Close the current block: no block is current afterwards. The block itself
* stays in the function.
*/
void Context::endBlock(void) {
this->bb = NULL;
}
/*! Append an instruction to the current function.
* Blocks are managed implicitly: a label instruction closes the current block
* and opens a new one, an instruction arriving while no block is open first
* gets a freshly created label, and a branch closes the block it ends.
* The label-usage table is updated with LABEL_IS_DEFINED / LABEL_IS_POINTED
* along the way.
*/
void Context::append(const Instruction &insn) {
GBE_ASSERTM(fn != NULL, "No function currently defined");
// Start a new block if this is a label
if (insn.isMemberOf() == true) {
this->endBlock();
this->startBlock();
const LabelIndex index = cast(insn).getLabelIndex();
GBE_ASSERTM(index < fn->labelNum(), "Out-of-bound label");
GBE_ASSERTM(fn->labels[index] == NULL, "Label used in a previous block");
// The label now designates the block that was just opened
fn->labels[index] = bb;
// Now the label index is properly defined
GBE_ASSERT(index < usedLabels->size());
(*usedLabels)[index] |= LABEL_IS_DEFINED;
}
// We create a new label for a new block if the user did not do it
else if (bb == NULL) {
// this->startBlock();
const LabelIndex index = this->label();
// Local `insn` shadows the parameter: the recursive call appends the new
// label (which opens the block), then the original instruction follows
const Instruction insn = ir::LABEL(index);
this->append(insn);
}
// Append the instruction in the stream
Instruction *insnPtr = fn->newInstruction(insn);
bb->append(*insnPtr);
#if GBE_DEBUG
std::string whyNot;
GBE_ASSERTM(insnPtr->wellFormed(whyNot), whyNot.c_str());
#endif /* GBE_DEBUG */
// Close the current block if this is a branch
if (insn.isMemberOf() == true) {
// We must book keep the fact that the label is used
if (insn.getOpcode() == OP_BRA) {
const BranchInstruction &branch = cast(insn);
const LabelIndex index = branch.getLabelIndex();
GBE_ASSERT(index < usedLabels->size());
(*usedLabels)[index] |= LABEL_IS_POINTED;
}
this->endBlock();
}
}
} /* namespace ir */
} /* namespace gbe */
Release_v0.3/backend/src/ir/context.hpp 0000664 0000000 0000000 00000020241 12231421770 0020167 0 ustar 00root root 0000000 0000000 /*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
/**
* \file context.hpp
* \author Benjamin Segovia
*/
#ifndef __GBE_IR_CONTEXT_HPP__
#define __GBE_IR_CONTEXT_HPP__
#include "ir/instruction.hpp"
#include "ir/function.hpp"
#include "ir/register.hpp"
#include "ir/immediate.hpp"
#include "ir/unit.hpp"
#include "sys/vector.hpp"
#include